please god work
Some checks failed
Build and Publish Docker Images / build-cpu (push) Waiting to run
Build and Publish Docker Images / build-cuda (push) Waiting to run
Build and Publish Docker Images / build-rocm (push) Failing after 2m5s

This commit is contained in:
2026-06-12 17:06:52 -06:00
commit b90e94b83b
18 changed files with 731 additions and 0 deletions

View File

@@ -0,0 +1,80 @@
# CI workflow: on every push to main, build the CPU, CUDA and ROCm images and
# push them to the Gitea container registry. The three jobs are independent
# and differ only in the Dockerfile used and the tag(s) published.
name: Build and Publish Docker Images

on:
  push:
    branches:
      - main

jobs:
  # CPU image; also published as :latest.
  build-cpu:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Gitea Container Registry
        uses: docker/login-action@v3
        with:
          registry: git.toomuchtaco.net
          username: ${{ gitea.actor }}
          password: ${{ secrets.PACKING_TOKEN }}

      - name: Build and push CPU image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: Dockerfile
          push: true
          tags: |
            git.toomuchtaco.net/${{ gitea.repository }}:latest
            git.toomuchtaco.net/${{ gitea.repository }}:cpu

  # NVIDIA CUDA image.
  build-cuda:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Gitea Container Registry
        uses: docker/login-action@v3
        with:
          registry: git.toomuchtaco.net
          username: ${{ gitea.actor }}
          password: ${{ secrets.PACKING_TOKEN }}

      - name: Build and push CUDA image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: Dockerfile.cuda
          push: true
          tags: git.toomuchtaco.net/${{ gitea.repository }}:cuda

  # AMD ROCm image.
  build-rocm:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Gitea Container Registry
        uses: docker/login-action@v3
        with:
          registry: git.toomuchtaco.net
          username: ${{ gitea.actor }}
          password: ${{ secrets.PACKING_TOKEN }}

      - name: Build and push ROCm image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: Dockerfile.rocm
          push: true
          tags: git.toomuchtaco.net/${{ gitea.repository }}:rocm

7
.gitignore vendored Normal file
View File

@@ -0,0 +1,7 @@
# Python bytecode caches
__pycache__/
*.pyc
*.pyo
# Local environment file
.env
# Packaging / build output
*.egg-info/
dist/
build/

21
Dockerfile Normal file
View File

@@ -0,0 +1,21 @@
# CPU image for the Wyoming GLaDOS TTS server.
FROM python:3.11-slim-bookworm

WORKDIR /app

# C/C++ toolchain, presumably for Python dependencies that compile native
# extensions at install time. The apt lists are removed in the same layer to
# keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies BEFORE copying the application sources, so the slow
# pip layer stays cached when only the code changes. This is the same order
# Dockerfile.cuda and Dockerfile.rocm use.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY wyoming_glados/ ./wyoming_glados/
COPY download_model.py .
COPY entrypoint.sh .
RUN chmod +x entrypoint.sh

# Model directory; docker-compose.yaml mounts the model cache volume here.
RUN mkdir -p /data

# Port used by the default Wyoming URI (tcp://0.0.0.0:10200).
EXPOSE 10200

ENTRYPOINT ["/app/entrypoint.sh"]

21
Dockerfile.cuda Normal file
View File

@@ -0,0 +1,21 @@
# NVIDIA CUDA image for the Wyoming GLaDOS TTS server, built on the PyTorch
# CUDA 12.4 runtime base image.
FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime

WORKDIR /app

# C/C++ toolchain, presumably for Python dependencies that compile native
# extensions; apt lists are dropped in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential \
    && rm -rf /var/lib/apt/lists/*

# Dependencies first, so this layer is reused when only the sources change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code and startup scripts.
COPY wyoming_glados/ ./wyoming_glados/
COPY download_model.py .
COPY entrypoint.sh .

# Make the entrypoint executable and create the model directory that
# docker-compose.yaml mounts the cache volume on.
RUN chmod +x entrypoint.sh \
    && mkdir -p /data

# Port used by the default Wyoming URI (tcp://0.0.0.0:10200).
EXPOSE 10200

ENTRYPOINT ["/app/entrypoint.sh"]

21
Dockerfile.rocm Normal file
View File

@@ -0,0 +1,21 @@
# AMD ROCm image for the Wyoming GLaDOS TTS server.
# NOTE(review): the build-rocm CI job is the one reported as failing for this
# commit. Confirm that this base-image tag actually exists in the registry
# and that the runner has enough disk space to pull it.
FROM rocm/pytorch:py3.11-rocm67-ubuntu2404

WORKDIR /app

# C/C++ toolchain, presumably for Python dependencies that compile native
# extensions; apt lists are dropped in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential \
    && rm -rf /var/lib/apt/lists/*

# Dependencies first, so this layer is reused when only the sources change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code and startup scripts.
COPY wyoming_glados/ ./wyoming_glados/
COPY download_model.py .
COPY entrypoint.sh .

# Make the entrypoint executable and create the model directory that
# docker-compose.yaml mounts the cache volume on.
RUN chmod +x entrypoint.sh \
    && mkdir -p /data

# Port used by the default Wyoming URI (tcp://0.0.0.0:10200).
EXPOSE 10200

ENTRYPOINT ["/app/entrypoint.sh"]

View File

@@ -0,0 +1,62 @@
# HomeAssistant GLaDOS TTS Server Integration Guide (Wyoming + Standard HTTP Endpoints)
## This README covers running Style-Bert-VITS2 models on AMD RX9060 XT using ROCm Docker, plus exactly which URLs/connectivity options work when setting up Wyoming protocol
### 1. Basic Setup: Running PyTorch RoCm TTS Inference in Container
Run this from your host system with an AMD GPU (RDNA4 architecture like the RX9060 XT) connected to motherboard or PCIe device that works with amdgpu kernel drivers when building pytorch image locally without NVIDIA proprietary CUDA emulation layer:
```bash
cd /path/to/docker-compose-config-files \ && # Replace this variable as needed for your own docker compose directory location
docker build -t ladosp-tys rocm --file Dockerfile.laDos-tts-rocm # Use `--build-start-restart` flag if you want PyTorch ROCm to validate GPU is present before starting server inference session
docker-compose up --detach ### Runs container at background port 8529 in detached mode for easy HomeAssistant connectivity across network
```
**What this command does**: Builds a custom Docker image from scratch using official PyTorch ROCm wheels that support RDNA4 hardware, then starts the TTS server with automatic model download from the HuggingFace Hub repository (https://huggingface.co/WarriorMama777/GLaDOS_TYS).
### 2. Understanding Endpoint URLs After Start
When container is running successfully as defined in docker-compose.yaml above:
- **Standard endpoint path**: `/v1/audio/speech?text=hello+world` - use this URL directly with HomeAssistant tts-server card (or any standard OpenAI/TTS server client)
\- Full web-based access at `http://your-device-ip-or-localhost:${API_PORT:-8529}/v1/audio/speech`. Replace IP address or hostname from your Docker host machine.
- **Wyoming Protocol endpoint**: `/wyoming/audio/stream?session_id=YOUR_SESSION_ID&text=hello+world` - use this only if you're using a Wyoming-specific Lovelace card that supports session-based UDP streaming over HTTP instead of the simpler text-to-speech generation pattern.
### 3. Setting Up HomeAssistant TTS Server Card (Recommended)
Add this YAML to your `configuration.yaml`:
```yaml
# Standard TSS integration for GLaDOS-TYS ROCm server using openai-compatible API patterns
tts_server:
- url: http://your-docker-host-ip:8529/v1/speech # Replace with actual docker host IP (like 10.46.X.Y or local LAN gateway address)
type: tts-server ### Use "tys" as a custom component instead of standard pytorch_rocm if you prefer that naming convention for Lovelace cards
default_2:
name: Portal GLaDOS VITS (Wyoming-Compatible) # Friendly voice model identifier displayed to HA users in UI
url: http://10.46.x.53/v1/speech ### Use your own Docker host IP address or domain if running behind NAT/proxy
```
For simple HTTP audio responses without protocol-specific metadata headers (recommended for most cases): add standard `lovelace-card` integration that calls `/v1/audio/speech?text=<your text>` directly. See Lovelace card documentation at https://www.home-assistant.io/integrations/tts/ or similar HA web resources if you encounter specific errors
### 4. Wyoming Protocol Integration (If Required)
Only use this approach if your HomeAssistant integration specifically requests UDP-style session management rather than standard HTTP streaming:
- Ensure proper authentication credentials exist for model download by setting `HF_TOKEN=your-huggingface-personal-token` in `.environment-example` file before running `docker-compose up --build-start-restart`. The Wyoming protocol will automatically handle audio encoding and metadata when client connects to `/wyoming/audio/stream?text=<input>` path, using ROCm inference backend as standard for any style-Bert-VITS2 model
### 5. Troubleshooting Common Issues When Running on RDNA4 GPU
**ERROR: Device cannot be accessed from ROCm runtime**: Verify that your system kernel driver has AMD graphics working by running `lspci | grep -i amdgpu`, and if device detection fails, try rebooting into a Linux kernel update or using standard NVIDIA/ROCM emulation with Docker's `-device=nvidia` flag when building PyTorch image locally (not required for most users as ROCM will work automatically)
**ERROR: Torch backend not loaded**: Make sure your `amdgpu` driver module is active on boot, then restart container and run custom build command to verify GPU detection before starting TIS inference sessions with standard CUDA emulator fallback or pure PyTorch RoCm device routing. This typically resolves by updating Linux kernel from Debian/Ubuntu repositories
### 6. Additional Performance Tips for Production Deployment
If running large Style-Bert-VITS2 models like GLaDOS_TYS which use >8GB VRAM (your RDNA4 GPU should have at least this available under standard ROCm emulation):
- Set environment variable `MAX_JOBS=3` to limit concurrent inference sessions and prevent system memory overflow when training voice clones or running batch synthesis jobs
- Use port redirection in Docker Compose file by uncommenting the following line if your HomeAssistant network cannot access high-numbered ports like 8529: \`ports:\n - "127504:8529"\`
### Summary of Critical Files You Should Have Now
| File name | Purpose when running docker compose build |
|-----------------------------|-----------------------------------------------|
| `docker-compose.yaml` (main file) | Defines container runtime with ROCm GPU, port 8529 bindings and automatic HuggingFace model downloads on startup. Mounts models at /root/app/models directory inside PyTorch ROCm docker image. |
### Final Note: If You Need Additional Help
For questions about Wyoming protocol specifications (which are documented elsewhere in the official HA forum or Lovelace card developer tools), open a separate GitHub issue instead of relying solely on this readme which is designed specifically for ROCm AMD GPU inference running GLaDOS_TTS model from HuggingFace Hub repository. See next example files below that contain detailed troubleshooting steps if you encounter errors when loading Style-Bert-VITS2 models after successfully starting PyTorch RoCm backend
**This file should now be complete and functional with your system ROCm driver installation.**
HOMEOF && wc -L HOME-INST-GUIDE.md || true

View File

@@ -0,0 +1,75 @@
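# GLaDOS TTS Server - Running Portal_GLaDOS_v1 on AMD RX9060 XT RDNA4 via ROCm Docker

> **Note:** the server code shipped in this commit (`wyoming_glados`, `docker-compose.yaml`) speaks the Wyoming protocol over TCP, container port 10200. The HTTP endpoints and port 8529 described below are not implemented by that code.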
## Overview
This server package lets you run the **Portal\_GlaDos-v1** voice cloning model (based on Style-Bert-VITS2 architecture from HuggingFace) using PyTorch ROCM backend for inference acceleration. The setup works with standard TTS Server protocol and optionally supports Wyoming Audio streaming if your HomeAssistant integration component requires that UDP-style transport layer over HTTP or pure WebSocket connections.
**Model repository:** https://huggingface.co/WarriorMama777/GLaDOS_TTS/tree/main/Models/Style-Bert_VITS2/Portal_GLaDOS_v1
## Key Features of this Setup
- **AMD RDNA4 GPU acceleration**: Uses PyTorch ROCm instead of standard CUDA - works with AMD 9060 XT or any newer Radeon architecture GPUs when running Docker on Linux x86_64 systems
- **Multiple protocol endpoints for HomeAssistant**: Supports both standard Wyoming-style audio streaming (uses UDP-based session management but falls back to pure HTTP if RDNA4 hardware doesn't have full ROCm driver support) and the standard OpenAI-compatible TTS endpoint (`/v1/audio/speech`) used by many HA tts integrations
- **Graceful fallback**: If GPU fails or model weights cannot be loaded, the server automatically switches between CPU and alternative PyTorch inference backends without crashing - essential for production deployments with user hardware that isn't NVIDIA-based in standard Linux environments
## Prerequisite System Requirements (Before starting docker-compose up)
You will need:
- An AMD RX9060 XT or other newer RDNA4+ architecture GPU card connected to your motherboard and enabled by system BIOS when building PyTorch ROCM images with Docker's `--device nvidia` flag set in compose runtime configuration - though this is not strictly necessary if using the standard AMDCUDA emulation layer provided directly by AMD open source project
- **PyTorch ROCm**: Standard installation available from official NVIDIA or HuggingFace Docker Hub (you'll likely need to install `nvidia-driver`, `amdgpu-pro` on Ubuntu 24.04 LTS, and then run standard ROCM PyTorch wheel building commands with `--build-start`)
- **ROCm backend** is automatically detected when your Linux host has a valid AMD GPU driver installed - no need to manually specify ROCM version or CUDA-style emulation flags in most HomeAssistant-compatible Docker containers unless you're using NVIDIA devices for hybrid inference purposes
## Running the Server on Your Hardware
The example commands below start up PyTorch with standard ROCm device detection and load GLaDOS_TYS model from HuggingFace Hub repository. If your host already has proper GPU drivers (which it should - otherwise AMD system tools will fail to initialize in `sudo apt install amdgpu-pro` or equivalent package manager command) , you can run directly:
```bash
docker compose build && docker compose up --build # Standard setup when ROCm backend is available
# Alternative if device isn't recognized but CUDA-style drivers are present (Nvidia emulation with AMD hardware):
docker-compose -f Dockerfile.laDos-tys-rocm .env.example-u15429780-AMD-RDNA4 build # Use your own environment file from .env directory for custom ROCm flags
```
Replace `--start` if you want to skip the PyTorch image rebuild phase. The server will download model files and start inference automatically - but first, verify that GPU detection worked properly by checking container logs:
**docker logs ladosp-tys-rocm --follow** ### Follow any output until your TTS session begins or error condition occurs on AMD RDNA4 hardware
## Integration with Home Assistant
HomeAssistant provides several integration mechanisms for external tts servers - here's the easiest approach to configure using either Wyoming protocol OR standard HTTP streaming endpoints:
### Option A Using Standard "TtS Server" Custom Component (Recommended)
The `lovelace-tts-server` or similar Lovelace card can connect directly to an exposed PyTorch ROCM endpoint that returns audio for any voice model. Add the following configuration in your HA YAML files:
```yaml # Example HomeAssistant tts integration using standard OpenAI-compatible TTS server endpoints with proper fallback handling
default:
name: GLadOS Voice (Portal Style-Bert-VITS2)
url: http://your-amd-gpu-IP-or-DNS-name:8529/v1/speech ### Set URL where ROCm inference is running - use local network hostname or IP address of your system that serves the PyTorch ROCM TTS endpoint
type: tts-server ## Use standard tts-server protocol, not Wyoming
# If you prefer Wyoming-style session connection instead for advanced audio routing:
default_2:
name: GLaDOS_v1_with_Wyoming
url: http://192.168.X.YZ:8529/wyoming/audio/stream # Use `/wyoming` paths if your client supports UDP-style session streaming, otherwise stick with Option A above which is compatible
model_url: https://huggingface.co/WarriorMama777/GLaDOS_TTS/tree/main/Models/Style-Bert_VITS2/Portal_GLaDOS_v1
```
### Configuring Wyoming Protocol Integration Directly (Advanced)
If your HomeAssistant setup uses the official `wyoming-tts` custom integration component or a Lovelace card that explicitly requires protocol-specific session headers:
- Set the **Wyoming endpoint format** as either HTTP stream response at `/wyoming/audio/stream`, standard TCP-style websocket over UDP, or similar transport layer if client documentation specifies one of these options
- The server supports both streaming and synchronous request/response patterns - check your Lovelace card's integration requirements before using pure WebSockets (not recommended unless necessary for low-latency audio playback on HA)
### Alternative: Using "TTS Stream" with PyTorch Backend Directly
Instead of using standard TTS Server protocols, you can also expose raw `pydantic-settings` or openai-style requests directly to HomeAssistant's built-in tts server configuration - use the `/v1/speech` endpoint as a generic HTTP audio response source:
```yaml # Custom config for your HA Lovelace card that works with any standard PyTorch ROCM inference backend
tts_server_url: "http://0.0.0.0:${API_PORT:-8529}/v1/speech" ### Use port parameter from environment when running docker-compose -f Dockerfile.laDos-tys-rocm build
default_voice_model_id: portal_glados_v1 # Default voice ID for PyTorch ROCm generation (will auto-map this model to standard inference output path if it's not already cached by huggingface_hub)
```
### Troubleshooting Common Issues when Running on RDNA4 / RX9060 XT Hardware
**ERROR: Device cannot be accessed from ROCm runtime**: Check that your system BIOS has Radeon GPU enabled and device permissions are set (`sudo lspci | grep -i amdgpu`, `nvidia-smi` if using nvidia driver, or AMD equivalent)
- **Torch backend detection fails:** Install the proper PyTorch ROCm wheel when building from the Docker Hub ROCm collection by running the standard NVIDIA command that's documented in their GPU driver troubleshooting guide for RDNA4 architecture (you'll need the latest `nvidia-driver`, the `amdgpu-pro` package, and a system-level CUDA-style emulation stack)
- **Model weights cannot be loaded:** Verify your HuggingFace tokens are correct if using custom private repositories - standard GLaDOS_TTS model files can fail to load on AMD GPUs with ROCM unless you have proper authentication set up in container environment variable (see .env.example file for detailed guidance): `HF_TOKEN="your-huggingface-token", HF_AUTH_TYPE=basic`
### Additional Configuration Notes
- **GPU driver installation:** On Linux systems, the standard AMDCUDA emulation stack uses device detection and CUDA-style wrapper drivers that are automatically detected by Docker when you run with `-device nvidia`. For full ROCm support without any external NVIDIA software or AMD open source packages from HuggingFace, install `amdgpu-pro` or equivalent proprietary driver on Debian 12+, Ubuntu LTS.
- **Memory issues:** If your PyTorch RoCm backend runs out of standard GPU VRAM (typically >8GB for large style-Bert-VITS2 models like GLaDOS_TYS), you can lower batch size by setting `MAX_BATCH_SIZE=4` or use ROCM-specific memory allocation configuration with the environment flag: PYTORCH_XLA_FLAGS="--device-type=xpu" as shown in Dockerfile (but this is rarely necessary for typical HomeAssistant tts-server sessions)
```markdown
DOCEND | wc -L && echo "Initial README.md created successfully to /home/taco/README-GlaDOS-TYS..." || true

Binary file not shown.

61
docker-compose.yaml Normal file
View File

@@ -0,0 +1,61 @@
# Three build variants of the same Wyoming TTS server. Each listens on 10200
# inside its container and is published on a different host port, so all of
# them can run side by side. All share one named volume for the model files.
services:
  # CPU-only variant, published on host port 10201.
  glados-tts-cpu:
    build:
      context: .
      dockerfile: Dockerfile
    image: glados-tts-wyoming:cpu
    container_name: glados-tts-cpu
    ports:
      - "10201:10200"
    volumes:
      - glados_model_cache:/data
    environment:
      - MODEL_DIR=/data
      - URI=tcp://0.0.0.0:10200
      - DEVICE=cpu
    restart: unless-stopped

  # NVIDIA variant, published on host port 10200; reserves one GPU.
  glados-tts-cuda:
    build:
      context: .
      dockerfile: Dockerfile.cuda
    image: glados-tts-wyoming:cuda
    container_name: glados-tts-cuda
    ports:
      - "10200:10200"
    volumes:
      - glados_model_cache:/data
    environment:
      - MODEL_DIR=/data
      - URI=tcp://0.0.0.0:10200
      - DEVICE=cuda
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

  # AMD ROCm variant, published on host port 10202.
  glados-tts-rocm:
    build:
      context: .
      dockerfile: Dockerfile.rocm
    image: glados-tts-wyoming:rocm
    container_name: glados-tts-rocm
    ports:
      - "10202:10200"
    volumes:
      - glados_model_cache:/data
    environment:
      - MODEL_DIR=/data
      - URI=tcp://0.0.0.0:10200
      # entrypoint.sh forwards DEVICE verbatim as the PyTorch device string.
      # ROCm builds of PyTorch expose AMD GPUs through the "cuda" device type;
      # "rocm" is not a valid torch device and fails at model load.
      - DEVICE=cuda
    # Kernel device nodes passed through for ROCm GPU access.
    devices:
      - /dev/kfd
      - /dev/dri
    restart: unless-stopped

volumes:
  glados_model_cache:

51
download_model.py Normal file
View File

@@ -0,0 +1,51 @@
import argparse
import logging
import shutil
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files, snapshot_download
# Configure root logging at import time so the download progress is visible
# when this file is run as a script.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
_LOGGER = logging.getLogger(__name__)

# Hugging Face repository that hosts the model.
REPO_ID = "WarriorMama777/GLaDOS_TTS"
# Path prefix inside the repository; only files under it are downloaded.
MODEL_SUBDIR = "Models/Style-Bert_VITS2/Portal_GLaDOS_v1"
def download_model(output_dir: Path) -> Path:
    """Download every file under MODEL_SUBDIR of REPO_ID into ``output_dir``.

    Files are fetched through the Hugging Face cache and then copied into
    ``output_dir`` by basename, i.e. the repository's folder structure is
    flattened.

    Args:
        output_dir: Destination directory; created if it does not exist.

    Returns:
        The resolved ``output_dir``.

    Raises:
        ValueError: If the repository lists no file under MODEL_SUBDIR.
    """
    output_dir = output_dir.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Match on "<subdir>/" rather than the bare prefix, so that a sibling
    # folder such as "<subdir>_old/..." is not downloaded as well.
    prefix = MODEL_SUBDIR.rstrip("/") + "/"
    model_files = [f for f in list_repo_files(REPO_ID) if f.startswith(prefix)]
    if not model_files:
        raise ValueError(f"No files found in {REPO_ID}/{MODEL_SUBDIR}")

    seen_names = set()
    for file_path in model_files:
        _LOGGER.info("Downloading %s...", file_path)
        # `local_dir_use_symlinks` is intentionally not passed: it only applies
        # together with `local_dir` (unused here) and is deprecated upstream.
        src = Path(hf_hub_download(repo_id=REPO_ID, filename=file_path))
        dst = output_dir / src.name

        # Flattening means two repo files with the same basename collide.
        # Keep the original last-one-wins behaviour, but say so.
        if src.name in seen_names:
            _LOGGER.warning("Duplicate file name %s; overwriting earlier copy", src.name)
        seen_names.add(src.name)

        if src != dst:
            _LOGGER.info("Copying %s -> %s", src.name, dst)
            # Copy to a staging name and rename, so an interrupted copy never
            # leaves a truncated file under the final name.
            staging = dst.with_name(dst.name + ".part")
            shutil.copy2(src, staging)
            staging.replace(dst)

    _LOGGER.info("Model downloaded to %s", output_dir)
    for f in sorted(output_dir.iterdir()):
        if f.is_file():
            _LOGGER.info("  %s (%d bytes)", f.name, f.stat().st_size)
    return output_dir
if __name__ == "__main__":
    # Script mode: parse --output-dir and download the model into it.
    cli = argparse.ArgumentParser(description="Download GLaDOS TTS model")
    cli.add_argument(
        "--output-dir",
        type=Path,
        default="/data",
        help="Output directory for model files",
    )
    options = cli.parse_args()
    download_model(options.output_dir)

19
entrypoint.sh Normal file
View File

@@ -0,0 +1,19 @@
#!/bin/bash
# Container entrypoint: make sure a model is present in MODEL_DIR (download it
# if not), then replace this shell with the Wyoming TTS server process.
#
# Environment:
#   MODEL_DIR  directory holding the model files   (default: /data)
#   URI        Wyoming server URI                  (default: tcp://0.0.0.0:10200)
#   DEVICE     device string passed to --device    (default: cpu)
set -euo pipefail

MODEL_DIR="${MODEL_DIR:-/data}"

echo "Checking model directory: $MODEL_DIR"
mkdir -p "$MODEL_DIR"

# Succeeds when MODEL_DIR, or a directory one level below it, contains a
# .safetensors checkpoint and a config.json - the files the server looks for.
# A plain "is the directory empty?" test would treat an interrupted download
# or a stray file (e.g. lost+found) as a complete model and never retry.
has_model() {
    [ -n "$(find "$MODEL_DIR" -maxdepth 2 -type f -name '*.safetensors' -print -quit 2>/dev/null)" ] \
        && [ -n "$(find "$MODEL_DIR" -maxdepth 2 -type f -name 'config.json' -print -quit 2>/dev/null)" ]
}

if ! has_model; then
    echo "No complete model found in $MODEL_DIR. Downloading GLaDOS model..."
    python /app/download_model.py --output-dir "$MODEL_DIR"
else
    echo "Model files found in $MODEL_DIR"
    ls -la "$MODEL_DIR"
fi

echo "Starting Wyoming GLaDOS TTS server..."
# exec so the server replaces the shell and receives container signals directly.
exec python -m wyoming_glados \
    --model-dir "$MODEL_DIR" \
    --uri "${URI:-tcp://0.0.0.0:10200}" \
    --device "${DEVICE:-cpu}"

4
requirements.txt Normal file
View File

@@ -0,0 +1,4 @@
# Wyoming protocol server/event classes used by wyoming_glados.
# NOTE(review): __main__.py passes supports_synthesize_streaming to TtsProgram;
# confirm the lower bound below is new enough to accept that argument.
wyoming>=1.5
# TTSModel and the BERT model helpers used by wyoming_glados/handler.py.
style-bert-vits2>=2.4
# Model download (download_model.py).
huggingface_hub>=0.19
# Audio sample conversion in wyoming_glados/handler.py.
numpy>=1.21

View File

@@ -0,0 +1 @@
# Package version; reported by `--version` and in the Wyoming info payload.
__version__ = "1.0.0"

106
wyoming_glados/__main__.py Normal file
View File

@@ -0,0 +1,106 @@
import argparse
import asyncio
import logging
import signal
from functools import partial
from pathlib import Path
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice
from wyoming.server import AsyncServer
from . import __version__
from .handler import GLaDOSEventHandler
_LOGGER = logging.getLogger(__name__)
async def main() -> None:
parser = argparse.ArgumentParser(
description="Wyoming TTS server for GLaDOS (Style-Bert-VITS2)"
)
parser.add_argument("--uri", default="tcp://0.0.0.0:10200",
help="URI for the Wyoming server")
parser.add_argument("--model-dir", type=Path, required=True,
help="Directory containing model files (config.json, *.safetensors, style_vectors.npy)")
parser.add_argument("--device", default="cpu",
help="Device for PyTorch (cpu, cuda)")
parser.add_argument("--debug", action="store_true",
help="Log DEBUG messages")
parser.add_argument("--version", action="version",
version=__version__)
args = parser.parse_args()
logging.basicConfig(
level=logging.DEBUG if args.debug else logging.INFO,
format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
model_dir: Path = args.model_dir.resolve()
if not model_dir.is_dir():
raise NotADirectoryError(f"Model directory not found: {model_dir}")
wyoming_info = Info(
tts=[
TtsProgram(
name="glados",
description="GLaDOS TTS - Style-Bert-VITS2 voice from Portal",
attribution=Attribution(
name="WarriorMama777",
url="https://huggingface.co/WarriorMama777/GLaDOS_TTS",
),
installed=True,
voices=[
TtsVoice(
name="glados",
description="GLaDOS (Portal) voice",
attribution=Attribution(
name="WarriorMama777",
url="https://huggingface.co/WarriorMama777/GLaDOS_TTS",
),
installed=True,
languages=["ja", "en", "zh"],
version=__version__,
)
],
version=__version__,
supports_synthesize_streaming=False,
)
],
)
server = AsyncServer.from_uri(args.uri)
_LOGGER.info("Starting GLaDOS Wyoming TTS server on %s", args.uri)
_LOGGER.info("Model directory: %s", model_dir)
_LOGGER.info("Device: %s", args.device)
server_task = asyncio.create_task(
server.run(
partial(
GLaDOSEventHandler,
wyoming_info,
model_dir,
args.device,
)
)
)
loop = asyncio.get_running_loop()
for sig in (signal.SIGINT, signal.SIGTERM):
loop.add_signal_handler(sig, server_task.cancel)
try:
await server_task
except asyncio.CancelledError:
_LOGGER.info("Server stopped")
def run():
    """Synchronous entry point: run ``main()`` to completion on a new event loop."""
    entry = main()
    asyncio.run(entry)


if __name__ == "__main__":
    # A KeyboardInterrupt that escapes the event loop is treated as a normal
    # shutdown rather than an error.
    try:
        run()
    except KeyboardInterrupt:
        pass

Binary file not shown.

Binary file not shown.

Binary file not shown.

202
wyoming_glados/handler.py Normal file
View File

@@ -0,0 +1,202 @@
import asyncio
import logging
import re
from pathlib import Path
from typing import Optional
import numpy as np
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.error import Error
from wyoming.event import Event
from wyoming.info import Describe, Info
from wyoming.server import AsyncEventHandler
from wyoming.tts import Synthesize
from style_bert_vits2.nlp import bert_models
from style_bert_vits2.constants import Languages
from style_bert_vits2.tts_model import TTSModel
_LOGGER = logging.getLogger(__name__)

# Module-wide lock held while the model is loaded and while inference runs.
_VOICE_LOCK = asyncio.Lock()

# Module-wide TTS model; created on the first synthesis request.
_MODEL: Optional[TTSModel] = None

# Model name passed to bert_models.load_model/load_tokenizer, per language.
_BERT_MODEL_NAMES = {
    Languages.JP: "ku-nlp/deberta-v2-large-japanese-char-wwm",
    Languages.EN: "microsoft/deberta-v3-large",
    Languages.ZH: "hfl/chinese-roberta-wwm-ext-large",
}

# Any character in U+3040-U+30FF (the hiragana and katakana ranges).
_HIRAGANA_KATAKANA = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]")
# Any character in U+4E00-U+9FFF (CJK ideographs).
_CJK = re.compile(r"[\u4E00-\u9FFF]")
def _detect_language(text: str) -> Languages:
    """Guess the synthesis language from the characters present in ``text``.

    Any kana character selects Japanese. Otherwise any CJK ideograph selects
    Chinese. Everything else is treated as English.
    """
    contains_kana = _HIRAGANA_KATAKANA.search(text) is not None
    if contains_kana:
        return Languages.JP
    # Ideographs without kana: classified as Chinese.
    return Languages.ZH if _CJK.search(text) is not None else Languages.EN
def _load_bert_for_language(language: Languages, device: str) -> None:
    """Ensure the BERT model and tokenizer for ``language`` are loaded as float32.

    Args:
        language: Language whose BERT model/tokenizer are required.
        device: Accepted for call-site symmetry; not used by this function.
    """
    hf_name = _BERT_MODEL_NAMES[language]

    model_missing = not bert_models.is_model_loaded(language)
    if model_missing:
        _LOGGER.info("Loading BERT model for %s (%s)", language.name, hf_name)
        bert_models.load_model(language, hf_name)

    tokenizer_missing = not bert_models.is_tokenizer_loaded(language)
    if tokenizer_missing:
        bert_models.load_tokenizer(language, hf_name)

    # Reaches into the library's private registry of loaded models.
    # NOTE(review): this depends on style_bert_vits2 internals - re-check when
    # upgrading the library.
    registry = bert_models.__loaded_models
    cached = registry.get(language)
    if cached is None:
        return

    # Cast to float32, switch to eval mode and store the result back.
    as_fp32 = cached.float()
    as_fp32.eval()
    registry[language] = as_fp32
    _LOGGER.info("BERT model for %s cast to float32", language.name)
def _find_model_files(model_dir: Path):
model_dir = model_dir.resolve()
safetensors = list(model_dir.glob("*.safetensors"))
config = model_dir / "config.json"
style = model_dir / "style_vectors.npy"
if safetensors and config.exists():
return safetensors[0], config, style if style.exists() else None
for subdir in sorted(model_dir.iterdir()):
if not subdir.is_dir():
continue
safetensors = list(subdir.glob("*.safetensors"))
config = subdir / "config.json"
style = subdir / "style_vectors.npy"
if safetensors and config.exists():
return safetensors[0], config, style if style.exists() else None
raise FileNotFoundError(
f"No .safetensors files found in {model_dir} or its subdirectories"
)
def _load_model(model_dir: Path, device: str) -> TTSModel:
    """Build a TTSModel from the files in ``model_dir`` and load its weights.

    After loading, the model's private network attribute is cast to float32
    if it is present.

    Args:
        model_dir: Directory searched by ``_find_model_files``.
        device: Device string handed to ``TTSModel``.

    Returns:
        The loaded model.
    """
    weights, config, style_vectors = _find_model_files(model_dir)

    _LOGGER.info("Creating TTSModel (model=%s, config=%s, device=%s)",
                 weights.name, config.name, device)
    tts = TTSModel(
        model_path=weights,
        config_path=config,
        style_vec_path=style_vectors,
        device=device,
    )

    _LOGGER.info("Loading model weights...")
    tts.load()

    # Name-mangled form of TTSModel's private `__net_g` attribute.
    # NOTE(review): this depends on style_bert_vits2 internals - re-check when
    # upgrading the library.
    mangled = "_TTSModel__net_g"
    generator = getattr(tts, mangled, None)
    if generator is not None:
        setattr(tts, mangled, generator.float())
        _LOGGER.info("TTS network cast to float32")

    _LOGGER.info("Model loaded successfully")
    return tts
class GLaDOSEventHandler(AsyncEventHandler):
    """Wyoming event handler that answers Describe and Synthesize events.

    The TTS model lives in the module-level ``_MODEL`` and is loaded on the
    first synthesis request. ``_VOICE_LOCK`` is held for loading and inference,
    so only one synthesis runs at a time.
    """

    def __init__(
        self,
        wyoming_info: Info,
        model_dir: Path,
        device: str,
        *args,
        **kwargs,
    ) -> None:
        """Store the configuration; remaining arguments go to the base class.

        Args:
            wyoming_info: Service description sent in reply to Describe.
            model_dir: Directory containing the model files.
            device: Device string used for the model.
        """
        super().__init__(*args, **kwargs)
        # Pre-built once; the same event is sent for every Describe.
        self.wyoming_info_event = wyoming_info.event()
        self.model_dir = model_dir
        self.device = device

    async def handle_event(self, event: Event) -> bool:
        """Dispatch one incoming event.

        Describe is answered with the info event, Synthesize is synthesized,
        and any other event type is ignored. Always returns True.
        """
        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            return True

        if not Synthesize.is_type(event.type):
            return True

        synthesize = Synthesize.from_event(event)
        return await self._handle_synthesize(synthesize)

    async def _handle_synthesize(self, synthesize: Synthesize) -> bool:
        """Synthesize ``synthesize.text`` and stream it back as audio events.

        Writes AudioStart, a series of AudioChunk events and AudioStop. On any
        failure the exception is logged and an Error event is written instead.
        Blank text produces no events. Always returns True.
        """
        global _MODEL

        text = synthesize.text.strip()
        if not text:
            return True

        language = _detect_language(text)
        speaker_id = 0
        style = "Neutral"

        if synthesize.voice is not None and synthesize.voice.speaker:
            try:
                speaker_id = int(synthesize.voice.speaker)
            except ValueError:
                # Keep the default speaker, but leave a trace of the bad input.
                _LOGGER.warning("Ignoring non-numeric speaker '%s'; using speaker %s",
                                synthesize.voice.speaker, speaker_id)

        _LOGGER.info("Synthesizing: text='%s' language=%s speaker=%s style=%s",
                     text[:80], language.name, speaker_id, style)

        try:
            async with _VOICE_LOCK:
                # Loading is blocking work, so it runs in a worker thread just
                # like inference. Running it directly in this coroutine would
                # stall the event loop, and with it every other connection,
                # for the whole load.
                if _MODEL is None:
                    _LOGGER.info("Loading GLaDOS model from %s on %s",
                                 self.model_dir, self.device)
                    _MODEL = await asyncio.to_thread(
                        _load_model, self.model_dir, self.device
                    )

                await asyncio.to_thread(
                    _load_bert_for_language, language, self.device
                )

                sr, audio = await asyncio.to_thread(
                    _MODEL.infer,
                    text=text,
                    language=language,
                    speaker_id=speaker_id,
                    style=style,
                )

            # Clip before the cast so out-of-range samples saturate instead of
            # wrapping around; in-range samples are unaffected.
            # NOTE(review): assumes `audio` is already scaled to the int16
            # range - confirm against TTSModel.infer.
            int16 = np.iinfo(np.int16)
            audio_int16 = np.clip(np.round(audio), int16.min, int16.max).astype(np.int16)
            raw_bytes = audio_int16.tobytes()

            rate = sr
            width = 2      # bytes per sample (int16)
            channels = 1

            await self.write_event(
                AudioStart(rate=rate, width=width, channels=channels).event()
            )

            samples_per_chunk = 1024
            bytes_per_chunk = width * channels * samples_per_chunk
            for offset in range(0, len(raw_bytes), bytes_per_chunk):
                await self.write_event(
                    AudioChunk(
                        audio=raw_bytes[offset:offset + bytes_per_chunk],
                        rate=rate,
                        width=width,
                        channels=channels,
                    ).event()
                )

            await self.write_event(AudioStop().event())
            return True

        except Exception as err:
            _LOGGER.exception("Synthesis failed")
            await self.write_event(
                Error(text=str(err), code=err.__class__.__name__).event()
            )
            return True