From d5dcfbf5f10591acad2820c7f668ddc639ec9988 Mon Sep 17 00:00:00 2001 From: joopd Date: Wed, 25 Feb 2026 09:57:46 +0100 Subject: [PATCH 1/2] feat: add --language flag for non-English TTS voices (Dutch support) - Add LANGUAGE default (en) to shell.functions - setup_python_venv downloads Dutch ONNX voices (pim, ronnie, nathalie) - wake_word_sample_generator uses multiple --model flags for single-speaker voices, cycling between them for variety - train_wake_word accepts and passes --language through the pipeline - recorder_server.py accepts language in session API - Web UI adds language dropdown (English/Dutch) --- cli/setup_python_venv | 28 ++++++++++++++ cli/shell.functions | 2 + cli/wake_word_sample_generator | 71 +++++++++++++++++++++++++++------- recorder_server.py | 12 +++++- static/index.html | 11 +++++- train_wake_word | 7 +++- 6 files changed, 113 insertions(+), 18 deletions(-) mode change 100644 => 100755 train_wake_word diff --git a/cli/setup_python_venv b/cli/setup_python_venv index 8d82211..1386497 100755 --- a/cli/setup_python_venv +++ b/cli/setup_python_venv @@ -157,6 +157,10 @@ pip_install -e "${PSG}" git -C tools/piper-sample-generator clean -fd &>/dev/null MODELS_DIR="${PSG}/models" +VOICES_DIR="${PSG}/voices" +mkdir -p "${VOICES_DIR}" + +# --- English generator model (multi-speaker, used with --language=en) --- MODEL_NAME="en_US-libritts_r-medium.pt" MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}" MODEL_URL="https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/${MODEL_NAME}" @@ -170,6 +174,30 @@ if [ ! -f "${MODEL_FILE}.json" ] ; then curl -sfL "${MODEL_URL}.json" -o "${MODEL_FILE}.json" fi +# --- Dutch ONNX voices (single-speaker, used with --language=nl) --- +# Working Dutch voices: pim, ronnie (nl_NL) and nathalie (nl_BE). +# nl_NL-mls-medium is intentionally excluded (known Piper issue: outputs gibberish). +HF_VOICES="https://huggingface.co/rhasspy/piper-voices/resolve/main" +declare -a NL_VOICES=( + "nl/nl_NL/pim/medium/nl_NL-pim-medium" + "nl/nl_NL/ronnie/medium/nl_NL-ronnie-medium" + "nl/nl_BE/nathalie/medium/nl_BE-nathalie-medium" +) +echo " ===== Checking Dutch Piper voices =====" +for voice_path in "${NL_VOICES[@]}" ; do + voice_name="$(basename "${voice_path}")" + onnx_file="${VOICES_DIR}/${voice_name}.onnx" + json_file="${VOICES_DIR}/${voice_name}.onnx.json" + if [ ! -f "${onnx_file}" ] ; then + echo " Downloading ${voice_name}.onnx" + curl -sfL "${HF_VOICES}/${voice_path}.onnx?download=true" -o "${onnx_file}" + fi + if [ ! -f "${json_file}" ] ; then + echo " Downloading ${voice_name}.onnx.json" + curl -sfL "${HF_VOICES}/${voice_path}.onnx.json?download=true" -o "${json_file}" + fi +done + ${GPU} && onnxgpu='-gpu[cuda]' || onnxgpu="" echo " ===== Installing onnxruntime${onnxgpu} =====" pip_install "onnxruntime${onnxgpu}>=1.16.0" diff --git a/cli/shell.functions b/cli/shell.functions index f933d46..01c63b0 100644 --- a/cli/shell.functions +++ b/cli/shell.functions @@ -11,12 +11,14 @@ fi DEFAULT_SAMPLES=50000 DEFAULT_BATCH_SIZE=100 DEFAULT_TRAINING_STEPS=40000 +DEFAULT_LANGUAGE=en [ -f "${DATA_DIR}/.defaults.env" ] && source "${DATA_DIR}/.defaults.env" || : : "${SAMPLES:=${DEFAULT_SAMPLES}}" : "${BATCH_SIZE:=${DEFAULT_BATCH_SIZE}}" : "${TRAINING_STEPS:=${DEFAULT_TRAINING_STEPS}}" +: "${LANGUAGE:=${DEFAULT_LANGUAGE}}" : "${CLEANUP_WORK_DIR:=false}" : "${CLEANUP_ARCHIVES:=false}" : "${CLEANUP_INTERMEDIATE_FILES:=false}" diff --git a/cli/wake_word_sample_generator b/cli/wake_word_sample_generator index 3afcd6c..4166e75 100755 --- a/cli/wake_word_sample_generator +++ b/cli/wake_word_sample_generator @@ -4,7 +4,7 @@ set -e PROGPATH=$(realpath "$0") PROGDIR=$(dirname "${PROGPATH}") -KNOWN_ARGS=( samples batch-size data-dir ) +KNOWN_ARGS=( samples batch-size data-dir language ) source "${PROGDIR}/shell.functions" WAKE_WORD="${POSITIONAL_ARGS[0]}" @@ -15,7 +15,8 @@ fi if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then cat <&2 -Usage: $0 [ --samples= ] [ --batch-size= ] +Usage: $0 [ --samples= ] [ --batch-size= ] + [ --language= ] --samples: The number of samples to generate for the wake word. Default: ${DEFAULT_SAMPLES} @@ -24,6 +25,12 @@ Usage: $0 [ --samples= ] [ --batch-size= ] samples, the more memory is needed. Default: ${DEFAULT_BATCH_SIZE} +--language: Language for TTS voice selection. + "en" uses the multi-speaker LibriTTS-R generator. + Other languages (e.g. "nl") use single-speaker ONNX + voices and cycle between them for variety. + Default: ${DEFAULT_LANGUAGE} + The word to generate samples for. Required. @@ -40,21 +47,60 @@ cd "${WORK_DIR}" PSG="${DATA_DIR}/tools/piper-sample-generator" MODELS_DIR="${PSG}/models" -MODEL_NAME=en_US-libritts_r-medium.pt -MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}" +VOICES_DIR="${PSG}/voices" SAMPLES_DIR="${WORK_DIR}/wake_word_samples" mkdir -p "${SAMPLES_DIR}" || : +# --------------------------------------------------------------------------- +# Build the --model argument(s) based on language +# --------------------------------------------------------------------------- +declare -a MODEL_ARGS=() +MODEL_TAG="" + +if [ "${LANGUAGE}" == "en" ] ; then + # English: use the multi-speaker LibriTTS-R generator (.pt) + MODEL_NAME="en_US-libritts_r-medium.pt" + MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}" + if [ ! -f "${MODEL_FILE}" ] ; then + echo "ERROR: English model ${MODEL_FILE} not found. Run setup_python_venv first." >&2 + exit 1 + fi + MODEL_ARGS=( --model "${MODEL_FILE}" ) + MODEL_TAG="${MODEL_NAME}" +else + # Non-English: find all ONNX voices matching the language prefix + # e.g. LANGUAGE=nl matches nl_NL-pim-medium.onnx, nl_BE-nathalie-medium.onnx, etc. + shopt -s nullglob + voice_files=( "${VOICES_DIR}/${LANGUAGE}"_*.onnx ) + shopt -u nullglob + + if [ ${#voice_files[@]} -eq 0 ] ; then + echo "ERROR: No ONNX voice files found for language '${LANGUAGE}' in ${VOICES_DIR}/" >&2 + echo " Expected files matching: ${LANGUAGE}_*.onnx" >&2 + echo " Run setup_python_venv to download voice models." >&2 + exit 1 + fi + + echo " Using ${#voice_files[@]} voice(s) for language '${LANGUAGE}':" + MODEL_TAG="${LANGUAGE}" + for vf in "${voice_files[@]}" ; do + vname="$(basename "${vf}")" + echo " - ${vname}" + MODEL_ARGS+=( --model "${vf}" ) + MODEL_TAG="${MODEL_TAG}+${vname}" + done +fi + REGENERATE=false if [ "${SAMPLES}" -eq 1 ] ; then - echo "===== Generating ${SAMPLES} sample of '${WAKE_WORD}' =====" - wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}" + echo "===== Generating ${SAMPLES} sample of '${WAKE_WORD}' (language=${LANGUAGE}) =====" + wake_word_filename="${WAKE_WORD//[ \`~\!@#\$%^&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}" mkdir -p "${WORK_DIR}/test_sample" || : "${PSG}/generate_samples.py" "${WAKE_WORD}" \ - --model "${MODEL_FILE}" \ + "${MODEL_ARGS[@]}" \ --max-samples ${SAMPLES} \ --batch-size ${BATCH_SIZE} \ --output-dir "${WORK_DIR}/test_sample" \ @@ -65,9 +111,9 @@ if [ "${SAMPLES}" -eq 1 ] ; then exit 0 fi -grep -q "${WAKE_WORD}:${SAMPLES}:${MODEL_NAME}" "${WORK_DIR}/last_wake_word" &>/dev/null || REGENERATE=true +grep -q "${WAKE_WORD}:${SAMPLES}:${MODEL_TAG}" "${WORK_DIR}/last_wake_word" &>/dev/null || REGENERATE=true -# Double check that the number of existing samples matches SAMPLES" +# Double check that the number of existing samples matches SAMPLES existing_samples=$(find "${SAMPLES_DIR}" -name '*.wav' | wc -l) [ "${existing_samples}" -eq "${SAMPLES}" ] || REGENERATE=true @@ -79,7 +125,7 @@ if ! ${REGENERATE} ; then exit 0 fi -echo -e "\n===== Generating ${SAMPLES} wake word samples in batches of ${BATCH_SIZE} =====" +echo -e "\n===== Generating ${SAMPLES} wake word samples in batches of ${BATCH_SIZE} (language=${LANGUAGE}) =====" export TF_CPP_MIN_LOG_LEVEL=9 export TF_FORCE_GPU_ALLOW_GROWTH=true export TF_GPU_ALLOCATOR=cuda_malloc_async @@ -93,7 +139,7 @@ echo " Generating samples" rm -rf "${SAMPLES_DIR}" || : mkdir -p "${SAMPLES_DIR}" || : "${PSG}/generate_samples.py" "${WAKE_WORD}" \ - --model "${MODEL_FILE}" \ + "${MODEL_ARGS[@]}" \ --max-samples ${SAMPLES} \ --batch-size ${BATCH_SIZE} \ --output-dir "${SAMPLES_DIR}" 2>&1 | sed -r -e "s/(DEBUG|INFO):__main__:/ /g" @@ -103,8 +149,7 @@ if [ "${generated_files}" -ne "${SAMPLES}" ] ; then echo "ERROR: only generated ${generated_files} files" >&2 exit 1 fi -END_TS=$(date +%s.%N) -echo "${WAKE_WORD}:${SAMPLES}:${MODEL_NAME}" > "${WORK_DIR}/last_wake_word" +echo "${WAKE_WORD}:${SAMPLES}:${MODEL_TAG}" > "${WORK_DIR}/last_wake_word" echo END_TS=$EPOCHSECONDS print_elapsed_time "${START_TS}" "${END_TS}" "Generated ${SAMPLES} wake word samples." diff --git a/recorder_server.py b/recorder_server.py index b0f82f0..ffc6f91 100644 --- a/recorder_server.py +++ b/recorder_server.py @@ -35,6 +35,8 @@ TRAIN_CMD = os.environ.get( f"source '{DATA_DIR}/.venv/bin/activate' && train_wake_word --data-dir '{DATA_DIR}'" ) +DEFAULT_LANGUAGE = os.environ.get("MWW_LANGUAGE", "en") + TAKES_PER_SPEAKER_DEFAULT = int(os.environ.get("REC_TAKES_PER_SPEAKER", "10")) SPEAKERS_TOTAL_DEFAULT = int(os.environ.get("REC_SPEAKERS_TOTAL", "1")) @@ -60,6 +62,7 @@ def safe_name(raw: str) -> str: STATE: Dict[str, Any] = { "raw_phrase": None, "safe_word": None, + "language": DEFAULT_LANGUAGE, "speakers_total": SPEAKERS_TOTAL_DEFAULT, "takes_per_speaker": TAKES_PER_SPEAKER_DEFAULT, @@ -372,6 +375,7 @@ def _normalize_output_artifacts(safe_word: str, log_path: Path) -> None: def _run_training_background(safe_word: str, allow_no_personal: bool): with STATE_LOCK: raw_phrase = STATE.get("raw_phrase") or "" + language = STATE.get("language") or DEFAULT_LANGUAGE wake_word_title = _title_from_phrase(raw_phrase) @@ -403,9 +407,9 @@ def _run_training_background(safe_word: str, allow_no_personal: bool): _ensure_training_datasets(log_path) if wake_word_title: - cmd_str = f"{TRAIN_CMD} '{safe_word}' '{wake_word_title}'" + cmd_str = f"{TRAIN_CMD} --language='{language}' '{safe_word}' '{wake_word_title}'" else: - cmd_str = f"{TRAIN_CMD} '{safe_word}'" + cmd_str = f"{TRAIN_CMD} --language='{language}' '{safe_word}'" env = os.environ.copy() env["MWW_ALLOW_NO_PERSONAL"] = "true" if allow_no_personal else "false" @@ -470,6 +474,7 @@ def start_session(payload: Dict[str, Any]): speakers_total = int(payload.get("speakers_total") or SPEAKERS_TOTAL_DEFAULT) takes_per_speaker = int(payload.get("takes_per_speaker") or TAKES_PER_SPEAKER_DEFAULT) + language = (payload.get("language") or DEFAULT_LANGUAGE).strip().lower() speakers_total = max(1, min(10, speakers_total)) takes_per_speaker = max(1, min(50, takes_per_speaker)) @@ -477,6 +482,7 @@ def start_session(payload: Dict[str, Any]): with STATE_LOCK: STATE["raw_phrase"] = raw STATE["safe_word"] = safe + STATE["language"] = language STATE["speakers_total"] = speakers_total STATE["takes_per_speaker"] = takes_per_speaker STATE["takes_received"] = 0 @@ -491,6 +497,7 @@ def start_session(payload: Dict[str, Any]): "ok": True, "raw_phrase": raw, "safe_word": safe, + "language": language, "speakers_total": speakers_total, "takes_per_speaker": takes_per_speaker, "takes_total": speakers_total * takes_per_speaker, @@ -506,6 +513,7 @@ def get_session(): "ok": True, "raw_phrase": STATE["raw_phrase"], "safe_word": STATE["safe_word"], + "language": STATE["language"], "speakers_total": STATE["speakers_total"], "takes_per_speaker": STATE["takes_per_speaker"], "takes_received": STATE["takes_received"], diff --git a/static/index.html b/static/index.html index ef9e449..4b57012 100644 --- a/static/index.html +++ b/static/index.html @@ -178,6 +178,12 @@
+ @@ -727,13 +733,14 @@ speakersTotal = parseInt($("speakersTotal").value || "1", 10); takesPerSpeaker = parseInt($("takesPerSpeaker").value || "10", 10); + const language = $("language").value || "en"; try { setPill($("sessionPill"), "Starting…", "warn"); const data = await api("/api/start_session", { method: "POST", headers: {"Content-Type":"application/json"}, - body: JSON.stringify({ phrase, speakers_total: speakersTotal, takes_per_speaker: takesPerSpeaker }) + body: JSON.stringify({ phrase, speakers_total: speakersTotal, takes_per_speaker: takesPerSpeaker, language }) }); session = data; @@ -755,7 +762,7 @@ await stopMicNow(); - setPill($("sessionPill"), `Session: ${data.safe_word}`, "ok"); + setPill($("sessionPill"), `Session: ${data.safe_word} (${data.language || "en"})`, "ok"); $("beginBtn").disabled = false; $("resetBtn").disabled = false; $("trainBtn").disabled = false; diff --git a/train_wake_word b/train_wake_word old mode 100644 new mode 100755 index 5546603..5fb7e99 --- a/train_wake_word +++ b/train_wake_word @@ -5,7 +5,7 @@ PROGPATH=$(realpath "$0") PROGDIR=$(dirname "${PROGPATH}") CLIDIR="${PROGDIR}/cli" -KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir ) +KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir language ) source "${CLIDIR}/shell.functions" WAKE_WORD=${POSITIONAL_ARGS[0]} @@ -18,6 +18,7 @@ if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then cat <&2 Usage: train_wake_word [ --samples= ] [ --batch-size= ] [ --training-steps= ] [ --cleanup-work-dir ] + [ --language= ] [ ] Options: @@ -35,6 +36,9 @@ Options: --cleanup-work-dir: Delete the /data/work directory after successful training. Default: false +--language: Language for TTS voice selection (e.g. "en", "nl"). + Default: ${DEFAULT_LANGUAGE} + The word to train spelled phonetically. Required. @@ -88,6 +92,7 @@ export GRPC_VERBOSITY=ERROR "${CLIDIR}/wake_word_sample_generator" \ --samples=${SAMPLES} \ --batch-size=${BATCH_SIZE} \ + --language="${LANGUAGE}" \ --data-dir="${DATA_DIR}" "${WAKE_WORD}" POST_GEN_TS=$EPOCHSECONDS From 900f7410d79541a4cee3f45c4669c1d2ac9d89b0 Mon Sep 17 00:00:00 2001 From: joopd Date: Wed, 25 Feb 2026 10:38:10 +0100 Subject: [PATCH 2/2] chore: add data/ to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9223777..73999fe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ personal_samples/* +data/ .DS_Store \ No newline at end of file