cli + web recorder ui

2026-06-12 20:10:19 -06:00 · 2026-01-17 16:17:21 -06:00
parent b57fcd9b05
commit c52f92d3c9
8 changed files with 332 additions and 273 deletions
--- a/cli/setup_audioset
+++ b/cli/setup_audioset
@@ -67,42 +67,66 @@ find_rev() {
 }

 converter() {
-    source ${DATA_DIR}/.venv/bin/activate
+    # shellcheck source=/dev/null
+    source "${DATA_DIR}/.venv/bin/activate"
+
    python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<-EOF
-import os, sys, subprocess, scipy.io.wavfile, numpy as np
+import os, sys
 from pathlib import Path
-import soundfile as sf
+from datetime import datetime, timezone
+
+import numpy as np
+import scipy.io.wavfile
 import librosa
-from tqdm import tqdm

 def write_wav(dst: Path, data: np.ndarray, sr: int):
+    dst.parent.mkdir(parents=True, exist_ok=True)
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))

 audioset_dir = Path(sys.argv[1])
 audioset_out = Path(sys.argv[2])

-# convert FLAC → 16k mono WAV
 flacs = list(audioset_dir.rglob("*.flac"))
-print(f"   FLAC files: {len(flacs)}")
+total = len(flacs)
+print(f"   FLAC files: {total}")
+print("   Converting AudioSet → 16k mono WAV")
+print("   Sit tight — this step can take a while.")
+print("")
+
 audioset_bad = []
 ok = 0
-for p in tqdm(flacs, desc="   AudioSet→WAV (resample 16k mono)"):
+skipped = 0
+
+START = datetime.now(timezone.utc).replace(microsecond=0)
+
+# Heartbeat interval (prints every N files)
+HEARTBEAT_EVERY = 500
+
+for idx, p in enumerate(flacs, start=1):
    try:
-        outfile = Path(audioset_out / (p.stem + ".wav"))
+        outfile = audioset_out / (p.stem + ".wav")
        if outfile.exists():
-            continue
-        y, _ = librosa.load(p, sr=16000, mono=True)
-        if y.size == 0:
-            raise ValueError("empty audio")
-        write_wav(outfile, y, 16000)
-        ok += 1
+            skipped += 1
+        else:
+            y, _ = librosa.load(p, sr=16000, mono=True)
+            if y.size == 0:
+                raise ValueError("empty audio")
+            write_wav(outfile, y, 16000)
+            ok += 1
    except Exception as e:
        audioset_bad.append(f"{p}:{e}")

+    if idx == 1 or (idx % HEARTBEAT_EVERY) == 0 or idx == total:
+        print(f"   Progress: {idx}/{total}  (ok={ok}, skipped={skipped}, failed={len(audioset_bad)})")
+
 if audioset_bad:
    (audioset_out / "audioset_corrupted_files.log").write_text("\n".join(audioset_bad))
-print(f"   AudioSet complete ({ok} ok, {len(audioset_bad)} failed)")
+
+END = datetime.now(timezone.utc).replace(microsecond=0)
+elapsed = END - START
+print("")
+print(f"   AudioSet complete ({ok} ok, {skipped} skipped, {len(audioset_bad)} failed)  Elapsed: {elapsed}")
 EOF
 }

@@ -110,13 +134,15 @@ expected_filecount=$(get_total_filecount filecounts)
 actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
 write_filecount=false

-if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
-    echo "   Existing Audioset valid"
+# If any converted WAVs already exist, skip download/extract/convert entirely (the count is not checked against the expected total)
+if [ "${actual_filecount}" -ne 0 ] ; then
+    echo "   Existing ${AUDIO16K_DIR} present (${actual_filecount} wav); skipping extract/convert"
 else
    dl=$(find_rev)
    [ -n "$dl" ] || { echo "   Could not locate an AudioSet revision with FLAC tarballs still present on HF." ; exit 1 ; }
    rev=${dl%%,*}
    pattern=${dl##*,}
+
    echo "   Checking 10 tarballs"
    for i in {0..9} ; do
        fname="downloads/bal_train0${i}.tar"
@@ -137,17 +163,16 @@ else
            rm -rf "${fname}"
        fi
    done
+
    rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
    converter
-    if [ -f "${AUDIO16K_DIR}/audioset_corrupted_files.log" ] ; then
-        failed=$(cat "${AUDIO16K_DIR}/audioset_corrupted_files.log" | wc -l)
-        filecounts[failed]=-${failed}
-    fi
+
+    # Recompute counts and warn (but do not fail)
    expected_filecount=$(get_total_filecount filecounts)
-    actual_filecount=$(find ${AUDIO16K_DIR} -name "*.wav" 2>/dev/null | wc -l) || :
+    actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
    if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
        echo "   Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
-        exit 1
+        echo "   WARNING: mismatch is expected if some AudioSet files are corrupted; continuing." >&2
    fi
 fi

@@ -171,5 +196,4 @@ if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
 fi

 echo "   Audioset complete"
-exit 0
-
+exit 0
--- a/cli/shell.functions
+++ b/cli/shell.functions
@@ -8,9 +8,9 @@ if [ ! -v DATA_DIR ] ; then
    [ -f .mww-data-dir ] && DATA_DIR="${PWD}" || DATA_DIR="/data"
 fi

-DEFAULT_SAMPLES=20000
+DEFAULT_SAMPLES=50000
 DEFAULT_BATCH_SIZE=100
-DEFAULT_TRAINING_STEPS=25000
+DEFAULT_TRAINING_STEPS=40000

 [ -f "${DATA_DIR}/.defaults.env" ] && source "${DATA_DIR}/.defaults.env" || :

--- a/cli/wake_word_sample_augmenter
+++ b/cli/wake_word_sample_augmenter
@@ -71,17 +71,16 @@ if not files:
 max_samples = len(files)

 print(f"\n===== Augmenting {max_samples} wake word samples =====")
-
 print("   Initializing libraries")

-os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"
-os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
-os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
-os.environ["TF_XLA_FLAGS"]="--tf_xla_auto_jit=0"
-os.environ["NVIDIA_TF32_OVERRIDE"]="1"
-os.environ["TF_CUDNN_WORKSPACE_LIMIT_IN_MB"]="512"
-os.environ["GLOG_minloglevel"]="9"
-os.environ["GRPC_VERBOSITY"]="ERROR"
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
+os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
+os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=0"
+os.environ["NVIDIA_TF32_OVERRIDE"] = "1"
+os.environ["TF_CUDNN_WORKSPACE_LIMIT_IN_MB"] = "512"
+os.environ["GLOG_minloglevel"] = "9"
+os.environ["GRPC_VERBOSITY"] = "ERROR"

 print("   Loading Tensorflow")
 import tensorflow as tf
@@ -98,6 +97,7 @@ gc.collect()

 import numpy as np
 import librosa
+from tqdm import tqdm
 from mmap_ninja.ragged import RaggedMmap
 from microwakeword.audio.augmentation import Augmentation
 from microwakeword.audio.clips import Clips
@@ -108,7 +108,7 @@ START_TIME = datetime.now(timezone.utc).replace(microsecond=0)

 # Paths to augmented data
 impulse_paths = [ args.mit_rirs_16k_dir ]
-background_paths = [ args.fma_16k_dir, args.audioset_16k_dir]
+background_paths = [ args.fma_16k_dir, args.audioset_16k_dir ]

 clips = Clips(
    input_directory=args.input_dir,
@@ -139,8 +139,6 @@ augmenter = Augmentation(
    max_jitter_s=0.3,
 )

-# Augment samples and save the training, validation, and testing sets.
-
 def audio_generator_from_wavs(self, split="train", repeat=1):
    """
    Yield 1-D float32 arrays loaded via librosa from input_dir/*.wav.
@@ -175,7 +173,7 @@ def audio_generator_from_wavs(self, split="train", repeat=1):
 # Bind the patched generator to your existing `clips` instance
 clips.audio_generator = types.MethodType(audio_generator_from_wavs, clips)

-# ---- Split config (same as before) ----
+# ---- Split config ----
 split_cfg = {
    "training":   {"name": "train",      "repetition": 2, "slide_frames": 10},
    "validation": {"name": "validation", "repetition": 1, "slide_frames": 10},
@@ -188,28 +186,34 @@ for split, cfg in split_cfg.items():
    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"   Augmenting {split}")

-    print(f"      Generating spectrograms")
+    print("      Generating spectrograms")
    spectros = SpectrogramGeneration(
-        clips=clips,                 # now backed by our WAV loader
-        augmenter=augmenter,         # your existing augmenter
+        clips=clips,
+        augmenter=augmenter,
        slide_frames=cfg["slide_frames"],
        step_ms=10,
    )

-    print(f"      Generating files")
+    print("      Generating files")
+    print("      Sit tight — this step can take a while.")
+
+    gen = spectros.spectrogram_generator(
+        split=cfg["name"],
+        repeat=cfg["repetition"],
+    )
+
    RaggedMmap.from_generator(
        out_dir=str(out_dir / "wakeword_mmap"),
-        sample_generator=spectros.spectrogram_generator(
-            split=cfg["name"], repeat=cfg["repetition"]
-        ),
+        sample_generator=gen,
        batch_size=100,
-        verbose=False,
+        verbose=False,   # keep mmap quiet
    )
+
    print(f"      {split} augmentation complete")

 END_TIME = datetime.now(timezone.utc).replace(microsecond=0)
 et = END_TIME - START_TIME
 print(f"\n{'=' * 80}")
-msg=f"Augmented {max_samples} wake word samples."
+msg = f"Augmented {max_samples} wake word samples."
 print(f"{msg:>50s} Elapsed time: {et!s}")
-print(f"{'=' * 80}\n")
+print(f"{'=' * 80}\n")
--- a/cli/wake_word_sample_trainer
+++ b/cli/wake_word_sample_trainer
@@ -129,88 +129,136 @@ EOF
 echo "   Wrote training_parameters.yaml"
 rm -rf "${WORK_DIR}/trained_models/wakeword"

-export TF_CPP_MIN_LOG_LEVEL=9
-export TF_FORCE_GPU_ALLOW_GROWTH=true
-export TF_GPU_ALLOCATOR=cuda_malloc_async
-export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
-export NVIDIA_TF32_OVERRIDE=1
-export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
-export GLOG_minloglevel=9
-export GRPC_VERBOSITY=ERROR
-
-echo "   Loading Tensorflow"
-
-wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
+wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
 OUTPUT_DIR="${DATA_DIR}/output/$(date +'%Y-%m-%d-%H-%M-%S')-${wake_word_filename}-${SAMPLES}-${TRAINING_STEPS}"
 mkdir -p "${OUTPUT_DIR}/logs" || :

-python - \
-  --training_config="${WORK_DIR}/trained_models/training_parameters.yaml" \
-  --train 1 \
-  --restore_checkpoint 1 \
-  --test_tf_nonstreaming 0 \
-  --test_tflite_nonstreaming 0 \
-  --test_tflite_nonstreaming_quantized 0 \
-  --test_tflite_streaming 0 \
-  --test_tflite_streaming_quantized 1 \
-  --use_weights "best_weights" \
-  mixednet \
-  --pointwise_filters "64,64,64,64" \
-  --repeat_in_block "1,1,1,1" \
-  --mixconv_kernel_sizes "[5], [7,11], [9,15], [23]" \
-  --residual_connection "0,0,0,0" \
-  --first_conv_filters 32 \
-  --first_conv_kernel_size 5 \
-  --stride 2 <<EOF 2>&1 | tr '\r' '\n' | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" |\
-        tee "${OUTPUT_DIR}/logs/training.log" | sed -r -e '/^INFO:absl:/!d' \
-            -r -e "/None|Sharding|unsupported characters|AUC|fingerprint/d" \
-            -r -e 's/INFO:absl:/   /g' \
-            -r -e "s/, (recall =|estimated false|average viable recall)/,\n      \1/g"
+TRAIN_LOG="${OUTPUT_DIR}/logs/training.log"

-import sys, os, gc
-import runpy
-import yaml
-print("   Loading Tensorflow")
-import tensorflow as tf
+# ------------------------------------------------------------------
+# Arguments passed to python: "-m microwakeword.model_train_eval" plus its options
+# ------------------------------------------------------------------
+TRAIN_ARGS=(
+  -m microwakeword.model_train_eval
+  --training_config "${WORK_DIR}/trained_models/training_parameters.yaml"
+  --train 1
+  --restore_checkpoint 1
+  --test_tf_nonstreaming 0
+  --test_tflite_nonstreaming 0
+  --test_tflite_nonstreaming_quantized 0
+  --test_tflite_streaming 0
+  --test_tflite_streaming_quantized 1
+  --use_weights best_weights
+  mixednet
+  --pointwise_filters "64,64,64,64"
+  --repeat_in_block "1,1,1,1"
+  --mixconv_kernel_sizes "[5], [7,11], [9,15], [23]"
+  --residual_connection "0,0,0,0"
+  --first_conv_filters 32
+  --first_conv_kernel_size 5
+  --stride 2
+)

-print("   GPU memory config")
-# Per-device memory growth (belt + suspenders)
-for g in tf.config.list_physical_devices("GPU"):
-    try:
-        tf.config.experimental.set_memory_growth(g, True)
-    except Exception:
-        pass
-print(f"INFO:absl:GPUs: {tf.config.list_physical_devices('GPU')}")
-gc.collect()
+# ------------------------------------------------------------------
+# GPU failure markers that should trigger CPU fallback
+# (OOM + known GPU runtime/copy/init failures)
+# ------------------------------------------------------------------
+GPU_FALLBACK_MARKERS=(
+  "resourceexhaustederror"
+  "resource exhausted"
+  "oom"
+  "out of memory"
+  "cuda_error_out_of_memory"
+  "failed to allocate"
+  "cudnn"
+  "cublas"
+  "internalerror: cuda"
+  "failed call to cuinit"
+  "dst tensor is not initialized"
+  "failed copying input tensor"
+  "_eagerconst"
+)

-print()
-try:
-    runpy.run_module("microwakeword.model_train_eval", run_name="__main__", alter_sys=True)
-except Exception as e:
-    print(e, file=sys.stderr)
-    sys.exit(1)
-EOF
+run_attempt() {
+  # Run one training attempt; $1 is the banner label. Returns python's exit status.
+  local label="$1"; shift
+  echo
+  echo "================================================================================"
+  echo "===== ${label} ====="
+  echo "================================================================================"
+  echo "→ ${PYTHON_BIN:-python} ${TRAIN_ARGS[*]}"
+  echo
+
+  # Append (-a) so a later attempt does not erase the earlier attempt's output from the log.
+  "${PYTHON_BIN:-python}" "${TRAIN_ARGS[@]}" 2>&1 \
+    | tr '\r' '\n' \
+    | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" \
+    | tee -a "${TRAIN_LOG}" \
+    | sed -r -e "s/^INFO:absl:/   /g"
+  # PIPESTATUS[0] is the python process, not the trailing sed.
+  return "${PIPESTATUS[0]}"
+}
+
+# ---- Common TF env (values already set by the caller take precedence over these defaults) ----
+export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
+export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
+export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
+export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
+export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
+
+# Attempt 1: GPU
+if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
+  echo "✅ Training complete (GPU path)."
+else
+  echo "⚠️  GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…"
+
+  # Check log for GPU/OOM/runtime markers
+  log_lc="$(tr '[:upper:]' '[:lower:]' < "${TRAIN_LOG}" || true)"
+  looks_like_gpu_fail="false"
+  for m in "${GPU_FALLBACK_MARKERS[@]}"; do
+    if echo "${log_lc}" | grep -qF "${m}"; then
+      looks_like_gpu_fail="true"
+      break
+    fi
+  done
+
+  if [ "${looks_like_gpu_fail}" = "true" ]; then
+    echo "↪️  Detected GPU/OOM/runtime failure markers. Falling back to CPU."
+
+    # Attempt 2: CPU (hide GPU completely)
+    export CUDA_VISIBLE_DEVICES=""
+    unset TF_GPU_ALLOCATOR
+    if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
+      echo "✅ Training complete (CPU fallback)."
+    else
+      echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
+      exit 1
+    fi
+  else
+    echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2
+    exit 1
+  fi
+fi

 source_path="${WORK_DIR}/trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite"

 if [ ! -f "${source_path}" ] ; then
-    echo "Output model not found! Training didn't complete successfully.  See ${WORK_DIR}/training.log"
+    echo "Output model not found! Training didn't complete successfully.  See ${TRAIN_LOG}"
    exit 1
 fi

-cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/"
-cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/"
-cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/"
+cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/" || :
+cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/" || :
+cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/" || :

 echo -e "\n   Training complete!"
-echo "   Full log: ${OUTPUT_DIR}/logs/training.log"
+echo "   Full log: ${TRAIN_LOG}"

 tflite_filename="${wake_word_filename}.tflite"
 tflite_path="${OUTPUT_DIR}/${tflite_filename}"

 cp "${source_path}" "${tflite_path}"

-# --- Write JSON metadata file with matching model name ---
 json_path="${OUTPUT_DIR}/${wake_word_filename}.json"
 cat <<-EOF > "${json_path}"
 {
@@ -237,5 +285,4 @@ echo "Metadata: ${json_path}"
 echo
 END_TS=$EPOCHSECONDS
 print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
-echo
-
+echo