cli + web recorder ui

This commit is contained in:
MasterPhooey
2026-01-17 16:17:21 -06:00
parent b57fcd9b05
commit c52f92d3c9
8 changed files with 332 additions and 273 deletions

View File

@@ -67,42 +67,66 @@ find_rev() {
}
converter() {
source ${DATA_DIR}/.venv/bin/activate
# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"
python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<-EOF
import os, sys, subprocess, scipy.io.wavfile, numpy as np
import os, sys
from pathlib import Path
import soundfile as sf
from datetime import datetime, timezone
import numpy as np
import scipy.io.wavfile
import librosa
from tqdm import tqdm
def write_wav(dst: Path, data: np.ndarray, sr: int):
    """Write floating-point audio to dst as a 16-bit PCM WAV file.

    Args:
        dst: Output file path; missing parent directories are created.
        data: Audio samples, nominally within [-1.0, 1.0].
        sr: Sample rate in Hz, passed through to the WAV writer.

    Samples outside [-1.0, 1.0] are clipped and NaN samples are written as
    silence, so the integer conversion below is always well defined.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    # NaN survives np.clip and has no defined int16 value; zero it first.
    x = np.clip(np.nan_to_num(data, nan=0.0), -1.0, 1.0)
    # Scale by 32767 so that +1.0 maps to the largest int16 value.
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))
audioset_dir = Path(sys.argv[1])
audioset_out = Path(sys.argv[2])
# convert FLAC → 16k mono WAV
flacs = list(audioset_dir.rglob("*.flac"))
print(f" FLAC files: {len(flacs)}")
total = len(flacs)
print(f" FLAC files: {total}")
print(" Converting AudioSet → 16k mono WAV")
print(" Sit tight — this step can take a while.")
print("")
audioset_bad = []
ok = 0
for p in tqdm(flacs, desc=" AudioSet→WAV (resample 16k mono)"):
skipped = 0
START = datetime.now(timezone.utc).replace(microsecond=0)
# Heartbeat interval (prints every N files)
HEARTBEAT_EVERY = 500
for idx, p in enumerate(flacs, start=1):
try:
outfile = Path(audioset_out / (p.stem + ".wav"))
outfile = audioset_out / (p.stem + ".wav")
if outfile.exists():
continue
y, _ = librosa.load(p, sr=16000, mono=True)
if y.size == 0:
raise ValueError("empty audio")
write_wav(outfile, y, 16000)
ok += 1
skipped += 1
else:
y, _ = librosa.load(p, sr=16000, mono=True)
if y.size == 0:
raise ValueError("empty audio")
write_wav(outfile, y, 16000)
ok += 1
except Exception as e:
audioset_bad.append(f"{p}:{e}")
if idx == 1 or (idx % HEARTBEAT_EVERY) == 0 or idx == total:
print(f" Progress: {idx}/{total} (ok={ok}, skipped={skipped}, failed={len(audioset_bad)})")
if audioset_bad:
(audioset_out / "audioset_corrupted_files.log").write_text("\n".join(audioset_bad))
print(f" AudioSet complete ({ok} ok, {len(audioset_bad)} failed)")
END = datetime.now(timezone.utc).replace(microsecond=0)
elapsed = END - START
print("")
print(f" AudioSet complete ({ok} ok, {skipped} skipped, {len(audioset_bad)} failed) Elapsed: {elapsed}")
EOF
}
@@ -110,13 +134,15 @@ expected_filecount=$(get_total_filecount filecounts)
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
write_filecount=false
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
echo " Existing Audioset valid"
# Option B behavior: if we already have output WAVs, don't re-download/re-extract/re-convert
if [ "${actual_filecount}" -ne 0 ] ; then
echo " Existing ${AUDIO16K_DIR} present (${actual_filecount} wav); skipping extract/convert"
else
dl=$(find_rev)
[ -n "$dl" ] || { echo " Could not locate an AudioSet revision with FLAC tarballs still present on HF." ; exit 1 ; }
rev=${dl%%,*}
pattern=${dl##*,}
echo " Checking 10 tarballs"
for i in {0..9} ; do
fname="downloads/bal_train0${i}.tar"
@@ -137,17 +163,16 @@ else
rm -rf "${fname}"
fi
done
rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
converter
if [ -f "${AUDIO16K_DIR}/audioset_corrupted_files.log" ] ; then
failed=$(cat "${AUDIO16K_DIR}/audioset_corrupted_files.log" | wc -l)
filecounts[failed]=-${failed}
fi
# Recompute counts and warn (but do not fail)
expected_filecount=$(get_total_filecount filecounts)
actual_filecount=$(find ${AUDIO16K_DIR} -name "*.wav" 2>/dev/null | wc -l) || :
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
echo " Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
exit 1
echo " WARNING: mismatch is expected if some AudioSet files are corrupted; continuing." >&2
fi
fi
@@ -171,5 +196,4 @@ if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
fi
echo " Audioset complete"
exit 0
exit 0

View File

@@ -8,9 +8,9 @@ if [ ! -v DATA_DIR ] ; then
[ -f .mww-data-dir ] && DATA_DIR="${PWD}" || DATA_DIR="/data"
fi
DEFAULT_SAMPLES=20000
DEFAULT_SAMPLES=50000
DEFAULT_BATCH_SIZE=100
DEFAULT_TRAINING_STEPS=25000
DEFAULT_TRAINING_STEPS=40000
[ -f "${DATA_DIR}/.defaults.env" ] && source "${DATA_DIR}/.defaults.env" || :

View File

@@ -71,17 +71,16 @@ if not files:
max_samples = len(files)
print(f"\n===== Augmenting {max_samples} wake word samples =====")
print(" Initializing libraries")
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
os.environ["TF_XLA_FLAGS"]="--tf_xla_auto_jit=0"
os.environ["NVIDIA_TF32_OVERRIDE"]="1"
os.environ["TF_CUDNN_WORKSPACE_LIMIT_IN_MB"]="512"
os.environ["GLOG_minloglevel"]="9"
os.environ["GRPC_VERBOSITY"]="ERROR"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=0"
os.environ["NVIDIA_TF32_OVERRIDE"] = "1"
os.environ["TF_CUDNN_WORKSPACE_LIMIT_IN_MB"] = "512"
os.environ["GLOG_minloglevel"] = "9"
os.environ["GRPC_VERBOSITY"] = "ERROR"
print(" Loading Tensorflow")
import tensorflow as tf
@@ -98,6 +97,7 @@ gc.collect()
import numpy as np
import librosa
from tqdm import tqdm
from mmap_ninja.ragged import RaggedMmap
from microwakeword.audio.augmentation import Augmentation
from microwakeword.audio.clips import Clips
@@ -108,7 +108,7 @@ START_TIME = datetime.now(timezone.utc).replace(microsecond=0)
# Paths to augmented data
impulse_paths = [ args.mit_rirs_16k_dir ]
background_paths = [ args.fma_16k_dir, args.audioset_16k_dir]
background_paths = [ args.fma_16k_dir, args.audioset_16k_dir ]
clips = Clips(
input_directory=args.input_dir,
@@ -139,8 +139,6 @@ augmenter = Augmentation(
max_jitter_s=0.3,
)
# Augment samples and save the training, validation, and testing sets.
def audio_generator_from_wavs(self, split="train", repeat=1):
"""
Yield 1-D float32 arrays loaded via librosa from input_dir/*.wav.
@@ -175,7 +173,7 @@ def audio_generator_from_wavs(self, split="train", repeat=1):
# Bind the patched generator to your existing `clips` instance
clips.audio_generator = types.MethodType(audio_generator_from_wavs, clips)
# ---- Split config (same as before) ----
# ---- Split config ----
split_cfg = {
"training": {"name": "train", "repetition": 2, "slide_frames": 10},
"validation": {"name": "validation", "repetition": 1, "slide_frames": 10},
@@ -188,28 +186,34 @@ for split, cfg in split_cfg.items():
out_dir.mkdir(parents=True, exist_ok=True)
print(f" Augmenting {split}")
print(f" Generating spectrograms")
print(" Generating spectrograms")
spectros = SpectrogramGeneration(
clips=clips, # now backed by our WAV loader
augmenter=augmenter, # your existing augmenter
clips=clips,
augmenter=augmenter,
slide_frames=cfg["slide_frames"],
step_ms=10,
)
print(f" Generating files")
print(" Generating files")
print(" Sit tight — this step can take a while.")
gen = spectros.spectrogram_generator(
split=cfg["name"],
repeat=cfg["repetition"],
)
RaggedMmap.from_generator(
out_dir=str(out_dir / "wakeword_mmap"),
sample_generator=spectros.spectrogram_generator(
split=cfg["name"], repeat=cfg["repetition"]
),
sample_generator=gen,
batch_size=100,
verbose=False,
verbose=False, # keep mmap quiet
)
print(f" {split} augmentation complete")
END_TIME = datetime.now(timezone.utc).replace(microsecond=0)
et = END_TIME - START_TIME
print(f"\n{'=' * 80}")
msg=f"Augmented {max_samples} wake word samples."
msg = f"Augmented {max_samples} wake word samples."
print(f"{msg:>50s} Elapsed time: {et!s}")
print(f"{'=' * 80}\n")
print(f"{'=' * 80}\n")

View File

@@ -129,88 +129,136 @@ EOF
echo " Wrote training_parameters.yaml"
rm -rf "${WORK_DIR}/trained_models/wakeword"
export TF_CPP_MIN_LOG_LEVEL=9
export TF_FORCE_GPU_ALLOW_GROWTH=true
export TF_GPU_ALLOCATOR=cuda_malloc_async
export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
export NVIDIA_TF32_OVERRIDE=1
export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
export GLOG_minloglevel=9
export GRPC_VERBOSITY=ERROR
echo " Loading Tensorflow"
wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
# Build a filesystem-safe name: each space or shell/path metacharacter in
# WAKE_WORD (including parentheses and square brackets) is replaced by "_".
wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
OUTPUT_DIR="${DATA_DIR}/output/$(date +'%Y-%m-%d-%H-%M-%S')-${wake_word_filename}-${SAMPLES}-${TRAINING_STEPS}"
mkdir -p "${OUTPUT_DIR}/logs" || :
python - \
--training_config="${WORK_DIR}/trained_models/training_parameters.yaml" \
--train 1 \
--restore_checkpoint 1 \
--test_tf_nonstreaming 0 \
--test_tflite_nonstreaming 0 \
--test_tflite_nonstreaming_quantized 0 \
--test_tflite_streaming 0 \
--test_tflite_streaming_quantized 1 \
--use_weights "best_weights" \
mixednet \
--pointwise_filters "64,64,64,64" \
--repeat_in_block "1,1,1,1" \
--mixconv_kernel_sizes "[5], [7,11], [9,15], [23]" \
--residual_connection "0,0,0,0" \
--first_conv_filters 32 \
--first_conv_kernel_size 5 \
--stride 2 <<EOF 2>&1 | tr '\r' '\n' | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" |\
tee "${OUTPUT_DIR}/logs/training.log" | sed -r -e '/^INFO:absl:/!d' \
-r -e "/None|Sharding|unsupported characters|AUC|fingerprint/d" \
-r -e 's/INFO:absl:/ /g' \
-r -e "s/, (recall =|estimated false|average viable recall)/,\n \1/g"
TRAIN_LOG="${OUTPUT_DIR}/logs/training.log"
import sys, os, gc
import runpy
import yaml
print(" Loading Tensorflow")
import tensorflow as tf
# ------------------------------------------------------------------
# Training command line.
# run_attempt passes this array verbatim to the Python interpreter, so
# the leading "-m microwakeword.model_train_eval" selects the module to
# run and everything after it is that module's own argument list.
# NOTE(review): "mixednet" and the flags following it look like the
# model-architecture selection and its options - confirm against the
# microwakeword CLI.
# ------------------------------------------------------------------
TRAIN_ARGS=(
-m microwakeword.model_train_eval
--training_config "${WORK_DIR}/trained_models/training_parameters.yaml"
--train 1
--restore_checkpoint 1
--test_tf_nonstreaming 0
--test_tflite_nonstreaming 0
--test_tflite_nonstreaming_quantized 0
--test_tflite_streaming 0
--test_tflite_streaming_quantized 1
--use_weights best_weights
mixednet
--pointwise_filters "64,64,64,64"
--repeat_in_block "1,1,1,1"
--mixconv_kernel_sizes "[5], [7,11], [9,15], [23]"
--residual_connection "0,0,0,0"
--first_conv_filters 32
--first_conv_kernel_size 5
--stride 2
)
print(" GPU memory config")
# Per-device memory growth (belt + suspenders)
for g in tf.config.list_physical_devices("GPU"):
try:
tf.config.experimental.set_memory_growth(g, True)
except Exception:
pass
print(f"INFO:absl:GPUs: {tf.config.list_physical_devices('GPU')}")
gc.collect()
# ------------------------------------------------------------------
# GPU failure markers that should trigger CPU fallback
# (OOM + known GPU runtime/copy/init failures)
#
# Each entry is matched as a fixed substring against the training log,
# without regard to the log's letter case; keep entries lower-case.
# NOTE(review): matching is by substring, so short entries such as
# "oom", "cudnn" and "cublas" also hit unrelated text (e.g. "room", or
# any log line that merely mentions cuDNN/cuBLAS). That can classify a
# non-GPU failure as a GPU failure and start a CPU retry - consider
# tightening these entries.
# ------------------------------------------------------------------
GPU_FALLBACK_MARKERS=(
"resourceexhaustederror"
"resource exhausted"
"oom"
"out of memory"
"cuda_error_out_of_memory"
"failed to allocate"
"cudnn"
"cublas"
"internalerror: cuda"
"failed call to cuinit"
"dst tensor is not initialized"
"failed copying input tensor"
"_eagerconst"
)
print()
try:
runpy.run_module("microwakeword.model_train_eval", run_name="__main__", alter_sys=True)
except Exception as e:
print(e, file=sys.stderr)
sys.exit(1)
EOF
# run_attempt LABEL
#
# Print a banner for LABEL, then run the trainer (PYTHON_BIN, defaulting to
# "python", with TRAIN_ARGS). Combined stdout/stderr is written to TRAIN_LOG
# (truncating any previous content) and echoed to the terminal.
# Returns the exit status of the trainer itself, not of the filters.
run_attempt() {
  local attempt_title="$1"
  local trainer_status
  local rule="================================================================================"
  shift

  printf '\n%s\n' "${rule}"
  printf '%s\n' "===== ${attempt_title} ====="
  printf '%s\n' "${rule}"
  printf '%s\n\n' "→ ${PYTHON_BIN:-python} ${TRAIN_ARGS[*]}"

  # Carriage-return progress updates become separate lines, per-batch
  # validation lines are dropped before logging, and the absl prefix is
  # stripped only from what reaches the terminal.
  "${PYTHON_BIN:-python}" "${TRAIN_ARGS[@]}" 2>&1 \
    | tr '\r' '\n' \
    | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" \
    | tee "${TRAIN_LOG}" \
    | sed -r -e "/^Validation Batch/d" -e "s/^INFO:absl:/ /g"
  # PIPESTATUS must be read immediately after the pipeline; element 0 is
  # the trainer process.
  trainer_status=${PIPESTATUS[0]}
  return "${trainer_status}"
}
# ---- Common TensorFlow environment ----
# Each variable keeps a value already present in the caller's environment
# and otherwise falls back to the default given after ":-".
export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
# The default here is the literal flag "--tf_xla_auto_jit=0"; the run of
# dashes is ":-" (use default) followed by the flag's own leading "--".
export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
# Attempt 1: GPU
if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
  echo "✅ Training complete (GPU path)."
else
  echo "⚠️ GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…"

  # Search the saved log directly, one marker at a time:
  #   -F fixed string, -i ignore case, -a treat the log as text, -q stop at
  #   the first hit. Reading the file with grep (instead of piping a copy of
  #   the whole log through echo for every marker) avoids holding the log in
  #   a variable and cannot be broken by SIGPIPE when pipefail is enabled.
  # A missing or unreadable log simply counts as "no marker found".
  looks_like_gpu_fail="false"
  for m in "${GPU_FALLBACK_MARKERS[@]}"; do
    if grep -qaiF -e "${m}" -- "${TRAIN_LOG}" 2>/dev/null; then
      looks_like_gpu_fail="true"
      break
    fi
  done

  if [ "${looks_like_gpu_fail}" = "true" ]; then
    echo "↪️ Detected GPU/OOM/runtime failure markers. Falling back to CPU."
    # Attempt 2: CPU (hide GPU completely)
    # An empty CUDA_VISIBLE_DEVICES exposes no CUDA devices to the trainer;
    # the GPU allocator setting is dropped for this attempt.
    # The retry writes to the same TRAIN_LOG, replacing the GPU attempt's log.
    export CUDA_VISIBLE_DEVICES=""
    unset TF_GPU_ALLOCATOR
    if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
      echo "✅ Training complete (CPU fallback)."
    else
      echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
      exit 1
    fi
  else
    echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2
    exit 1
  fi
fi
source_path="${WORK_DIR}/trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite"
if [ ! -f "${source_path}" ] ; then
echo "Output model not found! Training didn't complete successfully. See ${WORK_DIR}/training.log"
echo "Output model not found! Training didn't complete successfully. See ${TRAIN_LOG}"
exit 1
fi
cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/"
cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/"
cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/"
cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/" || :
cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/" || :
cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/" || :
echo -e "\n Training complete!"
echo " Full log: ${OUTPUT_DIR}/logs/training.log"
echo " Full log: ${TRAIN_LOG}"
tflite_filename="${wake_word_filename}.tflite"
tflite_path="${OUTPUT_DIR}/${tflite_filename}"
cp "${source_path}" "${tflite_path}"
# --- Write JSON metadata file with matching model name ---
json_path="${OUTPUT_DIR}/${wake_word_filename}.json"
cat <<-EOF > "${json_path}"
{
@@ -237,5 +285,4 @@ echo "Metadata: ${json_path}"
echo
END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
echo
echo