From cef3daa1a727f119d65d21b0922424c746a97b29 Mon Sep 17 00:00:00 2001 From: Tater Totterson Date: Sat, 27 Sep 2025 14:55:23 -0500 Subject: [PATCH] cleanup --- dockerfile | 9 +- microWakeWord_training_notebook.ipynb | 277 ++++++++++++++------------ 2 files changed, 152 insertions(+), 134 deletions(-) diff --git a/dockerfile b/dockerfile index e843467..c68809b 100644 --- a/dockerfile +++ b/dockerfile @@ -1,12 +1,14 @@ # CUDA + cuDNN userspace from NVIDIA (Ubuntu 22.04) -FROM nvidia/cuda:12.6.2-cudnn-runtime-ubuntu22.04 - +FROM nvidia/cuda:12.6.2-cudnn-devel-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive \ PYTHONUNBUFFERED=1 \ PIP_NO_CACHE_DIR=1 \ PIP_ROOT_USER_ACTION=ignore \ - HF_HUB_DISABLE_SYMLINKS_WARNING=1 + HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ + XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/local/cuda" \ + PATH="/usr/local/cuda/bin:${PATH}" \ + LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" # System deps (+dev headers for building C/C++ extensions) RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -22,7 +24,6 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \ && update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 # Python deps -# Pre-install numpy (and cython) so native builds (webrtcvad, pymicro_features) have headers ready COPY requirements.txt /tmp/requirements.txt RUN pip install --upgrade pip \ && pip install "numpy==1.26.4" "cython>=0.29.36" \ diff --git a/microWakeWord_training_notebook.ipynb b/microWakeWord_training_notebook.ipynb index 1540f09..41900c6 100644 --- a/microWakeWord_training_notebook.ipynb +++ b/microWakeWord_training_notebook.ipynb @@ -1,27 +1,39 @@ { "cells": [ { - "cell_type": "markdown", - "metadata": { - "id": "r11cNiLqvWC6" - }, + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "
\n", - " \"MicroWakeWord\n", - "

MicroWakeWord Trainer Docker

\n", - "
\n", + "# 🥔 MicroWakeWord Trainer — Tater Totterson Edition\n", + "# ==================================================\n", + "# Welcome, friend! 👋 This notebook will help you train your very own wake word model.\n", + "# Think of it like teaching Tater Totterson to recognize when you say a special word.\n", + "#\n", + "# By the end, you'll have:\n", + "# ✅ A trained TensorFlow Lite model ready for on-device detection.\n", + "# ✅ A matching JSON manifest you can drop straight into ESPHome.\n", + "#\n", + "# This flow is optimized for Python 3.10 and NVIDIA GPUs (but should work elsewhere too).\n", + "# You can customize the wake word, play with training parameters, and experiment with\n", + "# different datasets until you get something that feels just right. 💪\n", + "#\n", + "# ⚡ Quick Tips:\n", + "# • Change TARGET_WORD below to whatever you want your wake word to be.\n", + "# • Rerun the notebook from the top if you change it (to regenerate everything).\n", + "# • Expect to experiment — tweaking hyperparameters is part of the fun!\n", + "#\n", + "# When you’re done, you’ll get two files:\n", + "# 1️⃣ .tflite — your trained model.\n", + "# 2️⃣ .json — a manifest for ESPHome integration.\n", + "#\n", + "# More info & examples:\n", + "# 🔗 https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n", "\n", - "This notebook steps you through training a robust microWakeWord model. It is intended as a **starting point** for users looking to create a high-performance wake word detection model. This notebook is optimized for Python 3.10.\n", - "\n", - "**The model generated from this notebook is designed for practical use, but achieving optimal performance will require experimentation with various settings and datasets. 
The provided scripts and configurations aim to give you a strong foundation to build upon.**\n", - "\n", - "Throughout the notebook, you will find comments suggesting specific settings to modify and experiment with to enhance your model's performance.\n", - "\n", - "By the end of this notebook, you will have:\n", - "- A trained TensorFlow Lite model ready for deployment.\n", - "- A JSON manifest file to integrate the model with ESPHome.\n", - "\n", - "To use the generated model in ESPHome, refer to the [ESPHome documentation](https://esphome.io/components/micro_wake_word) for integration details. You can also explore example configurations in the [model repository](https://github.com/esphome/micro-wake-word-models/tree/main/models/v2)." + "# --- Set your wake word here ---\n", + "TARGET_WORD = \"hey_tater\" # 🗣️ Change this to whatever phrase you want!\n", + "print(f\"🥔 Tater Totterson is listening for: '{TARGET_WORD}'\")" ] }, { @@ -224,7 +236,6 @@ "import sys, subprocess\n", "from pathlib import Path\n", "\n", - "target_word = \"hey_tater\"\n", "REPO_DIR = Path.cwd() / \"piper-sample-generator\"\n", "MODELS_DIR = REPO_DIR / \"models\"\n", "MODEL_NAME = \"en_US-libritts_r-medium.pt\"\n", @@ -661,83 +672,74 @@ }, "outputs": [], "source": [ - "# Save a yaml config that controls the training process\n", - "# These hyperparamters can make a huge different in model quality.\n", - "# Experiment with sampling and penalty weights and increasing the number of\n", - "# training steps.\n", + "# GPU memory config (set env BEFORE importing TF)\n", + "import os, sys, gc\n", + "\n", + "if \"tensorflow\" not in sys.modules:\n", + " os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n", + " os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n", + " os.environ[\"XLA_FLAGS\"] = \"--xla_gpu_cuda_data_dir=/usr/local/cuda\"\n", + " os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n", + 
"import tensorflow as tf\n", + "\n", + "# Per-device memory growth (belt + suspenders)\n", + "for g in tf.config.list_physical_devices(\"GPU\"):\n", + " try:\n", + " tf.config.experimental.set_memory_growth(g, True)\n", + " except Exception:\n", + " pass\n", + "print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n", + "gc.collect()\n", + "\n", + "# Optional but recommended: mixed precision halves activation memory\n", + "try:\n", + " from tensorflow.keras import mixed_precision\n", + " mixed_precision.set_global_policy(\"mixed_float16\")\n", + " print(\"Mixed precision policy:\", mixed_precision.global_policy())\n", + "except Exception as e:\n", + " print(\"Mixed precision not enabled:\", e)\n", + "\n", + "# --- Save a yaml config that controls the training process ---\n", "\n", "import yaml\n", - "import os\n", "\n", "config = {}\n", "\n", "config[\"window_step_ms\"] = 10\n", - "\n", "config[\"train_dir\"] = \"trained_models/wakeword\"\n", "\n", "config[\"features\"] = [\n", - " {\n", - " \"features_dir\": \"generated_augmented_features\",\n", - " \"sampling_weight\": 2.0, # Increased\n", - " \"penalty_weight\": 1.0,\n", - " \"truth\": True,\n", - " \"truncation_strategy\": \"truncate_start\",\n", - " \"type\": \"mmap\",\n", - " },\n", - " {\n", - " \"features_dir\": \"negative_datasets/speech\",\n", - " \"sampling_weight\": 12.0, # Adjusted\n", - " \"penalty_weight\": 1.0,\n", - " \"truth\": False,\n", - " \"truncation_strategy\": \"random\",\n", - " \"type\": \"mmap\",\n", - " },\n", - " {\n", - " \"features_dir\": \"negative_datasets/dinner_party\",\n", - " \"sampling_weight\": 12.0, # Adjusted\n", - " \"penalty_weight\": 1.0,\n", - " \"truth\": False,\n", - " \"truncation_strategy\": \"random\",\n", - " \"type\": \"mmap\",\n", - " },\n", - " {\n", - " \"features_dir\": \"negative_datasets/no_speech\",\n", - " \"sampling_weight\": 5.0, # Balanced\n", - " \"penalty_weight\": 1.0,\n", - " \"truth\": False,\n", - " \"truncation_strategy\": \"random\",\n", - 
" \"type\": \"mmap\",\n", - " },\n", - " {\n", - " \"features_dir\": \"negative_datasets/dinner_party_eval\",\n", - " \"sampling_weight\": 0.0,\n", - " \"penalty_weight\": 1.0,\n", - " \"truth\": False,\n", - " \"truncation_strategy\": \"split\",\n", - " \"type\": \"mmap\",\n", - " },\n", + " {\"features_dir\":\"generated_augmented_features\",\"sampling_weight\":2.0,\"penalty_weight\":1.0,\"truth\":True,\"truncation_strategy\":\"truncate_start\",\"type\":\"mmap\"},\n", + " {\"features_dir\":\"negative_datasets/speech\",\"sampling_weight\":12.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n", + " {\"features_dir\":\"negative_datasets/dinner_party\",\"sampling_weight\":12.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n", + " {\"features_dir\":\"negative_datasets/no_speech\",\"sampling_weight\":5.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n", + " {\"features_dir\":\"negative_datasets/dinner_party_eval\",\"sampling_weight\":0.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"split\",\"type\":\"mmap\"},\n", "]\n", "\n", - "config[\"training_steps\"] = [40000] # Increased\n", + "config[\"training_steps\"] = [40000]\n", "config[\"positive_class_weight\"] = [1]\n", - "config[\"negative_class_weight\"] = [20] # Adjusted\n", - "config[\"learning_rates\"] = [0.001] # Adjusted\n", - "config[\"batch_size\"] = 128\n", + "config[\"negative_class_weight\"] = [20]\n", + "config[\"learning_rates\"] = [0.001]\n", "\n", - "config[\"time_mask_max_size\"] = [0] # Enabled SpecAugment\n", + "# Smaller batch to avoid GPU copy/alloc failures on 3070 laptop VRAM\n", + "config[\"batch_size\"] = 16\n", + "\n", + "# SpecAugment off (as before)\n", + "config[\"time_mask_max_size\"] = [0]\n", "config[\"time_mask_count\"] = [0]\n", "config[\"freq_mask_max_size\"] = [0]\n", "config[\"freq_mask_count\"] = [0]\n", "\n", - 
"config[\"eval_step_interval\"] = 500 # Adjusted\n", - "config[\"clip_duration_ms\"] = 1500 # Increased\n", - "\n", + "config[\"eval_step_interval\"] = 500\n", + "config[\"clip_duration_ms\"] = 1500\n", "config[\"target_minimization\"] = 0.9\n", - "config[\"minimization_metric\"] = None # Updated\n", + "config[\"minimization_metric\"] = None\n", "config[\"maximization_metric\"] = \"average_viable_recall\"\n", "\n", - "with open(os.path.join(\"training_parameters.yaml\"), \"w\") as file:\n", - " documents = yaml.dump(config, file)" + "with open(\"training_parameters.yaml\", \"w\") as f:\n", + " yaml.dump(config, f)\n", + "\n", + "print(\"βœ… Wrote training_parameters.yaml (batch_size=16) with allow_growth, cuda_malloc_async, XLA JIT OFF, mixed precision ON.\")" ] }, { @@ -748,42 +750,55 @@ }, "outputs": [], "source": [ - "# Trains a model. When finished, it will quantize and convert the model to a\n", - "# streaming version suitable for on-device detection.\n", - "# It will resume if stopped, but it will start over at the configured training\n", - "# steps in the yaml file.\n", - "# Change --train 0 to only convert and test the best-weighted model.\n", - "# On Google colab, it doesn't print the mini-batch results, so it may appear\n", - "# stuck for several minutes! 
Additionally, it is very slow compared to training\n", - "# on a local GPU.\n", + "# Train + export (GPU-friendly env + stable flags)\n", "\n", - "import os\n", - "import sys\n", + "import os, sys\n", "\n", - "# Ensure the library path is correctly set\n", - "os.environ['LD_LIBRARY_PATH'] = \"/usr/lib/x86_64-linux-gnu:\" + os.environ.get('LD_LIBRARY_PATH', '')\n", + "# --- Runtime env (inherited by the subprocess we're about to launch) ---\n", + "os.environ.setdefault(\"LD_LIBRARY_PATH\",\n", + " \"/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu:\" +\n", + " os.environ.get(\"LD_LIBRARY_PATH\",\"\")\n", + ")\n", + "os.environ.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\") # quieter logs\n", + "os.environ.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\") # grow VRAM as needed\n", + "os.environ.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")# modern allocator\n", + "os.environ.setdefault(\"XLA_FLAGS\", \"--xla_gpu_cuda_data_dir=/usr/local/cuda\")\n", + "os.environ.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\") # disable XLA JIT (more stable)\n", + "os.environ.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\") # allow TF32 (perf/VRAM win on Ampere+)\n", "\n", - "# Training command with optimized settings\n", - "!\"{sys.executable}\" -m microwakeword.model_train_eval \\\n", - "--training_config='training_parameters.yaml' \\\n", - "--train 1 \\\n", - "--restore_checkpoint 1 \\\n", - "--test_tf_nonstreaming 0 \\\n", - "--test_tflite_nonstreaming 0 \\\n", - "--test_tflite_nonstreaming_quantized 0 \\\n", - "--test_tflite_streaming 0 \\\n", - "--test_tflite_streaming_quantized 1 \\\n", - "--use_weights \"best_weights\" \\\n", - "mixednet \\\n", - "--pointwise_filters \"64,64,64,64\" \\\n", - "--repeat_in_block \"1,1,1,1\" \\\n", - "--mixconv_kernel_sizes '[5], [7,11], [9,15], [23]' \\\n", - "--residual_connection \"0,0,0,0\" \\\n", - "--first_conv_filters 32 \\\n", - "--first_conv_kernel_size 5 \\\n", - "--stride 2\n" + "# If you 
still hit GPU memory errors, uncomment to force a smaller workspace:\n", + "# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n", + "\n", + "# --- Kick off training ---\n", + "cmd = f'''\"{sys.executable}\" -m microwakeword.model_train_eval \\\n", + " --training_config=\"training_parameters.yaml\" \\\n", + " --train 1 \\\n", + " --restore_checkpoint 1 \\\n", + " --test_tf_nonstreaming 0 \\\n", + " --test_tflite_nonstreaming 0 \\\n", + " --test_tflite_nonstreaming_quantized 0 \\\n", + " --test_tflite_streaming 0 \\\n", + " --test_tflite_streaming_quantized 1 \\\n", + " --use_weights \"best_weights\" \\\n", + " mixednet \\\n", + " --pointwise_filters \"64,64,64,64\" \\\n", + " --repeat_in_block \"1,1,1,1\" \\\n", + " --mixconv_kernel_sizes \"[5], [7,11], [9,15], [23]\" \\\n", + " --residual_connection \"0,0,0,0\" \\\n", + " --first_conv_filters 32 \\\n", + " --first_conv_kernel_size 5 \\\n", + " --stride 2'''\n", + "print(\"Running:\\n\", cmd)\n", + "!$cmd" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -794,22 +809,24 @@ "source": [ "import shutil\n", "import json\n", - "from IPython.display import FileLink\n", + "from IPython.display import display, HTML\n", "\n", - "# Define the source path and desired download location for the TFLite file\n", + "# Use the wake word from Cell 3\n", + "wake_word = TARGET_WORD\n", + "\n", + "# --- Copy TFLite file to working dir with wake word name ---\n", "source_path = \"trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite\"\n", - "destination_path = \"./stream_state_internal_quant.tflite\"\n", + "tflite_filename = f\"{wake_word}.tflite\"\n", + "tflite_path = f\"./{tflite_filename}\"\n", + "shutil.copy(source_path, tflite_path)\n", "\n", - "# Copy the TFLite file to the current working directory\n", - "shutil.copy(source_path, destination_path)\n", - "\n", - "# 
Define the JSON file content\n", + "# --- Write JSON metadata file with matching model name ---\n", "json_data = {\n", " \"type\": \"micro\",\n", - " \"wake_word\": \"hey_norman\", # Adjust this if the target_word changes dynamically\n", - " \"author\": \"master phooey\",\n", - " \"website\": \"https://github.com/MasterPhooey/MicroWakeWord-Trainer-Docker\",\n", - " \"model\": \"stream_state_internal_quant.tflite\",\n", + " \"wake_word\": wake_word,\n", + " \"author\": \"Tater Totterson\",\n", + " \"website\": \"https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git\",\n", + " \"model\": tflite_filename,\n", " \"trained_languages\": [\"en\"],\n", " \"version\": 2,\n", " \"micro\": {\n", @@ -820,20 +837,20 @@ " \"minimum_esphome_version\": \"2024.7.0\"\n", " }\n", "}\n", - "\n", - "# Define the JSON file path\n", - "json_path = \"./stream_state_internal_quant.json\"\n", - "\n", - "# Write the JSON file\n", + "json_filename = f\"{wake_word}.json\"\n", + "json_path = f\"./{json_filename}\"\n", "with open(json_path, \"w\") as json_file:\n", " json.dump(json_data, json_file, indent=2)\n", "\n", - "# Generate download links for both files\n", - "print(\"Download your files:\")\n", - "print(\"TFLite Model:\")\n", - "display(FileLink(destination_path))\n", - "print(\"\\nJSON Metadata:\")\n", - "display(FileLink(json_path))" + "# --- Display nice download links ---\n", + "html = f\"\"\"\n", + "

Download your files:

\n", + "\n", + "\"\"\"\n", + "display(HTML(html))" ] } ],