cli + web recorder ui

2026-06-12 20:10:19 -06:00 · 2026-01-17 01:23:51 -06:00
parent b700bf095c
commit 5bc0f12a7f
24 changed files with 2002 additions and 2033 deletions
--- a/cli/.DS_Store
+++ b/cli/.DS_Store
--- a/cli/.bashrc
+++ b/cli/.bashrc
@@ -1,135 +0,0 @@
-# ~/.bashrc: executed by bash(1) for non-login shells.
-# see /usr/share/doc/bash/examples/startup-files (in the package bash-doc)
-# for examples
-
-# If not running interactively, don't do anything
-[ -z "$PS1" ] && return
-
-# don't put duplicate lines in the history. See bash(1) for more options
-# ... or force ignoredups and ignorespace
-HISTCONTROL=ignoredups:ignorespace
-
-# append to the history file, don't overwrite it
-shopt -s histappend
-
-# for setting history length see HISTSIZE and HISTFILESIZE in bash(1)
-HISTSIZE=1000
-HISTFILESIZE=2000
-
-# check the window size after each command and, if necessary,
-# update the values of LINES and COLUMNS.
-shopt -s checkwinsize
-
-# make less more friendly for non-text input files, see lesspipe(1)
-[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)"
-
-# set variable identifying the chroot you work in (used in the prompt below)
-if [ -z "$debian_chroot" ] && [ -r /etc/debian_chroot ]; then
-    debian_chroot=$(cat /etc/debian_chroot)
-fi
-
-# set a fancy prompt (non-color, unless we know we "want" color)
-case "$TERM" in
-    xterm-color) color_prompt=yes;;
-esac
-
-# uncomment for a colored prompt, if the terminal has the capability; turned
-# off by default to not distract the user: the focus in a terminal window
-# should be on the output of commands, not on the prompt
-#force_color_prompt=yes
-
-if [ -n "$force_color_prompt" ]; then
-    if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then
-	# We have color support; assume it's compliant with Ecma-48
-	# (ISO/IEC-6429). (Lack of such support is extremely rare, and such
-	# a case would tend to support setf rather than setaf.)
-	color_prompt=yes
-    else
-	color_prompt=
-    fi
-fi
-
-if [ "$color_prompt" = yes ]; then
-    PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
-else
-    PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
-fi
-unset color_prompt force_color_prompt
-
-# If this is an xterm set the title to user@host:dir
-case "$TERM" in
-xterm*|rxvt*)
-    PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
-    ;;
-*)
-    ;;
-esac
-
-# enable color support of ls and also add handy aliases
-if [ -x /usr/bin/dircolors ]; then
-    test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
-    alias ls='ls --color=auto'
-    #alias dir='dir --color=auto'
-    #alias vdir='vdir --color=auto'
-
-    alias grep='grep --color=auto'
-    alias fgrep='fgrep --color=auto'
-    alias egrep='egrep --color=auto'
-fi
-
-# some more ls aliases
-alias ll='ls -alF'
-alias la='ls -A'
-alias l='ls -CF'
-
-# Alias definitions.
-# You may want to put all your additions into a separate file like
-# ~/.bash_aliases, instead of adding them here directly.
-# See /usr/share/doc/bash-doc/examples in the bash-doc package.
-
-if [ -f ~/.bash_aliases ]; then
-    . ~/.bash_aliases
-fi
-
-# enable programmable completion features (you don't need to enable
-# this, if it's already enabled in /etc/bash.bashrc and /etc/profile
-# sources /etc/bash.bashrc).
-#if [ -f /etc/bash_completion ] && ! shopt -oq posix; then
-#    . /etc/bash_completion
-#fi
-
-if [ -f /data/.bashrc ]; then
-    . /data/.bashrc
-fi
-
-if ! mountpoint -q /data ; then
-    cat <<-EOF >&2
-    =======================================================
-    WARNING: The /data directory is NOT mounted.
-    Running the training process without /data mounted
-    could add over 140Gb of python packages and training
-    files to this container's storage which is probably
-    NOT what you want.
-
-    You should remove this container and re-create it with
-    a 'docker run' option like '-v <host_work_dir>:/data'
-    making sure the host directory is on a device that has
-    enough free space.
-    =======================================================
-EOF
-fi
-
-if [ -d /data/.venv ]; then
-    . /data/.venv/bin/activate
-else
-    cat <<-EOF >&2
-    =======================================================
-    WARNING: A python virtual environment wasn't found
-    at /data/.venv.  You'll need to run 'setup_python_venv'
-    before you'll be able to use this container for
-    training.
-    =======================================================
-EOF
-
-fi
-alias venv='[ -d /data/.venv ] && source /data/.venv/bin/activate || echo "/data/.venv does not exist yet"'
--- a/cli/Dockerfile
+++ b/cli/Dockerfile
@@ -1,27 +0,0 @@
-# Since this is a pure python environment, we don't need to start
-# with a huge CUDA image.  A standard Ubuntu image will do.
-FROM ubuntu:24.04
-
-ENV DEBIAN_FRONTEND=noninteractive \
-    PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
-    PIP_ROOT_USER_ACTION=ignore \
-    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
-    PATH="/root/mww-scripts:${PATH}"
-
-# System deps
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
-    git wget curl unzip ca-certificates nano less \
- && rm -rf /var/lib/apt/lists/* \
- && mkdir -p /data
-
-COPY --chown=root:root --chmod=0755 .bashrc /root/
-COPY --chown=root:root --chmod=0755 setup_* wake_word_sample* train_wake_word \
-        test_python cudainfo system_summary shell.functions requirements.txt /root/mww-scripts/
-
-# Docker and Podman send the CMD a SIGTERM when you "stop" the container.  Unfortunately, bash
-# normally doesn't exit when it recieves a SIGTERM so docker/podman has to wait for the "stop"
-# to timeout then SIGKILL the container.
-# This little scriptlet causes bash to exit immediately when it receives the SIGTERM.
-CMD ["/usr/bin/bash", "-c", "exec /usr/bin/bash --rcfile <(echo '[ -f ~/.bashrc ] && source ~/.bashrc ; trap exit SIGTERM ;')" ]
--- a/cli/README.md
+++ b/cli/README.md
@@ -1,507 +0,0 @@
-# Run training from the command line
-
-## Overview
-
-With these scripts and Dockerfile, you can train new wake words from the
-command line without using a Jupyter notebook.
-
-Differences between this Docker image and the Jupyter notebook image:
-
-* The Python training environment isn't included in the image.  Instead, a
-  "virtual environment" (venv) is created in the `/data` directory which you
-   will have mounted to a host directory. This cuts about 7gb from the image
-   and allows the virtualenv to persist across container instances.
-
-* The logic from the Jupyter notebook is contained in individual Python
-  and shell scripts
-
-* No ports need to be exposed since the Jupyter notebook server isn't being
-  run.
-
-## TL;DR
-
-For the impatient among you...
-
-```shell
-$ mkdir /some/work/directory  # On a device with more than 150GB free space
-$ docker build -t microwakeword-cli:latest .
-$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
-root@mww-cli:/# cd /data
-root@mww-cli:/data# setup_python_venv
-##### You have about 4 minutes to drink coffee
-
-root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
-##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
-
-root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
-##### You have about 30-45 minutes for a nap depending on available system resources.
-##### You'll be informed of where to find your trained model.
-```
-
-Load the trained model on your device and give it a try but don't be surprized
-if you get a lot of missed or false activations.  Read on to find out why.
-
-## Get Started
-
-Good, you stuck around!  Now read the rest of the document before doing
-anything.
-
-### Using a GPU
-
-Having an Nvidia GPU available can cut the training time by up to half.  The
-open-source nouveau driver shipped with Linux kernels doesn't support CUDA
-however so if you have an Nvidia GPU and want to use it for training, you'll
-need to install the official Nvidia driver from
-https://www.nvidia.com/en-in/drivers/unix/
-
-### Build the image
-
-You can use either Docker or Podman as your container management tool.
-`docker` is used in the examples but if you have podman, just substitute
-the command.
-
-Start by navigating to the directory that contains this README file and
-the accompanying Dockerfile.  Then...
-
-
-```shell
-docker build -t microwakeword-cli:latest .
-```
-
-This should be fairly quick and result in an image that's about 320mb in size
-as it's basically a standard Ubunbtu24.04 image with a few added tools.
-
-So why isn't a pre-built image available for download?  Because it'll probably
-take longer to download a pre-built image than for you to create it locally.
-GitHub's container registry is notoriously erratic when it comes to download
-throughput.
-
-### Create a host work directory
-
-This directory will contain the Python virtual environment plus all of the
-downloaded and generated data needed for training and the final trained
-models.  A full environment will need about 150gb of free space but read
-further to see how to reduce this.
-
-Your `<host_data_dir>` will be mounted inside the container as `/data`.
-
-The training container will start a Bash shell so if you have Bash
-aliases or Bashy things you like, create a `.bashrc` file in your
-`<host_data_dir>` and put them in there.  It'll automatically be included
-any time you enter the container.
-
-### Create and start the container
-
-There are lots of options that control container creation.  The simplest example
-will create the container and give you an interactive shell.  When you exit the
-shell, the container will be stopped and removed leaving your `<host_data_dir>`
-intact.
-
-```shell
-$ docker run -it --rm --gpus=all -v <host_work_directory>:/data microwakeword-cli:latest
-```
-
-Options:
-
-* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
-* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
-  around and give it a name for training more than one wake word.  You
-  can stop and remove it when you're ready.
-* Add a `-d` option to start the container in the background and use `docker
-  attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
-
-When the container starts, you'll see:
-
-```text
-=======================================================
-WARNING: A python virtual environment wasn't found
-at /data/.venv.  You'll need to run setup_python_venv
-before you'll be able to use this container for
-training.
-=======================================================
-root@mww-cli:/#
-```
-
-Don't worry about the python WARNING right now.  You'll be creating the
-virtualenv in the next step.
-
-If you've forgotton to create and/or mount your host data directory, you'll
-see an additional warning:
-
-```text
-=======================================================
-WARNING: The /data directory is NOT mounted.
-Running the training process without /data mounted
-could add over 140Gb of python packages and training
-files to this container's storage which is probably
-NOT what you want.
-
-You should remove this container and re-create it with
-a 'docker run' option like '-v <host_work_dir>:/data'
-making sure the host directory is on a device that has
-enough free space.
-=======================================================
-```
-
-You can certainly continue but it's a "really bad idea"™ because your
-container storage could grow from a few hundred mb to over 140gb.
-
-At this point, you're in a Bash shell.
-
-### Create the Python virtual environment
-
-The Python virtual environment will contain all the software needed to train.
-It gets created as `/data/.venv` and will take up about 11gb of disk space.
-
-The scripts that do all the work will be in the container's PATH so to setup
-the virtual environment and install all of the packages, just run:
-
-```text
-setup_python_venv [ --verbose ]
-
-Options:
-
--verbose: Print the detailed "pip install" output.
-
-```
-
-When the installation is finished, a test of the major components will be
-run.
-
-Once the process is done, you should change to the `/data` directory and
-activate the virtual environment with:
-
-```shell
-root@mww-cli:/# cd /data
-root@mww-cli:/data# source .venv/bin/activate
-(.venv) root@mww-cli:/data#
-```
-
-Technically, you don't need to do either of these since the scripts
-are in the PATH and they know to use the `/data` directory for everything.
-It's more of an "if you're interested" thing.
-
-At this point, you have a container with all software installed.
-
-## Get the reference data
-
-The training process itself relies on a significant amount of audio reference
-data that creates a simulated "audio environment" that your wake word will be
-trained in.  These "training datasets" include things like varying amounts of
-reverberation, background music, background conversations, background noise,
-etc.  All said and done, it amounts to about 30gb of audio but with the
-downloaded archives and extracted intermediate files, you'll need about 85gb
-of free space.  Thankfully, you only need to download the files once no
-matter how many wake words you want to train and since it's stored in
-`/data`,  you can even remove the docker container and recreate it without
-losing any of it.  There are 4 datasets that are required.
-
-This is a three stage process...
-
-1.  Download zipfiles or tarballs.    (about 30gb)
-2.  Extract them.                     (about 50gb)
-3.  Convert them into the final form. (about 31gb)
-
-NOTE: The sizes add up to more than the 85gb stated earlier because one
-of the datasets doesn't need to be covnerted and is counted in both
-steps 2 and 3.  You really do only need 85gb.
-
-To download the archives, unpack them, and convert the audio to what's needed
-by the training process, run:
-
-```text
-setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
-
-Options:
--cleanup-archives:           Automatically delete the tarballs or zipfiles after
-                              they've been extracted.
-
--cleanup-intermediate-files: Automatically delete the intermediate files
-                              after they've been converted.
-
-```
-
-On a 1gb/sec Internet connection, this will take about 25 minutes.
-
-The script detects if the datasets have already been downloaded, extracted
-and/or converted and skips those steps as appropriate so if you've run the
-script without the cleanup options, you can just run it again with those
-options to clean them up.
-
-Now you're ready to train a wake word.  Almost.
-
-## Train a Wake Word
-
-Training is done in 3 stages.
-
-1.  Generate thousands of samples of the wake word with various voices,
-pitches, speeds, inflections, etc.
-2.  Augment the samples with the training datasets to add background noise, etc.
-3.  Run the Tensorflow training.
-
-### Generate a sample for verification
-
-Before you start the full process, you're going to want to generate a single
-wake word sample and play it back to ensure it sounds right.  The wake word
-should be spelled phonetically to give the sample generator the best chance
-of success.
-
-```text
-root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
-===== Generating 1 sample of 'hey buster' =====
-      Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
-      Successfully loaded the model
-      Batch 1/0 complete
-      Done
-Sample available at /data/work/test_sample/hey_buster.wav
-Play it from your host.
-```
-
-You should then play that file from your host.  The reason I used "hey buster"
-as the wake word is to demonstrate why it's important to generate and listen
-to a sample.  If you try that exact input and play it back, you'll notice
-that the generator didn't capture the "er" at the end very well. To get it to
-do so, I had to add a period on the end as a "spacer".
-"hey buster." worked much better.
-
-When you're happy with the sample, you can run the full process.
-
-### Run the full training process
-
-```text
-train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
-                [ --training-steps=<steps> ] [ --cleanup-work-dir ]
-                <wake_word> [ <wake_word_title> ]
-
-Options:
--samples:            The number of samples to generate for the wake word.
-                      Default: 20000
-
--batch-size:         How many samples should be generated at a time.  The more
-                      samples, the more memory is needed.
-                      Default: 100
-
--training-steps:     Number of training steps.  More training steps means better
-                      detection and false positive rates but also more time to train.
-                      Default: 25000
-
--cleanup-work-dir:   Delete the /data/work directory after successful training.
-                      Default: false
-
-<wake_word>           The word to train spelled phonetically.
-                      Required.
-
-<wake_word_title>     An optional pretty name to save to the json metadata file.
-                      Default: The wake word with individual words capitalized
-                               and punctuation removed.
-
-```
-
-By default, the training process creates 20,000 samples of your wake word and
-runs 25,000 training steps.  See [Tensorboard Results](#tensorboard-results)
-in the [Extra Credit](#extra-credit) section below for
-why these are the defaults.  Depending on resources available, this could take
-between 30 and 60 minutes.
-
-The resulting tflite model files and logs will be placed in the
-`/data/output/<timestamp>-<wake_word>-<samples>-<training-steps>` directory
-and will therefore be available from your host in the directory you mapped
-`/data` to.  File names will have non-filename-friendly characters in your
-wake word changed to underscores to make things easier.  You'll need both the
-tflite and json files to load on your device. Exactly how you load them
-depends on the device and is beyond the scope of this project.
-
-The only real measure of success is how well the resulting model works
-on a real device.  If you encounter too many missed or false activations,
-increasing the number of samples would probably improve the results more
-than increasing the number of training steps.  See
-[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
-
-The output from the last step is filtered some by the script but still quite
-verbose. The full log will be available in the output directory as
-`training.log` if you're interested. Intepreting the log is beyond the scope
-of this project however.
-
-You can train additional wake words or change the number of samples and
-training steps by simply running `train_wake_word` again. No need to repeat
-any of the earlier setup steps.  If you change the wake word or the number of
-wake word samples, the work directory will be deleted and all 3 steps re-run.
-If you only change the number of training steps, the data from the first two
-steps is still valid and only the 3rd step is run.
-
-All of the intermediate data is stored in the `/data/work` directory which will
-grow to about 17gb with 20,000 wake word samples.  Once the tflite model is
-successfully generated and you're happy with the results, you can delete the
-`/data/work` directory.
-
-### Training more than one wake word
-
-Once you have a container running, you
-can easily train multiple wake words from your host:
-
-```shell
-for wp in "hey_alexa" "hey_jenkins" ; do
-  docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
-done
-```
-
-### Training time examples
-
-Training times depend on lots of things.  These are examples only.
-Your Mileage May Vary!!!
-
-```text
-===============================================================================
-                            Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores)  Memory: 64195 mb
-GPU: N/A
-
-                 Generate 10000 samples, 100/batch Elapsed time: 0:06:17
-                             Augment 10000 samples Elapsed time: 0:04:05
-                              10000 training steps Elapsed time: 0:15:04
-                              ==================================================
-                                             Total Elapsed time: 0:25:26
-================================================================================
-
-================================================================================
-                            Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores)  Memory: 64195 mb
-GPU: NVIDIA GeForce RTX 3060 (3584 cores)  Memory: 11909 mb
-
-                 Generate 10000 samples, 100/batch Elapsed time: 0:00:29
-                             Augment 10000 samples Elapsed time: 0:03:40
-                              10000 training steps Elapsed time: 0:08:00
-                          ======================================================
-                                             Total Elapsed time: 0:12:09
-================================================================================
-
-================================================================================
-                            Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores)  Memory: 64195 mb
-GPU: N/A
-
-                 Generate 20000 samples, 100/batch Elapsed time: 0:10:38
-                             Augment 20000 samples Elapsed time: 0:07:04
-                              25000 training steps Elapsed time: 0:25:21
-                          ======================================================
-                                             Total Elapsed time: 0:43:03
-================================================================================
-
-================================================================================
-                            Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores)  Memory: 64195 mb
-GPU: NVIDIA GeForce RTX 3060 (3584 cores)  Memory: 11909 mb
-
-                 Generate 20000 samples, 100/batch Elapsed time: 0:00:53
-                             Augment 20000 samples Elapsed time: 0:07:05
-                              25000 training steps Elapsed time: 0:19:13
-                          ======================================================
-                                             Total Elapsed time: 0:27:11
-================================================================================
-
-================================================================================
-                            Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores)  Memory: 64195 mb
-GPU: N/A
-
-                 Generate 50000 samples, 100/batch Elapsed time: 0:30:47
-                             Augment 50000 samples Elapsed time: 0:20:22
-                              40000 training steps Elapsed time: 1:01:51
-                              ==================================================
-                                             Total Elapsed time: 1:53:00
-================================================================================
-
-================================================================================
-                            Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores)  Memory: 64195 mb
-GPU: NVIDIA GeForce RTX 3060 (3584 cores)  Memory: 11909 mb
-
-                 Generate 50000 samples, 100/batch Elapsed time: 0:02:08
-                             Augment 50000 samples Elapsed time: 0:19:13
-                              40000 training steps Elapsed time: 0:42:23
-                          ======================================================
-                                             Total Elapsed time: 1:03:44
-================================================================================
-
-
-```
-
-The sample generation process is really the only one that uses multiple CPUs so
-having fewer CPU threads available will probably make little difference.
-
-## Extra Credit
-
-### Training defaults
-
-If you plan on training multiple wake words, you can set your own default
-training parameters by creating a `/data/.defaults.env` file with the
-following contents:
-
-```shell
-# Variable names follow the command line parameters converted to upper case
-# and with the dashes ('-') converted to underscores ('_').
-export SAMPLES=10000
-export TRAINING_STEPS=10000
-
-# Don't use the GPU for any operations.  Stick with the CPU only.
-##export CUDA_VISIBLE_DEVICES=-1
-
-```
-
-### Examine your model with Tensorboard
-
-Tensorboard is a web-based graphical model viewer.  You can use it to get an
-idea of how many training steps are needed before accuracy results stop
-improving.  To use it, you'll have to expose port 6006 by adding `-p
-6006:6006` to your `docker run` command line.  If you didn't, don't worry.
-Remember, the /data directory is mapped to a directory on your host so you
-can simply stop and delete the current container and recreate it with the new
-`docker run` command. No need to re-run any of the setup or training steps.
-
-To start Tensorboard, run:
-
-```shell
-root@mww-cli:/# cd /data
-root@mww-cli:/data# source .venv/bin/activate
-(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
-```
-
-Now on your host, point your browser at `http://localhost:6006/`,
-click "SCALARS" at the top and take a look at the various charts.  You'll see
-a "train" and "validation" item for each training run you've performed.  It's
-the "train" items you're interested in.
-
-<a id="tensorboard-results"></a>
-
-You have to be a Tensorflow expert to decipher most of the charts but
-the "Accuracy" chart for this particular wake word and 50,000 samples would
-seem to idicate that there's very little improvement after about 20,000
-training steps.
-
-![Accuracy Chart, 50000 samples](tensorboard1.png)
-
-In contrast, with only 5,000 wake word samples, there's still improvement to be had after
-20,000 training steps.
-
-![Accuracy Chart, 5000 samples](tensorboard2.png)
-
-Given that it's faster to generate wake word samples than it is to train,
-20,000 samples and 25,000 training steps seems like a good compromise.  This
-chart has a bit less smoothing to show a bit more detail and includes the
-50,000 sample run as well.  This run took only 27 minutes as opposed to the
-63 minutes it took for the 50,000 sample run.  Now you know why 20,000 and
-25,000 are the defaults for these scripts.
-
-![Accuracy Chart, 25000 samples](tensorboard3.png)
-
-
-
-
-
-
--- a/cli/requirements.txt
+++ b/cli/requirements.txt
@@ -1,10 +0,0 @@
-# --- Packages needed by our scripts ---
-
-numpy==1.26.4
-scipy==1.12.0
-librosa==0.10.2.post1
-soundfile==0.12.1
-tqdm==4.67.1
-scikit-learn==1.6.0
-numba==0.63.1
-PyYAML==6.0.3
--- a/cli/setup_python_venv
+++ b/cli/setup_python_venv
@@ -1,5 +1,6 @@
 #!/bin/bash
-PROGDIR="$(dirname $(realpath $0))"
+PROGDIR="$(dirname "$(realpath "$0")")"
+ROOTDIR="$(dirname "${PROGDIR}")"

 KNOWN_ARGS=( data-dir python gpu no-gpu )
 source "${PROGDIR}/shell.functions"
@@ -27,7 +28,7 @@ EOF
    exit 1
 fi

-[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath ${DATA_DIR})"
+[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath "${DATA_DIR}")"
 [ -d  "${DATA_DIR}" ] || {
    echo "Data directory '${DATA_DIR}' doesn't exist." >&2
    exit 1
@@ -52,7 +53,8 @@ if [ -n "${PYTHON}" ] ; then
    PYTHONS=( "${PYTHON}" )
    unset PYTHON
 else
-    PYTHONS=( python3.12 python3.10 )
+    # Add 3.11 as a common middle-ground (especially outside Ubuntu 24.04)
+    PYTHONS=( python3.12 python3.11 python3.10 )
 fi

 for p in "${PYTHONS[@]}" ; do
@@ -60,14 +62,14 @@ for p in "${PYTHONS[@]}" ; do
 done

 [ -n "${PYTHON}" ] || {
-    echo "A python 3.12 or 3.10 interpreter wasn't found.  You 'll need to install one before proceeding." >&2
+    echo "A python 3.12/3.11/3.10 interpreter wasn't found. You'll need to install one before proceeding." >&2
    exit 1
 }

-if [ -d  "${VENV}" ] ; then
+if [ -d "${VENV}" ] ; then
    if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then
        source "${VENV}/bin/activate" || {
-            echo "Unable to activate existing virtualenv '${VENV}'.  You should delete it and try again." >&2
+            echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
            exit 1
        }
    else
@@ -82,24 +84,28 @@ if [ -z "$VIRTUAL_ENV" ] ; then
 else
    echo "   ===== Updating virtualenv at '${VENV}' ====="
 fi
+
 ${PYTHON} -m venv --upgrade-deps "${VENV}"
 source "${VENV}/bin/activate"

 set -euo pipefail

-declare -a progfiles=( $(find ${PROGDIR} -mindepth 1 -maxdepth 1 -executable -type f) )
+# Symlink CLI scripts into .venv/bin
+declare -a progfiles=( $(find "${PROGDIR}" -mindepth 1 -maxdepth 1 -executable -type f) )
 progfiles+=( "${PROGDIR}/shell.functions" )

+# Also symlink the top-level entrypoint if present
+[ -x "${ROOTDIR}/train_wake_word" ] && progfiles+=( "${ROOTDIR}/train_wake_word" )
+
 for f in "${progfiles[@]}" ; do
-    ln -sfr "${f}" ".venv/bin/$(basename ${f})"
+    ln -sfr "${f}" ".venv/bin/$(basename "${f}")"
 done

 #
-# Pip doesn't process packages from requirements.txt in
-# order but order is important because tensorflow, torch,
-# onnxruntime and micro-wake-word all depend on CUDA packages
-# at various versions. They need to be installed in this specific
-# order or they may not be able to use the GPU.
+# Pip doesn't process packages from requirements.txt in order but order is
+# important because tensorflow, torch, onnxruntime and micro-wake-word all
+# depend on CUDA packages at various versions. They need to be installed in
+# this specific order or they may not be able to use the GPU.
 #
 export PIP_PROGRESS_BAR=off
 export PIP_NO_COLOR=1
@@ -117,7 +123,8 @@ pip_install() {
 START_TS=$EPOCHSECONDS

 echo "   ===== Installing common requirements ====="
-pip_install -r "${PROGDIR}/requirements.txt"
+# requirements.txt lives in repo root now
+pip_install -r "${ROOTDIR}/requirements.txt"

 ${GPU} && tfgpu='[and-cuda]' || tfgpu=""
 echo "   ===== Installing Tensorflow${tfgpu} ====="
@@ -140,7 +147,7 @@ pip_install -e "${MWW}"

 echo "   ===== Checking piper-sample-generator ====="
 PSG="${DATA_DIR}/tools/piper-sample-generator"
-if [ ! -d "${PSG}" ] || [ -n "$(git -C ${PSG} status --porcelain)" ] ; then
+if [ ! -d "${PSG}" ] || [ -n "$(git -C "${PSG}" status --porcelain)" ] ; then
    rm -rf "${PSG}" || :
    echo "   Cloning piper-sample-generator to ${DATA_DIR}/tools"
    git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null
@@ -171,13 +178,11 @@ echo "   ===== Installing keras ====="
 # keras 3.13 has "issues" so we need to back down to 3.12.
 pip_install "keras==3.12.0"

-${PROGDIR}/test_python --data-dir="${DATA_DIR}"
+"${PROGDIR}/test_python" --data-dir="${DATA_DIR}"

 touch .mww-data-dir
 END_TS=$EPOCHSECONDS

 echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."

-print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
-
-
+print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
--- a/cli/setup_training_datasets
+++ b/cli/setup_training_datasets
@@ -1,8 +1,9 @@
 #!/bin/bash
 set -euo pipefail

-PROGPATH=$(realpath "$0")
-PROGDIR=$(dirname "${PROGPATH}")
+PROGPATH="$(realpath "$0")"
+PROGDIR="$(dirname "${PROGPATH}")"
+ROOTDIR="$(dirname "${PROGDIR}")"  # repo root (train_wake_word, requirements.txt, etc.)

 KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files )
 source "${PROGDIR}/shell.functions"
@@ -27,22 +28,38 @@ EOF
    exit 1
 fi

+# Normalize + validate DATA_DIR (shell.functions typically sets a default,
+# but this makes the script standalone-safe)
+[ -n "${DATA_DIR:-}" ] && DATA_DIR="$(realpath "${DATA_DIR}")"
+[ -d "${DATA_DIR}" ] || {
+    echo "Data directory '${DATA_DIR}' doesn't exist." >&2
+    exit 1
+}
+
 cd "${DATA_DIR}"

 START_TS=$EPOCHSECONDS
 echo -e "\n===== Setting up Training Datasets =====\n"

-${PROGDIR}/setup_negative_datasets --cleanup-archives=${CLEANUP_ARCHIVES} \
-    --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
+"${PROGDIR}/setup_negative_datasets" \
+    --cleanup-archives="${CLEANUP_ARCHIVES}" \
+    --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
+    --data-dir="${DATA_DIR}"

-${PROGDIR}/setup_mit_audio --cleanup-archives=${CLEANUP_ARCHIVES} \
-    --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
+"${PROGDIR}/setup_mit_audio" \
+    --cleanup-archives="${CLEANUP_ARCHIVES}" \
+    --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
+    --data-dir="${DATA_DIR}"

-${PROGDIR}/setup_audioset --cleanup-archives=${CLEANUP_ARCHIVES} \
-    --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
+"${PROGDIR}/setup_audioset" \
+    --cleanup-archives="${CLEANUP_ARCHIVES}" \
+    --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
+    --data-dir="${DATA_DIR}"

-${PROGDIR}/setup_fma --cleanup-archives=${CLEANUP_ARCHIVES} \
-    --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
+"${PROGDIR}/setup_fma" \
+    --cleanup-archives="${CLEANUP_ARCHIVES}" \
+    --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
+    --data-dir="${DATA_DIR}"

-END_TS=$(date +%s.%N)
+END_TS=$EPOCHSECONDS
 print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup"
--- a/cli/tensorboard1.png
+++ b/cli/tensorboard1.png
--- a/cli/tensorboard2.png
+++ b/cli/tensorboard2.png
--- a/cli/tensorboard3.png
+++ b/cli/tensorboard3.png
--- a/cli/train_wake_word
+++ b/cli/train_wake_word
@@ -1,125 +0,0 @@
-#!/bin/bash
-set -e
-
-PROGPATH=$(realpath "$0")
-PROGDIR=$(dirname "${PROGPATH}")
-
-KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir )
-source "${PROGDIR}/shell.functions"
-WAKE_WORD=${POSITIONAL_ARGS[0]}
-
-if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
-    echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
-    HELP=true
-fi
-
-if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then
-    cat <<EOF >&2
-Usage: train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
-                       [ --training-steps=<steps> ] [ --cleanup-work-dir ]
-                       <wake_word> [ <wake_word_title> ]
-
-Options:
--samples:            The number of samples to generate for the wake word.
-                      Default: ${DEFAULT_SAMPLES}
-
--batch-size:         How many samples should be generated at a time.  The more
-                      samples per batch, the more memory is needed.
-                      Default: ${DEFAULT_BATCH_SIZE}
-
--training-steps:     Number of training steps.  More training steps means better
-                      detection and false positive rates but also more time to train.
-                      Default: ${DEFAULT_TRAINING_STEPS}
-
--cleanup-work-dir:   Delete the /data/work directory after successful training.
-                      Default: false
-
-<wake_word>           The word to train spelled phonetically.
-                      Required.
-
-<wake_word_title>     An optional pretty name to save to the json metadata file.
-                      Default: The wake word with individual words capitalized
-                               and punctuation removed.
-
-EOF
-    exit 1
-fi
-
-# shellcheck source=/dev/null
-source "${DATA_DIR}/.venv/bin/activate"
-
-cd "${DATA_DIR}"
-mkdir -p "${DATA_DIR}/work" || :
-
-[ ${#POSITIONAL_ARGS} -eq 2 ] && WAKE_WORD_TITLE="${POSITIONAL_ARGS[1]}" || :
-
-if [ ! -v WAKE_WORD_TITLE ] ; then
-    declare -a WWNA=( ${WAKE_WORD//[^a-zA-Z0-9]/ } )
-    WAKE_WORD_TITLE="${WWNA[*]^}"
-elif [ -z "$WAKE_WORD_TITLE" ] ; then
-    WAKE_WORD_TITLE="$WAKE_WORD"
-fi
-
-printf "%-80s\n" "=" | tr ' ' "="
-echo "===== Running '${WAKE_WORD}(${WAKE_WORD_TITLE})' generation, augmentation and training ====="
-"${PROGDIR}/cudainfo"
-echo
-START_TS=$EPOCHSECONDS
-
-export TF_CPP_MIN_LOG_LEVEL=9
-export TF_FORCE_GPU_ALLOW_GROWTH=true
-export TF_GPU_ALLOCATOR=cuda_malloc_async
-export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
-export NVIDIA_TF32_OVERRIDE=1
-export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
-export GLOG_minloglevel=2
-export GRPC_VERBOSITY=ERROR
-
-
-"${PROGDIR}/wake_word_sample_generator" \
-    --samples=${SAMPLES} \
-    --batch-size=${BATCH_SIZE} \
-    --data-dir="${DATA_DIR}" "${WAKE_WORD}"
-
-POST_GEN_TS=$EPOCHSECONDS
-
-ww="${WAKE_WORD// /_}"
-ww="${ww//./}"
-
-AUGMENT=false
-GENERATED_DIR="${DATA_DIR}/work/wake_word_samples"
-AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented"
-
-[ -d "${AUGMENTED_DIR}" ] || AUGMENT=true
-[ "${GENERATED_DIR}/0.wav" -nt "${AUGMENTED_DIR}/testing/wakeword_mmap/data.ninja" ] && AUGMENT=true || :
-
-if ${AUGMENT} ; then
-    rm -rf "${AUGMENTED_DIR}" || :
-    mkdir -p "${AUGMENTED_DIR}" || :
-    "${PROGDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" || { rm -rf "${AUGMENTED_DIR}" ; exit 1 ; }
-else
-    echo "Augmentation not required"
-    echo
-fi
-
-POST_AUGMENT_TS=$EPOCHSECONDS
-
-"${PROGDIR}/wake_word_sample_trainer" --samples=${SAMPLES} --training-steps=${TRAINING_STEPS} --data-dir="${DATA_DIR}" \
-        "${WAKE_WORD}" "${WAKE_WORD_TITLE}"
-
-if ${CLEANUP_WORK_DIR} ; then
-    rm -rf "${DATA_DIR}/work/trained_models" "${DATA_DIR}/work/wake_word_samples" \
-    "${DATA_DIR}/work/wake_word_samples_augmented" "${DATA_DIR}/work/last_wake_word" || :
-fi
-END_TS=$EPOCHSECONDS
-
-python -c $'print(f"{\'=\' * 80}")'
-printf "%44s\n\n" "Training Summary"
-"${PROGDIR}/system_summary"
-echo
-print_elapsed_time --no-separators "${START_TS}" "${POST_GEN_TS}" "Generate ${SAMPLES} samples, ${BATCH_SIZE}/batch"
-print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augment ${SAMPLES} samples"
-print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps"
-python -c $'msg="="*54 ; print(f"{msg:>80s}")'
-print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total"
-python -c $'print(f"{\'=\' * 80}")'
--- a/cli/wake_word_sample_augmenter
+++ b/cli/wake_word_sample_augmenter
--- a/cli/wake_word_sample_trainer
+++ b/cli/wake_word_sample_trainer