diff --git a/cli/.bashrc b/cli/.bashrc
new file mode 100644
index 0000000..922c4c1
--- /dev/null
+++ b/cli/.bashrc
@@ -0,0 +1,135 @@
+# ~/.bashrc: executed by bash(1) for non-login shells.
+# see /usr/share/doc/bash/examples/startup-files (in the package bash-doc)
+# for examples
+
+# If not running interactively, don't do anything
+[ -z "$PS1" ] && return
+
+# don't put duplicate lines in the history. See bash(1) for more options
+# ... or force ignoredups and ignorespace
+HISTCONTROL=ignoredups:ignorespace
+
+# append to the history file, don't overwrite it
+shopt -s histappend
+
+# for setting history length see HISTSIZE and HISTFILESIZE in bash(1)
+HISTSIZE=1000
+HISTFILESIZE=2000
+
+# check the window size after each command and, if necessary,
+# update the values of LINES and COLUMNS.
+shopt -s checkwinsize
+
+# make less more friendly for non-text input files, see lesspipe(1)
+[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)"
+
+# set variable identifying the chroot you work in (used in the prompt below)
+if [ -z "$debian_chroot" ] && [ -r /etc/debian_chroot ]; then
+    debian_chroot=$(cat /etc/debian_chroot)
+fi
+
+# set a fancy prompt (non-color, unless we know we "want" color)
+case "$TERM" in
+    xterm-color) color_prompt=yes;;
+esac
+
+# uncomment for a colored prompt, if the terminal has the capability; turned
+# off by default to not distract the user: the focus in a terminal window
+# should be on the output of commands, not on the prompt
+#force_color_prompt=yes
+
+if [ -n "$force_color_prompt" ]; then
+    if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then
+        # We have color support; assume it's compliant with Ecma-48
+        # (ISO/IEC-6429). (Lack of such support is extremely rare, and such
+        # a case would tend to support setf rather than setaf.)
+        color_prompt=yes
+    else
+        color_prompt=
+    fi
+fi
+
+if [ "$color_prompt" = yes ]; then
+    PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
+else
+    PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
+fi
+unset color_prompt force_color_prompt
+
+# If this is an xterm set the title to user@host:dir
+case "$TERM" in
+xterm*|rxvt*)
+    PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
+    ;;
+*)
+    ;;
+esac
+
+# enable color support of ls and also add handy aliases
+if [ -x /usr/bin/dircolors ]; then
+    test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
+    alias ls='ls --color=auto'
+    #alias dir='dir --color=auto'
+    #alias vdir='vdir --color=auto'
+
+    alias grep='grep --color=auto'
+    alias fgrep='fgrep --color=auto'
+    alias egrep='egrep --color=auto'
+fi
+
+# some more ls aliases
+alias ll='ls -alF'
+alias la='ls -A'
+alias l='ls -CF'
+
+# Alias definitions.
+# You may want to put all your additions into a separate file like
+# ~/.bash_aliases, instead of adding them here directly.
+# See /usr/share/doc/bash-doc/examples in the bash-doc package.
+
+if [ -f ~/.bash_aliases ]; then
+    . ~/.bash_aliases
+fi
+
+# enable programmable completion features (you don't need to enable
+# this, if it's already enabled in /etc/bash.bashrc and /etc/profile
+# sources /etc/bash.bashrc).
+#if [ -f /etc/bash_completion ] && ! shopt -oq posix; then
+#    . /etc/bash_completion
+#fi
+
+if [ -f /data/.bashrc ]; then   # user customisations kept on the /data volume
+    . /data/.bashrc
+fi
+
+if ! mountpoint -q /data ; then   # warn when nothing is mounted on /data
+    cat <<-EOF >&2
+=======================================================
+WARNING: The /data directory is NOT mounted.
+Running the training process without /data mounted
+could add over 140Gb of python packages and training
+files to this container's storage which is probably
+NOT what you want.
+
+You should remove this container and re-create it with
+a 'docker run' option like '-v <host_work_directory>:/data'
+making sure the host directory is on a device that has
+enough free space.
+=======================================================
+EOF
+fi
+# Test for the activate script itself so a partially created venv gets the warning below instead of a failed source.
+if [ -f /data/.venv/bin/activate ]; then
+    . /data/.venv/bin/activate
+else
+    cat <<-EOF >&2
+=======================================================
+WARNING: A python virtual environment wasn't found
+at /data/.venv. You'll need to run 'setup_python_venv'
+before you'll be able to use this container for
+training.
+=======================================================
+EOF
+
+fi
+alias venv='if [ -f /data/.venv/bin/activate ]; then source /data/.venv/bin/activate; else echo "/data/.venv does not exist yet"; fi'
diff --git a/cli/Dockerfile b/cli/Dockerfile
new file mode 100644
index 0000000..c460d93
--- /dev/null
+++ b/cli/Dockerfile
@@ -0,0 +1,27 @@
+# Since this is a pure python environment, we don't need to start
+# with a huge CUDA image. A standard Ubuntu image will do.
+FROM ubuntu:24.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_ROOT_USER_ACTION=ignore \
+    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
+    PATH="/root/mww-scripts:${PATH}"
+
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
+    git wget curl unzip ca-certificates nano less \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /data
+
+COPY --chown=root:root --chmod=0755 .bashrc /root/
+COPY --chown=root:root --chmod=0755 setup_* wake_word_sample* train_wake_word \
+    test_python cudainfo system_summary shell.functions requirements.txt /root/mww-scripts/
+
+# Docker and Podman send the CMD a SIGTERM when you "stop" the container. Unfortunately, bash
+# normally doesn't exit when it receives a SIGTERM so docker/podman has to wait for the "stop"
+# to timeout then SIGKILL the container.
+# This little scriptlet causes bash to exit immediately when it receives the SIGTERM.
+CMD ["/usr/bin/bash", "-c", "exec /usr/bin/bash --rcfile <(echo '[ -f ~/.bashrc ] && source ~/.bashrc ; trap exit SIGTERM ;')" ]
diff --git a/cli/README.md b/cli/README.md
new file mode 100644
index 0000000..359d73d
--- /dev/null
+++ b/cli/README.md
@@ -0,0 +1,507 @@
+# Run training from the command line
+
+## Overview
+
+With these scripts and Dockerfile, you can train new wake words from the
+command line without using a Jupyter notebook.
+
+Differences between this Docker image and the Jupyter notebook image:
+
+* The Python training environment isn't included in the image. Instead, a
+ "virtual environment" (venv) is created in the `/data` directory which you
+ will have mounted to a host directory. This cuts about 7gb from the image
+ and allows the virtualenv to persist across container instances.
+
+* The logic from the Jupyter notebook is contained in individual Python
+ and shell scripts
+
+* No ports need to be exposed since the Jupyter notebook server isn't being
+ run.
+
+## TL;DR
+
+For the impatient among you...
+
+```shell
+$ mkdir /some/work/directory # On a device with more than 150GB free space
+$ docker build -t microwakeword-cli:latest .
+$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
+root@mww-cli:/# cd /data
+root@mww-cli:/data# setup_python_venv
+##### You have about 4 minutes to drink coffee
+
+root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
+##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
+
+root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
+##### You have about 30-45 minutes for a nap depending on available system resources.
+##### You'll be informed of where to find your trained model.
+```
+
+Load the trained model on your device and give it a try but don't be surprised
+if you get a lot of missed or false activations. Read on to find out why.
+
+## Get Started
+
+Good, you stuck around! Now read the rest of the document before doing
+anything.
+
+### Using a GPU
+
+Having an Nvidia GPU available can cut the training time by up to half. The
+open-source nouveau driver shipped with Linux kernels doesn't support CUDA
+however so if you have an Nvidia GPU and want to use it for training, you'll
+need to install the official Nvidia driver from
+https://www.nvidia.com/en-in/drivers/unix/
+
+### Build the image
+
+You can use either Docker or Podman as your container management tool.
+`docker` is used in the examples but if you have podman, just substitute
+the command.
+
+Start by navigating to the directory that contains this README file and
+the accompanying Dockerfile. Then...
+
+
+```shell
+docker build -t microwakeword-cli:latest .
+```
+
+This should be fairly quick and result in an image that's about 320mb in size
+as it's basically a standard Ubuntu 24.04 image with a few added tools.
+
+So why isn't a pre-built image available for download? Because it'll probably
+take longer to download a pre-built image than for you to create it locally.
+GitHub's container registry is notoriously erratic when it comes to download
+throughput.
+
+### Create a host work directory
+
+This directory will contain the Python virtual environment plus all of the
+downloaded and generated data needed for training and the final trained
+models. A full environment will need about 150gb of free space but read
+further to see how to reduce this.
+
+Your `<host_work_directory>` will be mounted inside the container as `/data`.
+
+The training container will start a Bash shell so if you have Bash
+aliases or Bashy things you like, create a `.bashrc` file in your
+`<host_work_directory>` and put them in there. It'll automatically be included
+any time you enter the container.
+
+### Create and start the container
+
+There are lots of options that control container creation. The simplest example
+will create the container and give you an interactive shell. When you exit the
+shell, the container will be stopped and removed leaving your `<host_work_directory>`
+intact.
+
+```shell
+$ docker run -it --rm --gpus=all -v <host_work_directory>:/data microwakeword-cli:latest
+```
+
+Options:
+
+* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
+* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
+ around and give it a name for training more than one wake word. You
+ can stop and remove it when you're ready.
+* Add a `-d` option to start the container in the background and use `docker
+ attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
+
+When the container starts, you'll see:
+
+```text
+=======================================================
+WARNING: A python virtual environment wasn't found
+at /data/.venv. You'll need to run 'setup_python_venv'
+before you'll be able to use this container for
+training.
+=======================================================
+root@mww-cli:/#
+```
+
+Don't worry about the python WARNING right now. You'll be creating the
+virtualenv in the next step.
+
+If you've forgotten to create and/or mount your host data directory, you'll
+see an additional warning:
+
+```text
+=======================================================
+WARNING: The /data directory is NOT mounted.
+Running the training process without /data mounted
+could add over 140Gb of python packages and training
+files to this container's storage which is probably
+NOT what you want.
+
+You should remove this container and re-create it with
+a 'docker run' option like '-v <host_work_directory>:/data'
+making sure the host directory is on a device that has
+enough free space.
+======================================================= +``` + +You can certainly continue but it's a "really bad idea"™ because your +container storage could grow from a few hundred mb to over 140gb. + +At this point, you're in a Bash shell. + +### Create the Python virtual environment + +The Python virtual environment will contain all the software needed to train. +It gets created as `/data/.venv` and will take up about 11gb of disk space. + +The scripts that do all the work will be in the container's PATH so to setup +the virtual environment and install all of the packages, just run: + +```text +setup_python_venv [ --verbose ] + +Options: + +--verbose: Print the detailed "pip install" output. + +``` + +When the installation is finished, a test of the major components will be +run. + +Once the process is done, you should change to the `/data` directory and +activate the virtual environment with: + +```shell +root@mww-cli:/# cd /data +root@mww-cli:/data# source .venv/bin/activate +(.venv) root@mww-cli:/data# +``` + +Technically, you don't need to do either of these since the scripts +are in the PATH and they know to use the `/data` directory for everything. +It's more of an "if you're interested" thing. + +At this point, you have a container with all software installed. + +## Get the reference data + +The training process itself relies on a significant amount of audio reference +data that creates a simulated "audio environment" that your wake word will be +trained in. These "training datasets" include things like varying amounts of +reverberation, background music, background conversations, background noise, +etc. All said and done, it amounts to about 30gb of audio but with the +downloaded archives and extracted intermediate files, you'll need about 85gb +of free space. 
Thankfully, you only need to download the files once no
+matter how many wake words you want to train and since it's stored in
+`/data`, you can even remove the docker container and recreate it without
+losing any of it. There are 4 datasets that are required.
+
+This is a three stage process...
+
+1. Download zipfiles or tarballs. (about 30gb)
+2. Extract them. (about 50gb)
+3. Convert them into the final form. (about 31gb)
+
+NOTE: The sizes add up to more than the 85gb stated earlier because one
+of the datasets doesn't need to be converted and is counted in both
+steps 2 and 3. You really do only need 85gb.
+
+To download the archives, unpack them, and convert the audio to what's needed
+by the training process, run:
+
+```text
+setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
+
+Options:
+--cleanup-archives: Automatically delete the tarballs or zipfiles after
+ they've been extracted.
+
+--cleanup-intermediate-files: Automatically delete the intermediate files
+ after they've been converted.
+
+```
+
+On a 1gb/sec Internet connection, this will take about 25 minutes.
+
+The script detects if the datasets have already been downloaded, extracted
+and/or converted and skips those steps as appropriate so if you've run the
+script without the cleanup options, you can just run it again with those
+options to clean them up.
+
+Now you're ready to train a wake word. Almost.
+
+## Train a Wake Word
+
+Training is done in 3 stages.
+
+1. Generate thousands of samples of the wake word with various voices,
+pitches, speeds, inflections, etc.
+2. Augment the samples with the training datasets to add background noise, etc.
+3. Run the Tensorflow training.
+
+### Generate a sample for verification
+
+Before you start the full process, you're going to want to generate a single
+wake word sample and play it back to ensure it sounds right. The wake word
+should be spelled phonetically to give the sample generator the best chance
+of success.
+
+```text
+root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
+===== Generating 1 sample of 'hey buster' =====
+ Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
+ Successfully loaded the model
+ Batch 1/0 complete
+ Done
+Sample available at /data/work/test_sample/hey_buster.wav
+Play it from your host.
+```
+
+You should then play that file from your host. The reason I used "hey buster"
+as the wake word is to demonstrate why it's important to generate and listen
+to a sample. If you try that exact input and play it back, you'll notice
+that the generator didn't capture the "er" at the end very well. To get it to
+do so, I had to add a period on the end as a "spacer".
+"hey buster." worked much better.
+
+When you're happy with the sample, you can run the full process.
+
+### Run the full training process
+
+```text
+train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
+ [ --training-steps=<training_steps> ] [ --cleanup-work-dir ]
+ <wake_word> [ <wake_word_title> ]
+
+Options:
+--samples: The number of samples to generate for the wake word.
+ Default: 20000
+
+--batch-size: How many samples should be generated at a time. The more
+ samples, the more memory is needed.
+ Default: 100
+
+--training-steps: Number of training steps. More training steps means better
+ detection and false positive rates but also more time to train.
+ Default: 25000
+
+--cleanup-work-dir: Delete the /data/work directory after successful training.
+ Default: false
+
+<wake_word> The word to train spelled phonetically.
+ Required.
+
+<wake_word_title> An optional pretty name to save to the json metadata file.
+ Default: The wake word with individual words capitalized
+ and punctuation removed.
+
+```
+
+By default, the training process creates 20,000 samples of your wake word and
+runs 25,000 training steps. See [Tensorboard Results](#examine-your-model-with-tensorboard)
+in the [Extra Credit](#extra-credit) section below for
+why these are the defaults. Depending on resources available, this could take
+between 30 and 60 minutes.
+
+The resulting tflite model files and logs will be placed in the
+`/data/output/---` directory
+and will therefore be available from your host in the directory you mapped
+`/data` to. File names will have non-filename-friendly characters in your
+wake word changed to underscores to make things easier. You'll need both the
+tflite and json files to load on your device. Exactly how you load them
+depends on the device and is beyond the scope of this project.
+
+The only real measure of success is how well the resulting model works
+on a real device. If you encounter too many missed or false activations,
+increasing the number of samples would probably improve the results more
+than increasing the number of training steps. See
+[Tensorboard Results](#examine-your-model-with-tensorboard) in the [Extra Credit](#extra-credit) section below.
+
+The output from the last step is filtered some by the script but still quite
+verbose. The full log will be available in the output directory as
+`training.log` if you're interested. Interpreting the log is beyond the scope
+of this project however.
+
+You can train additional wake words or change the number of samples and
+training steps by simply running `train_wake_word` again. No need to repeat
+any of the earlier setup steps. If you change the wake word or the number of
+wake word samples, the work directory will be deleted and all 3 steps re-run.
+If you only change the number of training steps, the data from the first two
+steps is still valid and only the 3rd step is run.
+
+All of the intermediate data is stored in the `/data/work` directory which will
+grow to about 17gb with 20,000 wake word samples. Once the tflite model is
+successfully generated and you're happy with the results, you can delete the
+`/data/work` directory.
+ +### Training more than one wake word + +Once you have a container running, you +can easily train multiple wake words from your host: + +```shell +for wp in "hey_alexa" "hey_jenkins" ; do + docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp" +done +``` + +### Training time examples + +Training times depend on lots of things. These are examples only. +Your Mileage May Vary!!! + +```text +=============================================================================== + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: N/A + + Generate 10000 samples, 100/batch Elapsed time: 0:06:17 + Augment 10000 samples Elapsed time: 0:04:05 + 10000 training steps Elapsed time: 0:15:04 + ================================================== + Total Elapsed time: 0:25:26 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb + + Generate 10000 samples, 100/batch Elapsed time: 0:00:29 + Augment 10000 samples Elapsed time: 0:03:40 + 10000 training steps Elapsed time: 0:08:00 + ====================================================== + Total Elapsed time: 0:12:09 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: N/A + + Generate 20000 samples, 100/batch Elapsed time: 0:10:38 + Augment 20000 samples Elapsed time: 0:07:04 + 25000 training steps Elapsed time: 0:25:21 + ====================================================== + Total Elapsed time: 0:43:03 +================================================================================ + 
+================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb + + Generate 20000 samples, 100/batch Elapsed time: 0:00:53 + Augment 20000 samples Elapsed time: 0:07:05 + 25000 training steps Elapsed time: 0:19:13 + ====================================================== + Total Elapsed time: 0:27:11 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: N/A + + Generate 50000 samples, 100/batch Elapsed time: 0:30:47 + Augment 50000 samples Elapsed time: 0:20:22 + 40000 training steps Elapsed time: 1:01:51 + ================================================== + Total Elapsed time: 1:53:00 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb + + Generate 50000 samples, 100/batch Elapsed time: 0:02:08 + Augment 50000 samples Elapsed time: 0:19:13 + 40000 training steps Elapsed time: 0:42:23 + ====================================================== + Total Elapsed time: 1:03:44 +================================================================================ + + +``` + +The sample generation process is really the only one that uses multiple CPUs so +having fewer CPU threads available will probably make little difference. 
+
+## Extra Credit
+
+### Training defaults
+
+If you plan on training multiple wake words, you can set your own default
+training parameters by creating a `/data/.defaults.env` file with the
+following contents:
+
+```shell
+# Variable names follow the command line parameters converted to upper case
+# and with the dashes ('-') converted to underscores ('_').
+export SAMPLES=10000
+export TRAINING_STEPS=10000
+
+# Don't use the GPU for any operations. Stick with the CPU only.
+##export CUDA_VISIBLE_DEVICES=-1
+
+```
+
+### Examine your model with Tensorboard
+
+Tensorboard is a web-based graphical model viewer. You can use it to get an
+idea of how many training steps are needed before accuracy results stop
+improving. To use it, you'll have to expose port 6006 by adding `-p
+6006:6006` to your `docker run` command line. If you didn't, don't worry.
+Remember, the /data directory is mapped to a directory on your host so you
+can simply stop and delete the current container and recreate it with the new
+`docker run` command. No need to re-run any of the setup or training steps.
+
+To start Tensorboard, run:
+
+```shell
+root@mww-cli:/# cd /data
+root@mww-cli:/data# source .venv/bin/activate
+(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
+```
+
+Now on your host, point your browser at `http://localhost:6006/`,
+click "SCALARS" at the top and take a look at the various charts. You'll see
+a "train" and "validation" item for each training run you've performed. It's
+the "train" items you're interested in.
+
+
+
+You have to be a Tensorflow expert to decipher most of the charts but
+the "Accuracy" chart for this particular wake word and 50,000 samples would
+seem to indicate that there's very little improvement after about 20,000
+training steps.
+
+![Accuracy Chart, 50000 samples](tensorboard1.png)
+
+In contrast, with only 5,000 wake word samples, there's still improvement to be had after
+20,000 training steps.
+
+![Accuracy Chart, 5000 samples](tensorboard2.png)
+
+Given that it's faster to generate wake word samples than it is to train,
+20,000 samples and 25,000 training steps seems like a good compromise. This
+chart has a bit less smoothing to show a bit more detail and includes the
+50,000 sample run as well. This run took only 27 minutes as opposed to the
+63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
+25,000 are the defaults for these scripts.
+
+![Accuracy Chart, 25000 samples](tensorboard3.png)
+
+
+
+
+
+
diff --git a/cli/cudainfo b/cli/cudainfo
new file mode 100755
index 0000000..5c3164b
--- /dev/null
+++ b/cli/cudainfo
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+"""Print a summary of the current CUDA device as seen by numba.
+
+Prints a fixed "not available" line and exits 0 when no /dev/nvidia[0-9]
+device node exists; prints the same line when numba cannot use CUDA.
+"""
+
+import sys, glob
+
+# Cheap pre-check so numba is only imported when a device node is present.
+devices = glob.glob("/dev/nvidia[0-9]")
+if not devices:
+    print("CUDA not available or no CUDA-capable GPU found.")
+    sys.exit(0)
+
+
+# CUDA cores per streaming multiprocessor keyed by (major, minor) compute
+# capability. Capabilities not listed here are reported as "unknown".
+cc_cores_per_SM_dict = {
+    (2,0) : 32,
+    (2,1) : 48,
+    (3,0) : 192,
+    (3,5) : 192,
+    (3,7) : 192,
+    (5,0) : 128,
+    (5,2) : 128,
+    (6,0) : 64,
+    (6,1) : 128,
+    (7,0) : 64,
+    (7,5) : 64,
+    (8,0) : 64,
+    (8,6) : 128,
+    (8,9) : 128,
+    (9,0) : 128,
+    (10,0) : 128,
+    (12,0) : 128
+    }
+
+try:
+    from numba import cuda
+    device = cuda.get_current_device()
+    ctx = cuda.current_context()
+    meminfo = ctx.get_memory_info()
+    compute_capability = device.compute_capability
+    sms = device.MULTIPROCESSOR_COUNT
+    cores_per_sm = cc_cores_per_SM_dict.get(compute_capability)
+    if not cores_per_sm:
+        cores_per_sm = "unknown"
+        total_cores = "unknown"
+    else:
+        total_cores = cores_per_sm * sms
+
+    print(f" GPU Name: {device.name if type(device.name) is str else device.name.decode()}")
+    print(f" Compute Capability: {'.'.join(list(map(str, compute_capability))):>7}")
+    print(f"Streaming Multiprocessors: {sms:>7}")
+    print(f" CUDA Cores per SM: {cores_per_sm:>7}")
+    print(f" Total CUDA Cores: {total_cores:>7}")
+    print(f" Total Memory: {meminfo.total / 1024 / 1024:>7.0f} mb")
+    print(f" Free Memory: {meminfo.free / 1024 / 1024:>7.0f} mb")
+except Exception as e:
+    # stdout stays unchanged; the underlying reason goes to stderr.
+    print("CUDA not available or no CUDA-capable GPU found.")
+    print(f"cudainfo: {type(e).__name__}: {e}", file=sys.stderr)
diff --git a/cli/requirements.txt b/cli/requirements.txt
new file mode 100644
index 0000000..a0e801b
--- /dev/null
+++ b/cli/requirements.txt
@@ -0,0 +1,10 @@
+# --- Packages needed by our scripts ---
+
+numpy==1.26.4
+scipy==1.12.0
+librosa==0.10.2.post1
+soundfile==0.12.1
+tqdm==4.67.1
+scikit-learn==1.6.0
+numba==0.63.1
+PyYAML==6.0.3
diff --git a/cli/setup_audioset b/cli/setup_audioset
new file mode 100755
index 0000000..d92552d
--- /dev/null
+++ b/cli/setup_audioset
@@ -0,0 +1,175 @@
+#!/bin/bash
+set -euo pipefail
+
+PROGPATH=$(realpath "$0")
+PROGDIR=$(dirname "${PROGPATH}")
+
+source "${PROGDIR}/shell.functions"
+
+if [ "${HELP}" == "true" ] ; then
+    cat <<EOF >&2
+Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]
+
+    --cleanup-archives : Automatically clean up any downloaded archives after
+                         extraction.
+    --cleanup-intermediate-files
+                       : Automatically clean up the intermediate files after they've been
+                       : converted to 16k.
+    <data_dir>         : Path to the data directory.
+                       : Default: ${DATA_DIR}
+
+EOF
+    exit 1
+fi
+
+mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
+cd "${DATA_DIR}/training_datasets"
+
+echo "***** Checking audioset *****"
+
+AUDIO_URL="https://huggingface.co/datasets/agkphysics/AudioSet/resolve"
+AUDIO_DIR="./audioset"
+mkdir -p "${AUDIO_DIR}"
+AUDIO16K_DIR="./audioset_16k"
+mkdir -p "${AUDIO16K_DIR}"
+AUDIO_FILECOUNT="./downloads/audioset_filecount"
+AUDIO_IN_GLOB="*.flac"
+
+declare -A filecounts
+for i in {0..9} ; do
+    fname="bal_train0${i}.tar"
+    filecounts[${fname}]=0
+done
+
+get_filecounts filecounts "${AUDIO_FILECOUNT}"
+
+# Dataset revisions and in-repo tarball path prefixes that find_rev probes, in order.
+REV_CANDIDATES=(
+    "6762f044d1c88619c7f2006486036192128fb07e"
+    "0049167e89f259a010c3f070fe3666d9e5242836"
+    "ceb9eaaa7844c9ad7351e659c84a572e376ad06d"
+    "main"
+)
+
+TAR_PATTERNS=(
+    "data/bal_train0"
+    "data/bal_train/bal_train0"
+)
+# Print the first "rev,pattern" pair whose tarball 0 answers a HEAD request; print an empty line when none does.
+find_rev() {
+    for rev in "${REV_CANDIDATES[@]}" ; do
+        for pattern in "${TAR_PATTERNS[@]}" ; do
+            local url="${AUDIO_URL}/${rev}/${pattern}0.tar"
+            curl -I -L --fail -s "${url}" > /dev/null && { echo "${rev},${pattern}" ; return 0 ; }  # stop at the first hit so rev and pattern always belong together
+        done
+    done
+    echo ""
+}
+# Convert every FLAC under AUDIO_DIR to a 16 kHz mono WAV in AUDIO16K_DIR; failures are logged one per line.
+converter() {
+    source "${DATA_DIR}/.venv/bin/activate"
+    python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<-'EOF'
+import os, sys, subprocess, scipy.io.wavfile, numpy as np
+from pathlib import Path
+import soundfile as sf
+import librosa
+from tqdm import tqdm
+
+def write_wav(dst: Path, data: np.ndarray, sr: int):
+    x = np.clip(data, -1.0, 1.0)
+    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))
+
+audioset_dir = Path(sys.argv[1])
+audioset_out = Path(sys.argv[2])
+
+# convert FLAC → 16k mono WAV
+flacs = list(audioset_dir.rglob("*.flac"))
+print(f" FLAC files: {len(flacs)}")
+audioset_bad = []
+ok = 0
+for p in tqdm(flacs, desc=" AudioSet→WAV (resample 16k mono)"):
+    try:
+        outfile = Path(audioset_out / (p.stem + ".wav"))
+        if outfile.exists():
+            continue
+        y, _ = librosa.load(p, sr=16000, mono=True)
+        if y.size == 0:
+            raise ValueError("empty audio")
+        write_wav(outfile, y, 16000)
+        ok += 1
+    except Exception as e:
+        audioset_bad.append(f"{p}:{e}".replace("\n", " "))  # keep one log line per failed file
+
+if audioset_bad:
+    (audioset_out / "audioset_corrupted_files.log").write_text("\n".join(audioset_bad))
+print(f" AudioSet complete ({ok} ok, {len(audioset_bad)} failed)")
+EOF
+}
+
+expected_filecount=$(get_total_filecount filecounts)
+actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
+write_filecount=false
+
+if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
+    echo " Existing Audioset valid"
+else
+    dl=$(find_rev)
+    [ -n "$dl" ] || { echo " Could not locate an AudioSet revision with FLAC tarballs still present on HF." ; exit 1 ; }
+    rev=${dl%%,*}
+    pattern=${dl##*,}
+    echo " Checking 10 tarballs"
+    for i in {0..9} ; do
+        fname="downloads/bal_train0${i}.tar"
+        if [ ! -f "${fname}" ] ; then
+            echo " Downloading bal_train0${i}.tar"
+            url="${AUDIO_URL}/${rev}/${pattern}${i}.tar"
+            curl -L -s --fail "${url}" -o "${fname}.part" && mv "${fname}.part" "${fname}" || { rm -f "${fname}.part" ; echo "Could not fetch ${fname} at rev ${rev}; continuing." ; continue ; }  # .part keeps an interrupted download from being mistaken for a complete tarball
+        fi
+
+        tarball_filecount=$(tar -tvf "${fname}" | wc -l )
+        filecounts["bal_train0${i}.tar"]=${tarball_filecount}
+        write_filecount=true
+
+        echo " Untarring bal_train0${i}.tar"
+        tar -xf "${fname}" -C "${AUDIO_DIR}"
+        if "${CLEANUP_ARCHIVES}" && [ -f "${fname}" ] ; then
+            echo " Cleaning up bal_train0${i}.tar"
+            rm -rf "${fname}"
+        fi
+    done
+    rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
+    converter
+    if [ -f "${AUDIO16K_DIR}/audioset_corrupted_files.log" ] ; then
+        failed=$(grep -c '' "${AUDIO16K_DIR}/audioset_corrupted_files.log") || :  # the log has no trailing newline, so wc -l would report one failure too few
+        filecounts[failed]=-${failed}
+    fi
+    expected_filecount=$(get_total_filecount filecounts)
+    actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
+    if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
+        echo " Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
+        exit 1
+    fi
+fi
+
+if ${write_filecount} ; then
+    write_filecounts filecounts "${AUDIO_FILECOUNT}"
+fi
+
+if "${CLEANUP_ARCHIVES}" ; then
+    for i in {0..9} ; do
+        fname="downloads/bal_train0${i}.tar"
+        if [ -f "${fname}" ] ; then
+            echo " Cleaning up bal_train0${i}.tar"
+            rm -rf "${fname}"
+        fi
+    done
+fi
+
+if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
+    echo " Cleaning up ${AUDIO_DIR}"
+    rm -rf "${AUDIO_DIR}"
+fi
+
+echo " Audioset complete"
+exit 0
+
diff --git a/cli/setup_fma b/cli/setup_fma
new file mode 100755
index 0000000..fe7f090
--- /dev/null
+++ b/cli/setup_fma
@@ -0,0 +1,131 @@
+#!/bin/bash
+set -euo pipefail
+
+PROGPATH=$(realpath "$0")
+PROGDIR=$(dirname "${PROGPATH}")
+
+source "${PROGDIR}/shell.functions"
+
+if [ "${HELP}" == "true" ] ; then
+    cat <<EOF >&2
+Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]
+
+    --cleanup-archives : Automatically clean up any downloaded archives after
+                         extraction.
+    --cleanup-intermediate-files
+                       : Automatically clean up the intermediate files after they've been
+                       : converted to 16k.
#######################################
# Convert every MP3 found under AUDIO_DIR into a 16 kHz mono 16-bit WAV
# written to AUDIO16K_DIR, using the Python packages of the data-dir venv.
# Globals:
#   DATA_DIR      (read) directory that contains .venv
#   AUDIO_DIR     (read) handed to the Python script as sys.argv[1]
#   AUDIO16K_DIR  (read) handed to the Python script as sys.argv[2]
# Outputs:
#   Progress on stdout. When at least one file fails, the Python script
#   writes "<path>:<error>" lines to AUDIO16K_DIR/fma_corrupted_files.log.
# Returns:
#   1 when the virtualenv cannot be activated, otherwise python's status.
#######################################
converter() {
  # Quoted so a DATA_DIR containing whitespace or glob characters still
  # resolves to a single path; bail out instead of running whichever
  # "python" happens to be on PATH when activation fails.
  # shellcheck source=/dev/null
  source "${DATA_DIR}/.venv/bin/activate" || return 1

  # The delimiter is quoted so the shell passes the Python text through
  # untouched (no parameter or command substitution inside the script).
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import os, sys, subprocess, scipy.io.wavfile, numpy as np
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm

def write_wav(dst: Path, data: np.ndarray, sr: int):
    # Clip to [-1, 1] before scaling to int16.
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))

fma_dir = Path(sys.argv[1])
fma_out = Path(sys.argv[2])

# convert MP3 → 16k mono WAV
mp3s = list(fma_dir.rglob("*.mp3"))
print(f" MP3 files: {len(mp3s)}")
fma_bad = []
ok = 0
for p in tqdm(mp3s, desc=" FMA→WAV (resample 16k mono)"):
    try:
        outfile = Path(fma_out / (p.stem + ".wav"))
        # Already converted on an earlier run: leave it alone.
        if outfile.exists():
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # A bad file is recorded and skipped; it does not abort the run.
        fma_bad.append(f"{p}:{e}")

if fma_bad:
    (fma_out / "fma_corrupted_files.log").write_text("\n".join(fma_bad))
print(f" FMA complete ({ok} ok, {len(fma_bad)} failed)")
EOF
}
Existing FMA valid" +else + actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || : + if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then + if [ ! -f "${AUDIO_ZIP}" ] ; then + echo " Downloading ${AUDIO_ZIPFILE}" + curl -sfL "${AUDIO_URL}" -o "${AUDIO_ZIP}" + fi + + rm -rf "${AUDIO_DIR}" || : + mkdir "${AUDIO_DIR}" + echo " Unzipping ${AUDIO_ZIPFILE}" + unzip -q -d "${AUDIO_DIR}" "${AUDIO_ZIP}" + fi + if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then + echo " Cleaning up ${AUDIO_ZIPFILE}" + rm -rf "${AUDIO_ZIP}" + fi + + converter + + actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || : + filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}" + write_filecount=true +fi + +if ${write_filecount} ; then + write_filecounts filecounts "${AUDIO_FILECOUNT}" +fi + +if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then + echo " Cleaning up ${AUDIO_ZIPFILE}" + rm -rf "${AUDIO_ZIP}" +fi + +if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ]; then + echo " Cleaning up ${AUDIO_DIR}" + rm -rf "${AUDIO_DIR}" +fi + +echo " FMA complete" +exit 0 + diff --git a/cli/setup_mit_audio b/cli/setup_mit_audio new file mode 100755 index 0000000..e5a1f23 --- /dev/null +++ b/cli/setup_mit_audio @@ -0,0 +1,124 @@ +#!/bin/bash +set -euo pipefail + +PROGPATH=$(realpath "$0") +PROGDIR=$(dirname "${PROGPATH}") + +source "${PROGDIR}/shell.functions" + +if [ "${HELP}" == "true" ] ; then + cat <&2 +Usage: $0 [ --cleanup-archives ] [ --cleanup-input-files ] [ --data-dir= ] + + --cleanup-archives : Automatically clean up any downloaded archvies after + extraction. + --cleanup-intermediate-files + : Automatically clean up the intermediate files after they've + : converted to 16k. + : Path to the data directory. 
#######################################
# Normalise every WAV found under AUDIO_DIR to a 16 kHz mono 16-bit WAV
# written to AUDIO16K_DIR, using the Python packages of the data-dir venv.
# Globals:
#   DATA_DIR      (read) directory that contains .venv
#   AUDIO_DIR     (read) handed to the Python script as sys.argv[1]
#   AUDIO16K_DIR  (read) handed to the Python script as sys.argv[2]
# Outputs:
#   Progress on stdout.
# Returns:
#   1 when the virtualenv cannot be activated, otherwise python's status
#   (the Python script re-raises the first conversion error).
#######################################
converter() {
  # Quoted so a DATA_DIR containing whitespace or glob characters still
  # resolves to a single path; bail out instead of running whichever
  # "python" happens to be on PATH when activation fails.
  # shellcheck source=/dev/null
  source "${DATA_DIR}/.venv/bin/activate" || return 1

  # The delimiter is quoted so the shell passes the Python text through
  # untouched (no parameter or command substitution inside the script).
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import os, sys, subprocess, scipy.io.wavfile, numpy as np
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm

def write_wav(dst: Path, data: np.ndarray, sr: int):
    # Clip to [-1, 1] before scaling to int16.
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))

rir_in = Path(sys.argv[1])
rir_out = Path(sys.argv[2])

waves = list(rir_in.rglob("*.wav"))
try:
    print(" MIT RIR normalizing to 16k…")
    # Normalize to 16k mono
    for p in tqdm(waves, desc=" MIT_RIR (resample 16k mono)"):
        outfile = Path(rir_out / p.name)
        # Already converted on an earlier run: leave it alone.
        if outfile.exists():
            continue
        a, sr = sf.read(p, always_2d=False)
        # Multi-channel input: keep the first channel only.
        if a.ndim > 1:
            a = a[:, 0]
        # Anything not already at 16 kHz is reloaded through librosa,
        # which resamples and downmixes.
        if sr != 16000:
            a, _ = librosa.load(p, sr=16000, mono=True)
        write_wav(outfile, a, 16000)
    print(" MIT RIR normalization complete")
except Exception as e2:
    print(f" MIT RIR fallback failed: {e2}")
    raise
EOF
}
| wc -l) || : + if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then + if [ ! -f "${AUDIO_ZIP}" ] ; then + echo " Downloading ${AUDIO_ZIPFILE}" + curl -sfL "${AUDIO_URL}" -o "${AUDIO_ZIP}" + fi + + rm -rf "${AUDIO_DIR}" || : + echo " Unzipping ${AUDIO_ZIPFILE}" + unzip -u -q -d "${AUDIO_DIR}" "${AUDIO_ZIP}" + fi + if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then + echo " Cleaning up ${AUDIO_ZIPFILE}" + rm -rf "${AUDIO_ZIP}" + fi + + converter + actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || : + filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}" + write_filecount=true +fi + +if ${write_filecount} ; then + write_filecounts filecounts "${AUDIO_FILECOUNT}" +fi + +if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then + echo " Cleaning up ${AUDIO_ZIPFILE}" + rm -rf "${AUDIO_ZIP}" +fi + +if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ]; then + echo " Cleaning up ${AUDIO_DIR}" + rm -rf "${AUDIO_DIR}" +fi + +echo " MIT_RIR complete" +exit 0 diff --git a/cli/setup_negative_datasets b/cli/setup_negative_datasets new file mode 100755 index 0000000..eec7fee --- /dev/null +++ b/cli/setup_negative_datasets @@ -0,0 +1,85 @@ +#!/bin/bash +set -euo pipefail + +PROGPATH=$(realpath "$0") +PROGDIR=$(dirname "${PROGPATH}") + +source "${PROGDIR}/shell.functions" + +if [ "${HELP}" == "true" ] ; then + cat <&2 +Usage: $0 [ --cleanup-archives ] [ --data-dir= ] + + --cleanup-archives : Automatically clean up any downloaded archvies after + extraction. + : Path to the data directory. 
+ : Default: ${DATA_DIR} + +EOF + exit 1 +fi + +mkdir -p "${DATA_DIR}/training_datasets/downloads" || : +cd "${DATA_DIR}/training_datasets" + +mkdir -p ./negative_datasets || : + +NEGATIVE_DATASET_URL="https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main" +declare -a NEGATIVE_DATASETS=( dinner_party dinner_party_eval no_speech speech ) +AUDIO_FILECOUNT="./downloads/negative_filecount" + +declare -A filecounts=( [dinner_party.zip]=0 [dinner_party_eval.zip]=0 [no_speech.zip]=0 [speech.zip]=0 ) +get_filecounts filecounts "${AUDIO_FILECOUNT}" + +echo "===== Checking negative datasets: ${NEGATIVE_DATASETS[*]} =====" +write_filecount=false + +for ds in "${NEGATIVE_DATASETS[@]}" ; do + AUDIO_ZIPFILE="${ds}.zip" + AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}" + AUDIO_DIR="./negative_datasets/${ds}" + mkdir -p "${AUDIO_DIR}" || : + + expected_filecount=${filecounts[${AUDIO_ZIPFILE}]} + actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || : + + if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then + echo " Existing ${ds} valid" + continue + fi + + if [ ! 
-f "${AUDIO_ZIP}" ] ; then + echo " Downloading ${AUDIO_ZIPFILE}" + curl -sfL "${NEGATIVE_DATASET_URL}/${ds}.zip" -o "${AUDIO_ZIP}" + fi + + rm -rf "${AUDIO_DIR}" || : + echo " Unzipping ${AUDIO_ZIPFILE}" + unzip -q -d "./negative_datasets" "${AUDIO_ZIP}" + actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || : + filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}" + write_filecount=true + + if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then + echo " Cleaning up ${AUDIO_ZIPFILE}" + rm -rf "${AUDIO_ZIP}" + fi +done + +if ${write_filecount} ; then + write_filecounts filecounts "${AUDIO_FILECOUNT}" +fi + +if "${CLEANUP_ARCHIVES}" ; then + for ds in "${NEGATIVE_DATASETS[@]}" ; do + AUDIO_ZIPFILE="${ds}.zip" + AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}" + if [ -f "${AUDIO_ZIP}" ] ; then + echo " Cleaning up ${AUDIO_ZIPFILE}" + rm -rf "${AUDIO_ZIP}" + fi + done +fi + +echo " Negative datasets complete" + diff --git a/cli/setup_python_venv b/cli/setup_python_venv new file mode 100755 index 0000000..153d43d --- /dev/null +++ b/cli/setup_python_venv @@ -0,0 +1,183 @@ +#!/bin/bash +PROGDIR="$(dirname $(realpath $0))" + +KNOWN_ARGS=( data-dir python gpu no-gpu ) +source "${PROGDIR}/shell.functions" + +if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then + echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2 + HELP=true +fi + +if [ "${HELP}" == "true" ] ; then + cat <&2 +Usage: setup_python_venv [ --gpu | --no-gpu ] [ --verbose ] + +Options: +--gpu: Install the GPU-capable versions of packages if available. This + is the default if the script detects that a GPU is available. + +--no-gpu: Install the non-GPU-capable versions of packages even if + GPU-capable packages are available. This is the default if the script + detects that a GPU is NOT available. + +--verbose: Print the detailed "pip install" output. + +EOF + exit 1 +fi + +[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath ${DATA_DIR})" +[ -d "${DATA_DIR}" ] || { + echo "Data directory '${DATA_DIR}' doesn't exist." 
#######################################
# Run "pip install" with the given arguments.
# Globals:
#   VERBOSE (read) "true" shows pip's own output; anything else shows one
#           '.' per line of pip output instead.
# Arguments:
#   "$@" - passed to "pip install" unchanged
# Outputs:
#   pip output or progress dots, followed by a newline, on stdout.
# Returns:
#   0 when pip succeeded, 1 when it failed (in either mode).
#######################################
pip_install() {
  local rc=0
  if "${VERBOSE:-false}" ; then
    pip install "$@" || return 1
  else
    # tr -d '[:print:]' drops every printable character, so only pip's
    # newlines survive; the second tr turns each newline into a dot.
    # stdbuf keeps both filters unbuffered so dots appear as pip works.
    #
    # pipefail is switched on inside a subshell so pip's exit status
    # reaches this function whatever the caller's shell options are.
    # A "return" placed inside the pipeline would only leave the
    # pipeline's own subshell and the failure would be lost.
    (
      set -o pipefail
      pip install "$@" \
        | stdbuf -i0 -o0 tr -d '[:print:]' \
        | stdbuf -i0 -o0 tr '\n' '.'
    ) || rc=1
  fi
  # Terminate the line of dots (or separate verbose output).
  echo
  return "${rc}"
}
-f "${MODEL_FILE}.json" ] ; then + echo " Downloading ${MODEL_NAME}.json for piper-sample-generator" + curl -sfL "${MODEL_URL}.json" -o "${MODEL_FILE}.json" +fi + +${GPU} && onnxgpu='-gpu[cuda]' || onnxgpu="" +echo " ===== Installing onnxruntime${onnxgpu} =====" +pip_install "onnxruntime${onnxgpu}>=1.16.0" + +echo " ===== Installing keras =====" +# keras 3.13 has "issues" so we need to back down to 3.12. +pip_install "keras==3.12.0" + +${PROGDIR}/test_python --data-dir="${DATA_DIR}" + +touch .mww-data-dir +END_TS=$EPOCHSECONDS + +echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell." + +print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete" + + diff --git a/cli/setup_training_datasets b/cli/setup_training_datasets new file mode 100755 index 0000000..fc6e280 --- /dev/null +++ b/cli/setup_training_datasets @@ -0,0 +1,48 @@ +#!/bin/bash +set -euo pipefail + +PROGPATH=$(realpath "$0") +PROGDIR=$(dirname "${PROGPATH}") + +KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files ) +source "${PROGDIR}/shell.functions" + +if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then + echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2 + HELP=true +fi + +if [ "${HELP}" == "true" ] ; then + cat <&2 +Usage: setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ] + +Options: +--cleanup-archives: Automatically delete the tarballs or zipfiles after + they've been extracted. + +--cleanup-intermediate-files: Automatically delete the intermediate files + after they've been converted. 
+ +EOF + exit 1 +fi + +cd "${DATA_DIR}" + +START_TS=$EPOCHSECONDS +echo -e "\n===== Setting up Training Datasets =====\n" + +${PROGDIR}/setup_negative_datasets --cleanup-archives=${CLEANUP_ARCHIVES} \ + --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}" + +${PROGDIR}/setup_mit_audio --cleanup-archives=${CLEANUP_ARCHIVES} \ + --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}" + +${PROGDIR}/setup_audioset --cleanup-archives=${CLEANUP_ARCHIVES} \ + --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}" + +${PROGDIR}/setup_fma --cleanup-archives=${CLEANUP_ARCHIVES} \ + --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}" + +END_TS=$(date +%s.%N) +print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup" diff --git a/cli/shell.functions b/cli/shell.functions new file mode 100644 index 0000000..07b3b02 --- /dev/null +++ b/cli/shell.functions @@ -0,0 +1,150 @@ + +if [ "$0" == "${BASH_SOURCE[0]}" ] ; then + echo "${BASH_SOURCE[0]} is meant to be 'sourced' not run directly" >&2 + exit 1 +fi + +if [ ! 
# ---------------------------------------------------------------------------
# Command-line parsing for the script that sources this file.
#
# Reads the sourcing script's "$@" and sets:
#   HELP / QUIET / VERBOSE  true when -h/--help, -q/--quiet, -v/--verbose seen
#   <NAME>                  for --some-name=value : SOME_NAME=value
#                           for --no-some-name    : SOME_NAME=false
#                           for --some-name       : SOME_NAME=true
#   POSITIONAL_ARGS         arguments that do not start with "--"
#   EXTRA_ARGS              everything after a literal "--"
#   UNKNOWN_ARGS            options not listed in KNOWN_ARGS (only filled
#                           when the sourcing script defined KNOWN_ARGS)
#   OPTION_COUNT            number of arguments examined before "--"
# ---------------------------------------------------------------------------
HELP=false

# The common switches are always accepted when a script restricts options.
if [ -v KNOWN_ARGS ] ; then
  KNOWN_ARGS+=( help verbose quiet h v q )
fi
declare -gi OPTION_COUNT=0
declare -ga POSITIONAL_ARGS=()
declare -ga EXTRA_ARGS=()
declare -ga UNKNOWN_ARGS=()
declare -i __stop_parsing=0
for a in "$@"; do
  # NOTE(review): these shifts drop from the front of the caller's "$@",
  # not the element being visited. Kept as-is for compatibility; confirm
  # whether any caller relies on it.
  if [ "$a" == "--" ] ; then
    __stop_parsing=1
    shift
    continue
  fi
  if [ "${__stop_parsing}" == 1 ] ; then
    EXTRA_ARGS+=( "$a" )
    shift
    continue
  fi

  # Record options whose name (text between the dashes and any '=') is
  # not in KNOWN_ARGS. They are still processed below.
  if [ -v KNOWN_ARGS ] && [[ "${a}" =~ ^--?([^=]+)=?.* ]] ; then
    _arg=${BASH_REMATCH[1]}
    known=false
    for _k in "${KNOWN_ARGS[@]}" ; do
      if [ "${_arg}" == "${_k}" ] ; then
        known=true
        break
      fi
    done
    "${known}" || UNKNOWN_ARGS+=( "${a}" )
  fi
  OPTION_COUNT+=1
  case "$a" in
    -h | --help)
      # Help wins: nothing after it needs to be looked at.
      HELP=true
      break
      ;;
    -q | --quiet)
      # No "break" here: options following -q/--quiet must still be parsed.
      QUIET=true
      ;;
    -v | --verbose)
      # No "break" here: options following -v/--verbose must still be parsed.
      VERBOSE=true
      ;;
    --*=*)
      # --some-name=value  ->  SOME_NAME=value (assigned through a nameref)
      [[ $a =~ --([^=]+)=(.*) ]]
      l=${BASH_REMATCH[1]//-/_}
      declare -n var="${l^^}"
      var="${BASH_REMATCH[2]}"
      ;;
    --no-*)
      # --no-some-name  ->  SOME_NAME=false
      [[ $a =~ --no-(.+) ]]
      l=${BASH_REMATCH[1]//-/_}
      declare -n var="${l^^}"
      var=false
      ;;
    --*)
      # --some-name  ->  SOME_NAME=true
      [[ $a =~ --(.+) ]]
      l=${BASH_REMATCH[1]//-/_}
      declare -n var="${l^^}"
      var=true
      ;;
    *)
      POSITIONAL_ARGS+=( "$a" )
      ;;
  esac
done
a/cli/system_summary b/cli/system_summary new file mode 100755 index 0000000..da4f7c1 --- /dev/null +++ b/cli/system_summary @@ -0,0 +1,18 @@ +#!/bin/bash +PROGPATH=$(realpath "$0") +PROGDIR=$(dirname "${PROGPATH}") + +CUDA_INFO=$("${PROGDIR}/cudainfo") +CUDA_CORES=$(sed -n -r -e "s/\s*Total\s+CUDA\s+Cores:\s+([0-9]+)$/\1/gp" <<<${CUDA_INFO}) +GPU_NAME="$(sed -n -r -e 's/\s*GPU\s+Name:\s+(.+)$/\1/gp' <<<${CUDA_INFO})" +GPU_MEMORY="$(sed -n -r -e 's/\s*Total\s+Memory:\s*([0-9.]+).*/\1/gp' <<<${CUDA_INFO})" +CPU_NAME="$(sed -n -r -e 's/model\s+name\s*:\s*(.+)$/\1/gp' /proc/cpuinfo | head -1)" +CPU_CORES="$(nproc)" +SYS_MEMORY="$(free -m | sed -n -r -e 's/Mem:\s+([0-9.]+)\s+.*/\1/gp')" + +printf "CPU: %s (%d cores) Memory: %s mb\n" "${CPU_NAME}" "${CPU_CORES}" "${SYS_MEMORY}" +if [ -z "${GPU_NAME}" ] ; then + printf "GPU: N/A\n" +else + printf "GPU: %s (%d cores) Memory: %s mb\n" "${GPU_NAME}" "${CUDA_CORES}" "${GPU_MEMORY}" +fi diff --git a/cli/tensorboard1.png b/cli/tensorboard1.png new file mode 100644 index 0000000..a7741d9 Binary files /dev/null and b/cli/tensorboard1.png differ diff --git a/cli/tensorboard2.png b/cli/tensorboard2.png new file mode 100644 index 0000000..9042fdc Binary files /dev/null and b/cli/tensorboard2.png differ diff --git a/cli/tensorboard3.png b/cli/tensorboard3.png new file mode 100644 index 0000000..6df0306 Binary files /dev/null and b/cli/tensorboard3.png differ diff --git a/cli/test_python b/cli/test_python new file mode 100755 index 0000000..50a2718 --- /dev/null +++ b/cli/test_python @@ -0,0 +1,129 @@ +#!/bin/bash + +PROGPATH=$(realpath "$0") +PROGDIR=$(dirname "${PROGPATH}") +TRAINING_STEPS=40000 +DATA_DIR=/data +source "${PROGDIR}/shell.functions" + +source "${DATA_DIR}/.venv/bin/activate" + +export TF_CPP_MIN_LOG_LEVEL=9 +export GLOG_minloglevel=2 +export GRPC_VERBOSITY="ERROR" + +echo -e "\n===== Testing Python Environment =====\n" + +echo -e "\n===== Testing Cuda =====\n" +"${PROGDIR}/cudainfo" + +python - 2>/dev/null <= 0: + 
result = "Available - " + c.device + else: + result = "Not available" + except: + result = "Not available" + + print("GPU:", result) + + try: + with tf.device("/CPU:0"): + a = tf.random.normal([10000, 10000]) + b = tf.random.normal([10000, 10000]) + c = tf.matmul(a, b) + result = "Available - " + c.device + except: + result = "Not available" + + print("CPU:", result) +except: + print("Tensorflow not available") +EOF + + +python - 2>/dev/null </dev/null </dev/null </dev/null && { + echo "piper-sample-generator available" +} || { + echo "piper-sample-generator not available" +} + +echo +echo -e "\n===== Python Environment Testing Complete =====\n" diff --git a/cli/train_wake_word b/cli/train_wake_word new file mode 100755 index 0000000..b52adcf --- /dev/null +++ b/cli/train_wake_word @@ -0,0 +1,125 @@ +#!/bin/bash +set -e + +PROGPATH=$(realpath "$0") +PROGDIR=$(dirname "${PROGPATH}") + +KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir ) +source "${PROGDIR}/shell.functions" +WAKE_WORD=${POSITIONAL_ARGS[0]} + +if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then + echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2 + HELP=true +fi + +if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then + cat <&2 +Usage: train_wake_word [ --samples= ] [ --batch-size= ] + [ --training-steps= ] [ --cleanup-work-dir ] + [ ] + +Options: +--samples: The number of samples to generate for the wake word. + Default: ${DEFAULT_SAMPLES} + +--batch-size: How many samples should be generated at a time. The more + samples per batch, the more memory is needed. + Default: ${DEFAULT_BATCH_SIZE} + +--training-steps: Number of training steps. More training steps means better + detection and false positive rates but also more time to train. + Default: ${DEFAULT_TRAINING_STEPS} + +--cleanup-work-dir: Delete the /data/work directory after successful training. + Default: false + + The word to train spelled phonetically. + Required. + + An optional pretty name to save to the json metadata file. 
#######################################
# Choose the pretty name stored alongside the trained model.
# Precedence:
#   1. the second positional argument, when one was given;
#   2. a WAKE_WORD_TITLE that is already set and non-empty;
#   3. WAKE_WORD itself, when WAKE_WORD_TITLE is set but empty;
#   4. otherwise WAKE_WORD with every non-alphanumeric character treated
#      as a word separator and each resulting word capitalised.
# Globals:
#   POSITIONAL_ARGS (read), WAKE_WORD (read), WAKE_WORD_TITLE (written)
#######################################
resolve_wake_word_title() {
  local -a words=()

  # ${#POSITIONAL_ARGS[@]} is the number of arguments. Without the [@]
  # the expansion is the string length of the first argument, which made
  # the title depend on how long the wake word was.
  if [ "${#POSITIONAL_ARGS[@]}" -ge 2 ] ; then
    WAKE_WORD_TITLE="${POSITIONAL_ARGS[1]}"
  fi

  if [ ! -v WAKE_WORD_TITLE ] ; then
    # Word splitting is intentional: separators were just turned into
    # spaces, leaving only [a-zA-Z0-9 ] in the expansion.
    # shellcheck disable=SC2206
    words=( ${WAKE_WORD//[^a-zA-Z0-9]/ } )
    WAKE_WORD_TITLE="${words[*]^}"
  elif [ -z "${WAKE_WORD_TITLE}" ] ; then
    WAKE_WORD_TITLE="${WAKE_WORD}"
  fi
}
resolve_wake_word_title
"${DATA_DIR}/work/trained_models" "${DATA_DIR}/work/wake_word_samples" \ + "${DATA_DIR}/work/wake_word_samples_augmented" "${DATA_DIR}/work/last_wake_word" || : +fi +END_TS=$EPOCHSECONDS + +python -c $'print(f"{\'=\' * 80}")' +printf "%44s\n\n" "Training Summary" +"${PROGDIR}/system_summary" +echo +print_elapsed_time --no-separators "${START_TS}" "${POST_GEN_TS}" "Generate ${SAMPLES} samples, ${BATCH_SIZE}/batch" +print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augment ${SAMPLES} samples" +print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps" +python -c $'msg="="*54 ; print(f"{msg:>80s}")' +print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total" +python -c $'print(f"{\'=\' * 80}")' diff --git a/cli/wake_word_sample_augmenter b/cli/wake_word_sample_augmenter new file mode 100755 index 0000000..3e2e5b8 --- /dev/null +++ b/cli/wake_word_sample_augmenter @@ -0,0 +1,215 @@ +#!/usr/bin/env python + +import sys, os, gc, glob, random +import types, shutil, json +from datetime import datetime, timezone +from pathlib import Path +from argparse import ArgumentParser as ArgParser, ArgumentError + +default_data_dir = os.getcwd() if os.path.exists(".mww-data-dir") else "/data" + +parser = ArgParser(exit_on_error=False) +parser.add_argument("--data-dir", type=str, help=f"Data directory. Default: {default_data_dir}", required=False, default=default_data_dir) +parser.add_argument("--input-dir", type=str, help="Sample input directory. Default: /work/wake_word_samples", required=False) +parser.add_argument("--output-dir", type=str, help="Sample output directory. Default: _augmented", required=False) +parser.add_argument("--mit-rirs-16k-dir", type=str, help="MIT RIR input directory. Default: /training_datasets/mit_rirs_16k", required=False) +parser.add_argument("--fma-16k-dir", type=str, help="FMA input directory. 
def validate_directories(paths):
    """Report whether every entry of ``paths`` exists on disk.

    Entries are tested in order with ``os.path.exists``. At the first
    missing entry an error line naming it is printed and ``False`` is
    returned without examining the rest; ``True`` is returned when nothing
    is missing (including for an empty ``paths``).
    """
    # Sentinel so that a falsy entry such as "" is still reported.
    nothing_missing = object()
    first_missing = next(
        (candidate for candidate in paths if not os.path.exists(candidate)),
        nothing_missing,
    )
    if first_missing is nothing_missing:
        return True
    print(f"Error: Directory {first_missing} does not exist. Please ensure preprocessing is complete.")
    return False
"SevenBandParametricEQ": 0.1, + "TanhDistortion": 0.05, + "PitchShift": 0.15, + "BandStopFilter": 0.1, + "AddColorNoise": 0.1, + "AddBackgroundNoise": 0.7, + "Gain": 0.8, + "RIR": 0.7, + }, + impulse_paths=impulse_paths, + background_paths=background_paths, + background_min_snr_db=5, + background_max_snr_db=10, + min_jitter_s=0.2, + max_jitter_s=0.3, +) + +# Augment samples and save the training, validation, and testing sets. + +def audio_generator_from_wavs(self, split="train", repeat=1): + """ + Yield 1-D float32 arrays loaded via librosa from input_dir/*.wav. + Deterministic 80/10/10 split with seed 10 to mirror original Clips behavior. + """ + files = sorted(glob.glob(args.input_dir + "/*.wav")) + if not files: + raise RuntimeError("❌ No WAVs in wake_word_samples.") + + rng = random.Random(10) # deterministic shuffling like Clips(random_split_seed=10) + files_shuf = files[:] + rng.shuffle(files_shuf) + + n = len(files_shuf) + n_val = max(1, int(0.10 * n)) + n_test = max(1, int(0.10 * n)) + n_train = max(0, n - n_val - n_test) + splits = { + "train": files_shuf[:n_train], + "validation": files_shuf[n_train:n_train + n_val], + "test": files_shuf[n_train + n_val:], + } + file_list = splits.get(split, []) + if not file_list: + return # nothing to yield + + for _ in range(max(1, int(repeat))): + for p in file_list: + y, sr = librosa.load(p, sr=16000, mono=True) + yield y.astype(np.float32, copy=False) + +# Bind the patched generator to your existing `clips` instance +clips.audio_generator = types.MethodType(audio_generator_from_wavs, clips) + +# ---- Split config (same as before) ---- +split_cfg = { + "training": {"name": "train", "repetition": 2, "slide_frames": 10}, + "validation": {"name": "validation", "repetition": 1, "slide_frames": 10}, + "testing": {"name": "test", "repetition": 1, "slide_frames": 1}, +} + +# ---- Generate features ---- +for split, cfg in split_cfg.items(): + out_dir = out_path / split + out_dir.mkdir(parents=True, exist_ok=True) + print(f" 
Augmenting {split}") + + print(f" Generating spectrograms") + spectros = SpectrogramGeneration( + clips=clips, # now backed by our WAV loader + augmenter=augmenter, # your existing augmenter + slide_frames=cfg["slide_frames"], + step_ms=10, + ) + + print(f" Generating files") + RaggedMmap.from_generator( + out_dir=str(out_dir / "wakeword_mmap"), + sample_generator=spectros.spectrogram_generator( + split=cfg["name"], repeat=cfg["repetition"] + ), + batch_size=100, + verbose=False, + ) + print(f" {split} augmentation complete") + +END_TIME = datetime.now(timezone.utc).replace(microsecond=0) +et = END_TIME - START_TIME +print(f"\n{'=' * 80}") +msg=f"Augmented {max_samples} wake word samples." +print(f"{msg:>50s} Elapsed time: {et!s}") +print(f"{'=' * 80}\n") diff --git a/cli/wake_word_sample_generator b/cli/wake_word_sample_generator new file mode 100755 index 0000000..3afcd6c --- /dev/null +++ b/cli/wake_word_sample_generator @@ -0,0 +1,112 @@ +#!/bin/bash +set -e + +PROGPATH=$(realpath "$0") +PROGDIR=$(dirname "${PROGPATH}") + +KNOWN_ARGS=( samples batch-size data-dir ) +source "${PROGDIR}/shell.functions" +WAKE_WORD="${POSITIONAL_ARGS[0]}" + +if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then + echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2 + HELP=true +fi + +if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then + cat <&2 +Usage: $0 [ --samples= ] [ --batch-size= ] + +--samples: The number of samples to generate for the wake word. + Default: ${DEFAULT_SAMPLES} + +--batch-size: How many samples should be generated at a time. The more + samples, the more memory is needed. + Default: ${DEFAULT_BATCH_SIZE} + + The word to generate samples for. + Required. 
+
+EOF
+  exit 1
+fi
+
+# shellcheck source=/dev/null
+source "${DATA_DIR}/.venv/bin/activate"
+
+WORK_DIR="${DATA_DIR}/work"
+mkdir -p "${WORK_DIR}" || :
+cd "${WORK_DIR}"
+
+PSG="${DATA_DIR}/tools/piper-sample-generator"
+MODELS_DIR="${PSG}/models"
+MODEL_NAME=en_US-libritts_r-medium.pt
+MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}"
+SAMPLES_DIR="${WORK_DIR}/wake_word_samples"
+
+mkdir -p "${SAMPLES_DIR}" || :
+
+REGENERATE=false
+
+# A request for exactly one sample is a quick audition: write a single WAV
+# named after the wake word into test_sample/ and stop.
+if [ "${SAMPLES}" -eq 1 ] ; then
+  echo "===== Generating ${SAMPLES} sample of '${WAKE_WORD}' ====="
+  # Replace spaces and shell/filesystem-unfriendly characters with "_".
+  wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
+
+  mkdir -p "${WORK_DIR}/test_sample" || :
+  "${PSG}/generate_samples.py" "${WAKE_WORD}" \
+    --model "${MODEL_FILE}" \
+    --max-samples "${SAMPLES}" \
+    --batch-size "${BATCH_SIZE}" \
+    --output-dir "${WORK_DIR}/test_sample" \
+    --max-speakers 100 2>&1 | sed -r -e "s/(DEBUG|INFO):__main__:/ /g"
+  mv "${WORK_DIR}/test_sample/0.wav" "${WORK_DIR}/test_sample/${wake_word_filename}.wav"
+  echo "Sample available at ${WORK_DIR}/test_sample/${wake_word_filename}.wav"
+  echo "Play it from your host."
+  exit 0
+fi
+
+# Regenerate unless the previous run used the same word, count and model.
+# -x/-F compare the whole recorded line literally, so a wake word that
+# contains regex characters, or is a substring of the previous wake word,
+# is not mistaken for a match.
+grep -qxF -- "${WAKE_WORD}:${SAMPLES}:${MODEL_NAME}" "${WORK_DIR}/last_wake_word" &>/dev/null || REGENERATE=true
+
+# Double check that the number of existing samples matches SAMPLES
+existing_samples=$(find "${SAMPLES_DIR}" -name '*.wav' | wc -l)
+[ "${existing_samples}" -eq "${SAMPLES}" ] || REGENERATE=true
+
+START_TS=$EPOCHSECONDS
+
+if ! ${REGENERATE} ; then
+  echo "Sample generation not required"
+  echo
+  exit 0
+fi
+
+echo -e "\n===== Generating ${SAMPLES} wake word samples in batches of ${BATCH_SIZE} ====="
+export TF_CPP_MIN_LOG_LEVEL=9
+export TF_FORCE_GPU_ALLOW_GROWTH=true
+export TF_GPU_ALLOCATOR=cuda_malloc_async
+export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
+export NVIDIA_TF32_OVERRIDE=1
+export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
+export GLOG_minloglevel=2
+export GRPC_VERBOSITY=ERROR
+
+echo " Generating samples"
+# Start from an empty directory so the file count below reflects this run only.
+rm -rf "${SAMPLES_DIR}" || :
+mkdir -p "${SAMPLES_DIR}" || :
+"${PSG}/generate_samples.py" "${WAKE_WORD}" \
+  --model "${MODEL_FILE}" \
+  --max-samples "${SAMPLES}" \
+  --batch-size "${BATCH_SIZE}" \
+  --output-dir "${SAMPLES_DIR}" 2>&1 | sed -r -e "s/(DEBUG|INFO):__main__:/ /g"
+
+# The pipeline's exit status is sed's, so verify success by counting output.
+generated_files=$(find "${SAMPLES_DIR}" -name '*.wav' | wc -l)
+if [ "${generated_files}" -ne "${SAMPLES}" ] ; then
+  echo "ERROR: only generated ${generated_files} files" >&2
+  exit 1
+fi
+# Record what was generated so an identical request can be skipped next time.
+echo "${WAKE_WORD}:${SAMPLES}:${MODEL_NAME}" > "${WORK_DIR}/last_wake_word"
+echo
+END_TS=$EPOCHSECONDS
+print_elapsed_time "${START_TS}" "${END_TS}" "Generated ${SAMPLES} wake word samples."
+
+exit 0
diff --git a/cli/wake_word_sample_trainer b/cli/wake_word_sample_trainer
new file mode 100755
index 0000000..743b3fe
--- /dev/null
+++ b/cli/wake_word_sample_trainer
@@ -0,0 +1,241 @@
+#!/bin/bash
+set -e
+
+PROGPATH=$(realpath "$0")
+PROGDIR=$(dirname "${PROGPATH}")
+
+# shell.functions reads KNOWN_ARGS and is expected to provide
+# POSITIONAL_ARGS, UNKNOWN_ARGS, HELP and the option variables used below.
+KNOWN_ARGS=( training-steps samples data-dir )
+source "${PROGDIR}/shell.functions"
+WAKE_WORD="${POSITIONAL_ARGS[0]}"
+
+if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
+  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
+  HELP=true
+fi
+
+if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then
+  # Usage goes to stderr; the here-document is closed by the EOF line below.
+  cat <<EOF >&2
+Usage: $0 [ --samples=<samples> ] [ --training-steps=<training_steps> ]
+       <wake_word> [ <wake_word_title> ]
+
+       $0 -h/--help
+
+--samples:         The number of samples to generate for the wake word.
+                   Used only to generate output file names.
+
+--training-steps:  Number of training steps.
+                   Default: ${DEFAULT_TRAINING_STEPS}
+
+<wake_word>:       The word to train spelled phonetically.
+                   Required.
+
+<wake_word_title>: A pretty name to save to the json metadata file.
+                   Default: The wake word with individual words capitalized.
+
+EOF
+  exit 1
+fi
+
+WORK_DIR="${DATA_DIR}/work"
+TRAINING_DS="${DATA_DIR}/training_datasets"
+
+# ${#POSITIONAL_ARGS[@]} is the number of positional arguments; without the
+# [@] it would be the string length of the first one.
+[ ${#POSITIONAL_ARGS[@]} -ge 2 ] && WAKE_WORD_TITLE="${POSITIONAL_ARGS[1]}"
+
+if [ ! -v WAKE_WORD_TITLE ] ; then
+  # No title given: split the wake word on non-alphanumerics and capitalize
+  # the first letter of each resulting word.
+  declare -a WWNA=( ${WAKE_WORD//[^a-zA-Z0-9]/ } )
+  WAKE_WORD_TITLE="${WWNA[*]^}"
+elif [ -z "$WAKE_WORD_TITLE" ] ; then
+  WAKE_WORD_TITLE="$WAKE_WORD"
+fi
+
+# shellcheck source=/dev/null
+source "${DATA_DIR}/.venv/bin/activate"
+
+# Exit with an error unless every argument is an existing directory.
+check_directories() {
+  local d
+  for d in "$@" ; do
+    [ -d "$d" ] || { echo "ERROR: Directory $d not found" >&2 ; exit 1 ; }
+  done
+}
+
+# Quoted so paths containing spaces stay single arguments; the brace list is
+# left outside the quotes so it still expands.
+check_directories "${WORK_DIR}/wake_word_samples_augmented" \
+  "${TRAINING_DS}/negative_datasets/"{speech,dinner_party,no_speech,dinner_party_eval}
+
+cd "${WORK_DIR}"
+
+echo "===== Starting ${TRAINING_STEPS} training steps ====="
+
+START_TS=$EPOCHSECONDS
+
+mkdir -p "${WORK_DIR}/trained_models" || :
+# Unquoted delimiter: the directory and step variables are expanded into
+# the YAML as it is written.
+cat <<EOF >"${WORK_DIR}/trained_models/training_parameters.yaml"
+batch_size: 16
+clip_duration_ms: 1500
+eval_step_interval: 500
+features:
+- features_dir: ${WORK_DIR}/wake_word_samples_augmented
+  penalty_weight: 1.0
+  sampling_weight: 2.0
+  truncation_strategy: truncate_start
+  truth: true
+  type: mmap
+- features_dir: ${TRAINING_DS}/negative_datasets/speech
+  penalty_weight: 1.0
+  sampling_weight: 12.0
+  truncation_strategy: random
+  truth: false
+  type: mmap
+- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party
+  penalty_weight: 1.0
+  sampling_weight: 12.0
+  truncation_strategy: random
+  truth: false
+  type: mmap
+- features_dir: ${TRAINING_DS}/negative_datasets/no_speech
+  penalty_weight: 1.0
+  sampling_weight: 5.0
+  truncation_strategy: random
+  truth: false
+  type: mmap
+- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party_eval
+  penalty_weight: 1.0
+  sampling_weight: 0.0
+  truncation_strategy: split
+  truth: false
+  type: mmap
+freq_mask_count:
+- 0
+freq_mask_max_size:
+- 0
+learning_rates:
+- 0.001
+maximization_metric: average_viable_recall
+minimization_metric: null
+negative_class_weight:
+- 20
+positive_class_weight:
+- 1
+target_minimization: 0.9
+time_mask_count:
+- 0
+time_mask_max_size:
+- 0
+train_dir: ${WORK_DIR}/trained_models/wakeword
+training_steps:
+- ${TRAINING_STEPS}
+window_step_ms: 10
+
+EOF
+
+echo " Wrote training_parameters.yaml"
+# Remove any previous run's output directory before training.
+rm -rf "${WORK_DIR}/trained_models/wakeword"
+
+export TF_CPP_MIN_LOG_LEVEL=9
+export TF_FORCE_GPU_ALLOW_GROWTH=true
+export TF_GPU_ALLOCATOR=cuda_malloc_async
+export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
+export NVIDIA_TF32_OVERRIDE=1
+export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
+export GLOG_minloglevel=9
+export GRPC_VERBOSITY=ERROR
+
+echo " Loading Tensorflow"
+
+# Replace spaces and shell/filesystem-unfriendly characters with "_".
+wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
+OUTPUT_DIR="${DATA_DIR}/output/$(date +'%Y-%m-%d-%H-%M-%S')-${wake_word_filename}-${SAMPLES}-${TRAINING_STEPS}"
+mkdir -p "${OUTPUT_DIR}/logs" || :
+
+# The Python program is read from the here-document on stdin ("python -").
+# The delimiter is quoted because the program needs no shell expansion.
+# The complete output is saved by tee; only a filtered subset of the
+# INFO:absl: lines reaches the terminal.
+python - \
+  --training_config="${WORK_DIR}/trained_models/training_parameters.yaml" \
+  --train 1 \
+  --restore_checkpoint 1 \
+  --test_tf_nonstreaming 0 \
+  --test_tflite_nonstreaming 0 \
+  --test_tflite_nonstreaming_quantized 0 \
+  --test_tflite_streaming 0 \
+  --test_tflite_streaming_quantized 1 \
+  --use_weights "best_weights" \
+  mixednet \
+  --pointwise_filters "64,64,64,64" \
+  --repeat_in_block "1,1,1,1" \
+  --mixconv_kernel_sizes "[5], [7,11], [9,15], [23]" \
+  --residual_connection "0,0,0,0" \
+  --first_conv_filters 32 \
+  --first_conv_kernel_size 5 \
+  --stride 2 <<'EOF' 2>&1 | tr '\r' '\n' | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" |\
+  tee "${OUTPUT_DIR}/logs/training.log" | sed -r -e '/^INFO:absl:/!d' \
+  -r -e "/None|Sharding|unsupported characters|AUC|fingerprint/d" \
+  -r -e 's/INFO:absl:/ /g' \
+  -r -e "s/, (recall =|estimated false|average viable recall)/,\n \1/g"
+
+import sys, os, gc
+import runpy
+import yaml
+print(" Loading Tensorflow")
+import tensorflow as tf
+
+print(" GPU memory config")
+# Per-device memory growth (belt + suspenders)
+for g in tf.config.list_physical_devices("GPU"):
+    try:
+        tf.config.experimental.set_memory_growth(g, True)
+    except Exception as exc:
+        # Best effort only: note the failure in the log and carry on.
+        print(f" set_memory_growth failed for {g}: {exc}")
+print(f"INFO:absl:GPUs: {tf.config.list_physical_devices('GPU')}")
+gc.collect()
+
+print()
+try:
+    runpy.run_module("microwakeword.model_train_eval", run_name="__main__", alter_sys=True)
+except Exception as e:
+    print(e, file=sys.stderr)
+    sys.exit(1)
+EOF
+
+source_path="${WORK_DIR}/trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite"
+
+# The pipeline's exit status is that of the last sed, so the presence of the
+# model file is what tells us whether training succeeded.
+if [ ! -f "${source_path}" ] ; then
+  echo "Output model not found! Training didn't complete successfully. See ${OUTPUT_DIR}/logs/training.log" >&2
+  exit 1
+fi
+
+cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/"
+cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/"
+cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/"
+
+echo -e "\n Training complete!"
+echo " Full log: ${OUTPUT_DIR}/logs/training.log"
+
+tflite_filename="${wake_word_filename}.tflite"
+tflite_path="${OUTPUT_DIR}/${tflite_filename}"
+
+cp "${source_path}" "${tflite_path}"
+
+# --- Write JSON metadata file with matching model name ---
+json_path="${OUTPUT_DIR}/${wake_word_filename}.json"
+# Escape backslashes and double quotes so the title is a valid JSON string.
+json_title=${WAKE_WORD_TITLE//\\/\\\\}
+json_title=${json_title//\"/\\\"}
+cat <<-EOF > "${json_path}"
+{
+  "type": "micro",
+  "wake_word": "${json_title}",
+  "author": "Tater Totterson",
+  "website": "https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git",
+  "model": "${tflite_filename}",
+  "trained_languages": ["en"],
+  "version": 2,
+  "micro": {
+    "probability_cutoff": 0.97,
+    "sliding_window_size": 5,
+    "feature_step_size": 10,
+    "tensor_arena_size": 30000,
+    "minimum_esphome_version": "2024.7.0"
+  }
+}
+EOF
+
+echo "Name: ${WAKE_WORD_TITLE}"
+echo "Model: ${tflite_path}"
+echo "Metadata: ${json_path}"
+echo
+END_TS=$EPOCHSECONDS
+print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
+echo
+