Train from the command line

The files in the `cli` directory allow you to train wake words
from the command line without needing to use the Jupyter notebook
or a web browser.  The logic from the notebook has been
placed in separate shell scripts and Python files wrapped by three high-level
scripts that do the following:

* setup_python_venv: Creates a Python virtual environment with all the
packages needed to train.  The venv is created in the container's /data
directory and is therefore stored on the host, not in the container's root
docker volume.

* setup_training_datasets: Downloads, extracts and converts the MIT RIR,
FMA, Audioset and Negative training reference datasets.  Also stored in /data.

* train_wake_word: Generates the wake word samples, augments them with the
audio from the training datasets, and finally runs the microwakeword training.
The resulting model tflite and json files are placed in the /data/output
directory.

See the README.md file for much more information.
This commit is contained in:
George Joseph
2025-12-27 12:32:06 -07:00
parent 4dd7503248
commit cb81f7f02d
21 changed files with 2468 additions and 0 deletions

85
cli/setup_negative_datasets Executable file
View File

@@ -0,0 +1,85 @@
#!/bin/bash
#
# setup_negative_datasets
#
# Downloads and extracts the negative training datasets (dinner_party,
# dinner_party_eval, no_speech, speech) into
# ${DATA_DIR}/training_datasets/negative_datasets.
#
# A dataset is considered valid, and is skipped, when the number of
# '*.ninja' files found under its directory is non-zero and matches the
# count recorded in ./downloads/negative_filecount by a previous run.
#
# HELP, DATA_DIR, CLEANUP_ARCHIVES, get_filecounts and write_filecounts are
# not defined in this script; presumably they come from the sourced
# shell.functions file (verify there).
set -euo pipefail

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")
source "${PROGDIR}/shell.functions"

if [ "${HELP}" == "true" ] ; then
    cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --data-dir=<data_dir> ]
--cleanup-archives : Automatically clean up any downloaded archives after
extraction.
<data_dir> : Path to the data directory.
: Default: ${DATA_DIR}
EOF
    exit 1
fi

mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
cd "${DATA_DIR}/training_datasets"
mkdir -p ./negative_datasets || :

NEGATIVE_DATASET_URL="https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main"
declare -a NEGATIVE_DATASETS=( dinner_party dinner_party_eval no_speech speech )
AUDIO_FILECOUNT="./downloads/negative_filecount"

# Expected per-archive file counts; the zero defaults are handed to
# get_filecounts together with the filecount file path.
declare -A filecounts=( [dinner_party.zip]=0 [dinner_party_eval.zip]=0 [no_speech.zip]=0 [speech.zip]=0 )
get_filecounts filecounts "${AUDIO_FILECOUNT}"

echo "===== Checking negative datasets: ${NEGATIVE_DATASETS[*]} ====="
write_filecount=false

for ds in "${NEGATIVE_DATASETS[@]}" ; do
    AUDIO_ZIPFILE="${ds}.zip"
    AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
    AUDIO_ZIP_PARTIAL="${AUDIO_ZIP}.part"
    AUDIO_DIR="./negative_datasets/${ds}"
    mkdir -p "${AUDIO_DIR}" || :

    expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
    actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || :

    # A count of zero never counts as valid, so an empty or missing dataset
    # is always (re)extracted.
    if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
        echo " Existing ${ds} valid"
        continue
    fi

    if [ ! -f "${AUDIO_ZIP}" ] ; then
        echo " Downloading ${AUDIO_ZIPFILE}"
        # Download to a temporary name and rename only on success.  Writing
        # straight to ${AUDIO_ZIP} would leave a truncated archive behind if
        # the transfer is interrupted, and the '-f' test above would then
        # skip the download on every later run.
        rm -f "${AUDIO_ZIP_PARTIAL}"
        if ! curl -sfL "${NEGATIVE_DATASET_URL}/${AUDIO_ZIPFILE}" -o "${AUDIO_ZIP_PARTIAL}" ; then
            rm -f "${AUDIO_ZIP_PARTIAL}"
            echo " ERROR: Failed to download ${NEGATIVE_DATASET_URL}/${AUDIO_ZIPFILE}" >&2
            exit 1
        fi
        mv -f "${AUDIO_ZIP_PARTIAL}" "${AUDIO_ZIP}"
    fi

    # Start from a clean directory so stale files cannot skew the count.
    rm -rf "${AUDIO_DIR}" || :
    echo " Unzipping ${AUDIO_ZIPFILE}"
    if ! unzip -q -d "./negative_datasets" "${AUDIO_ZIP}" ; then
        echo " ERROR: Failed to extract ${AUDIO_ZIP}." >&2
        echo " If the archive is corrupt, delete it and run $0 again." >&2
        exit 1
    fi

    # Record what was actually extracted as the new expected count.
    actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || :
    filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
    write_filecount=true

    # Remove the archive right away so disk space is freed before the next
    # dataset is downloaded.
    if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
        echo " Cleaning up ${AUDIO_ZIPFILE}"
        rm -f "${AUDIO_ZIP}"
    fi
done

if ${write_filecount} ; then
    write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi

# Second pass: archives belonging to datasets that were already valid were
# skipped by the 'continue' above and still need to be removed.
if "${CLEANUP_ARCHIVES}" ; then
    for ds in "${NEGATIVE_DATASETS[@]}" ; do
        AUDIO_ZIPFILE="${ds}.zip"
        AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
        if [ -f "${AUDIO_ZIP}" ] ; then
            echo " Cleaning up ${AUDIO_ZIPFILE}"
            rm -f "${AUDIO_ZIP}"
        fi
    done
fi

echo " Negative datasets complete"