Files
microWakeWord-Trainer-Nvidi…/cli/setup_negative_datasets
George Joseph cb81f7f02d Train from the command line
The files in the `cli` directory allow you to train wake words
from the command line without needing to use the Jupyter notebook
or a web browser.  The logic from the notebook has been split into
separate shell scripts and Python files, which are wrapped by three
high-level scripts that do the following:

* setup_python_venv: Creates a Python virtual environment with all the
packages needed to train.  The venv is created in the container's /data
directory and is therefore stored on the host, not in the container's root
docker volume.

* setup_training_datasets: Downloads, extracts and converts the MIT RIR,
FMA, Audioset and Negative training reference datasets.  Also stored in /data.

* train_wake_word: Generates the wake word samples, augments them with the
audio from the training datasets, and finally runs the microwakeword training.
The resulting model tflite and json files are placed in the /data/output
directory.

See the README.md file for much more information.
2025-12-28 12:48:51 -07:00

86 lines
2.6 KiB
Bash
Executable File

#!/bin/bash
# Prepare the negative training datasets under <data_dir>/training_datasets.
#
# Usage: setup_negative_datasets [ --cleanup-archives ] [ --data-dir=<data_dir> ]
set -euo pipefail

# Locate the directory containing this script so the shared helper file that
# lives next to it can be sourced regardless of the caller's working directory.
PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# NOTE(review): HELP, DATA_DIR and CLEANUP_ARCHIVES are read by this script but
# never assigned in it; presumably shell.functions parses the command line and
# sets them -- confirm against shell.functions.
source "${PROGDIR}/shell.functions"

# Print the usage text on stderr and stop (with a non-zero status) when help
# was requested.
if [[ "${HELP}" == "true" ]] ; then
  cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --data-dir=<data_dir> ]
--cleanup-archives : Automatically clean up any downloaded archives after
extraction.
<data_dir> : Path to the data directory.
: Default: ${DATA_DIR}
EOF
  exit 1
fi
# Create the working tree (best effort) and make training_datasets the current
# directory; every path used below is relative to it.
mkdir -p "${DATA_DIR}/training_datasets/downloads" || true
cd "${DATA_DIR}/training_datasets"
mkdir -p ./negative_datasets || true

# Base URL the dataset archives are fetched from, and the dataset names
# (each one is downloaded as "<name>.zip").
NEGATIVE_DATASET_URL="https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main"
declare -a NEGATIVE_DATASETS=(
  dinner_party
  dinner_party_eval
  no_speech
  speech
)

# Per-archive file counts, keyed by archive file name and defaulting to 0.
# NOTE(review): get_filecounts is not defined in this file; presumably it
# fills the array from the counts previously saved in AUDIO_FILECOUNT --
# confirm against shell.functions.
AUDIO_FILECOUNT="./downloads/negative_filecount"
declare -A filecounts=(
  [dinner_party.zip]=0
  [dinner_party_eval.zip]=0
  [no_speech.zip]=0
  [speech.zip]=0
)
get_filecounts filecounts "${AUDIO_FILECOUNT}"

echo "===== Checking negative datasets: ${NEGATIVE_DATASETS[*]} ====="

# Becomes true once any dataset has been (re)extracted in this run.
write_filecount=false
# For every negative dataset: keep the extracted copy when its file count
# matches the recorded one; otherwise download the archive if it is not
# already present, then extract it from scratch and record the new count.
for ds in "${NEGATIVE_DATASETS[@]}" ; do
  AUDIO_ZIPFILE="${ds}.zip"
  AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
  AUDIO_DIR="./negative_datasets/${ds}"
  mkdir -p "${AUDIO_DIR}" || :

  # The extracted dataset counts as valid when the number of *.ninja files on
  # disk is non-zero and equals the recorded count.  The trailing "|| :" keeps
  # a failing find from aborting the script.
  expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
  actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || :
  if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
    echo " Existing ${ds} valid"
    continue
  fi

  if [ ! -f "${AUDIO_ZIP}" ] ; then
    echo " Downloading ${AUDIO_ZIPFILE}"
    # Download under a temporary name and rename only after curl succeeds.
    # Writing straight to ${AUDIO_ZIP} would leave a truncated archive behind
    # when the transfer is interrupted; the existence check above would then
    # treat it as a finished download and every later run would fail in unzip.
    AUDIO_ZIP_PARTIAL="${AUDIO_ZIP}.part"
    rm -f "${AUDIO_ZIP_PARTIAL}"
    curl_status=0
    curl -sfL "${NEGATIVE_DATASET_URL}/${ds}.zip" -o "${AUDIO_ZIP_PARTIAL}" || curl_status=$?
    if [ "${curl_status}" -ne 0 ] ; then
      rm -f "${AUDIO_ZIP_PARTIAL}"
      echo " Download of ${AUDIO_ZIPFILE} failed (curl exit status ${curl_status})" >&2
      # Preserve curl's own exit status, as the script did before.
      exit "${curl_status}"
    fi
    mv -f "${AUDIO_ZIP_PARTIAL}" "${AUDIO_ZIP}"
  fi

  # Always extract into a clean directory so stale files cannot skew the count.
  rm -rf "${AUDIO_DIR}" || :
  echo " Unzipping ${AUDIO_ZIPFILE}"
  unzip -q -d "./negative_datasets" "${AUDIO_ZIP}"

  # Record what was actually extracted; it becomes the expected count for
  # this archive from now on.
  actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || :
  filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
  write_filecount=true

  # Remove the archive right after extraction when cleanup was requested.
  if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
    echo " Cleaning up ${AUDIO_ZIPFILE}"
    rm -rf "${AUDIO_ZIP}"
  fi
done
# Save the file counts only when at least one dataset was (re)extracted.
# NOTE(review): write_filecounts is not defined in this file; presumably it
# stores the array in the AUDIO_FILECOUNT file -- confirm against
# shell.functions.
if ${write_filecount} ; then
  write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi

# Final sweep: when cleanup was requested, delete every dataset archive that
# is still present, including those of datasets that were skipped as valid.
if "${CLEANUP_ARCHIVES}" ; then
  for ds in "${NEGATIVE_DATASETS[@]}" ; do
    AUDIO_ZIPFILE="${ds}.zip"
    AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
    [[ -f "${AUDIO_ZIP}" ]] || continue
    echo " Cleaning up ${AUDIO_ZIPFILE}"
    rm -rf "${AUDIO_ZIP}"
  done
fi

echo " Negative datasets complete"