Train from the command line

The files in the `cli` directory allow you to train wake words
from the command line without needing to use the Jupyter notebook
or a web browser.  Basically, the logic from the notebook has been
placed in separate shell scripts and python files wrapped by 3 high-level
scripts that do the following:

* setup_python_venv: Creates a Python virtual environment with all the
packages needed to train.  The venv is created in the container's /data
directory and is therefore stored on the host, not in the container's root
docker volume.

* setup_training_datasets: Downloads, extracts, and converts the MIT RIR,
FMA, AudioSet and Negative training reference datasets.  These are also stored in /data.

* train_wake_word: Generates the wake word samples, augments them with the
audio from the training datasets, and finally runs the microWakeWord training.
The resulting model .tflite and .json files are placed in the /data/output
directory.

See the README.md file for much more information.
This commit is contained in:
George Joseph
2025-12-27 12:32:06 -07:00
parent 4dd7503248
commit cb81f7f02d
21 changed files with 2468 additions and 0 deletions

150
cli/shell.functions Normal file
View File

@@ -0,0 +1,150 @@
# Guard against direct execution: this file is meant to be loaded with
# 'source'.  When a file is executed directly, $0 and BASH_SOURCE[0] name the
# same file; when it is sourced from another script they differ.
if [ "$0" == "${BASH_SOURCE[0]}" ] ; then
echo "${BASH_SOURCE[0]} is meant to be 'sourced' not run directly" >&2
# 'exit' (not 'return') is correct here: this branch only runs when the file
# is the top-level script.
exit 1
fi
# Choose the data directory unless the caller already defined DATA_DIR
# (even as an empty string).  A '.mww-data-dir' marker file in the current
# directory selects the current directory; otherwise /data is used.
if [ ! -v DATA_DIR ] ; then
	if [ -f .mww-data-dir ] ; then
		DATA_DIR="${PWD}"
	else
		DATA_DIR="/data"
	fi
fi

# Built-in defaults.  They are assigned before '.defaults.env' is read so
# that file is able to replace them.
DEFAULT_SAMPLES=20000
DEFAULT_BATCH_SIZE=100
DEFAULT_TRAINING_STEPS=25000

# Optional overrides stored alongside the data; a failure while sourcing
# the file is deliberately ignored.
if [ -f "${DATA_DIR}/.defaults.env" ] ; then
	source "${DATA_DIR}/.defaults.env" || :
fi

# Settings that are unset or empty at this point fall back to their defaults;
# non-empty values (environment or .defaults.env) are left untouched.
SAMPLES="${SAMPLES:-${DEFAULT_SAMPLES}}"
BATCH_SIZE="${BATCH_SIZE:-${DEFAULT_BATCH_SIZE}}"
TRAINING_STEPS="${TRAINING_STEPS:-${DEFAULT_TRAINING_STEPS}}"
CLEANUP_WORK_DIR="${CLEANUP_WORK_DIR:-false}"
CLEANUP_ARCHIVES="${CLEANUP_ARCHIVES:-false}"
CLEANUP_INTERMEDIATE_FILES="${CLEANUP_INTERMEDIATE_FILES:-false}"
QUIET="${QUIET:-false}"
VERBOSE="${VERBOSE:-false}"
# ---------------------------------------------------------------------------
# Command line parsing.
#
# Walks the positional parameters visible to this file ("$@") and records:
#   HELP / QUIET / VERBOSE  set to 'true' by -h|--help, -q|--quiet,
#                           -v|--verbose.
#   --name=value            assigns 'value' to the variable NAME (dashes
#                           become underscores, letters are upper-cased).
#   --no-name               assigns 'false' to NAME.
#   --name                  assigns 'true' to NAME.
#   POSITIONAL_ARGS         every other argument seen before '--'.
#   EXTRA_ARGS              every argument after a literal '--'.
#   UNKNOWN_ARGS            options whose name is not listed in KNOWN_ARGS
#                           (only checked when the caller defined KNOWN_ARGS
#                           before sourcing this file).
#   OPTION_COUNT            number of arguments processed before '--'
#                           (positional arguments are counted as well).
#
# The loop variables (a, l, var, known, _arg, _k) are left as globals because
# this code runs at file scope.
# ---------------------------------------------------------------------------
HELP=false
if [ -v KNOWN_ARGS ] ; then
	# The options handled here are always considered known.
	KNOWN_ARGS+=( help verbose quiet h v q )
fi
declare -gi OPTION_COUNT=0
declare -ga POSITIONAL_ARGS=()
declare -ga EXTRA_ARGS=()
declare -ga UNKNOWN_ARGS=()
declare -i __stop_parsing=0
for a in "$@"; do
	if [ "$a" == "--" ] ; then
		# Everything after '--' is collected verbatim into EXTRA_ARGS.
		__stop_parsing=1
		# NOTE(review): 'shift' drops the FIRST positional parameter, not the
		# one being processed, and it does not affect this loop's word list.
		# Kept as-is for compatibility - confirm whether callers rely on it.
		shift
		continue
	fi
	if [ $__stop_parsing == 1 ] ; then
		EXTRA_ARGS+=( "$a" )
		shift
		continue
	fi
	# Validate the option name (text between the leading dash(es) and an
	# optional '=') against KNOWN_ARGS when the caller supplied that list.
	if [ -v KNOWN_ARGS ] && [[ "${a}" =~ ^--?([^=]+)=?.* ]] ; then
		_arg=${BASH_REMATCH[1]}
		known=false
		for _k in "${KNOWN_ARGS[@]}" ; do
			[ "${_arg}" == "${_k}" ] && { known=true ; break ; } || :
		done
		$known || UNKNOWN_ARGS+=( "${a}" )
	fi
	OPTION_COUNT+=1
	case "$a" in
		-h | --help)
			HELP=true
			# 'break' leaves the enclosing 'for' loop (it is not a case
			# terminator): once help is requested the remaining arguments
			# are intentionally not processed.
			break
			;;
		-q | --quiet)
			# No 'break' here: it would abort the whole loop and silently
			# discard every argument that follows -q/--quiet.
			QUIET=true
			;;
		-v | --verbose)
			# No 'break' here either, for the same reason as above.
			VERBOSE=true
			;;
		--*=*)
			# --some-name=value  ->  SOME_NAME=value
			[[ $a =~ --([^=]+)=(.*) ]]
			l=${BASH_REMATCH[1]//-/_}
			declare -n var="${l^^}"
			var="${BASH_REMATCH[2]}"
			;;
		--no-*)
			# --no-some-name  ->  SOME_NAME=false
			[[ $a =~ --no-(.+) ]]
			l=${BASH_REMATCH[1]//-/_}
			declare -n var="${l^^}"
			var=false
			;;
		--*)
			# --some-name  ->  SOME_NAME=true
			[[ $a =~ --(.+) ]]
			l=${BASH_REMATCH[1]//-/_}
			declare -n var="${l^^}"
			var=true
			;;
		*)
			POSITIONAL_ARGS+=( "$a" )
			;;
	esac
done
#######################################
# Print "<message> Elapsed time: <duration>" right-justified in 80 columns,
# optionally framed by two lines of 80 '=' characters.
# Arguments:
#   --no-separators  (optional, must be first) omit the '=' separator lines
#   $1 - start timestamp, seconds since the epoch
#   $2 - end timestamp, seconds since the epoch
#   $3 - optional text placed in front of "Elapsed time:"
# Outputs:
#   Writes the formatted line(s) to stdout.
# Returns:
#   The exit status of the 'python' interpreter.  A missing timestamp
#   triggers the ${var:?} usage error.
#######################################
print_elapsed_time() {
	local print_seps=True
	if [ "${1:-}" == "--no-separators" ] ; then
		shift
		print_seps=False
	fi
	local START_TS=${1:?"Usage: ${FUNCNAME[0]} [--no-separators] <start_timestamp> <end_timestamp> [message]"}
	local END_TS=${2:?"Usage: ${FUNCNAME[0]} [--no-separators] <start_timestamp> <end_timestamp> [message]"}
	local message="${3:-}"
	# The values are handed to Python as arguments rather than being spliced
	# into the program text, so quotes, braces or backslashes in the message
	# cannot break (or inject code into) the script.  The quoted 'EOF' keeps
	# the shell from expanding anything inside the here-document.
	python - "${START_TS}" "${END_TS}" "${message}" "${print_seps}" <<'EOF'
import sys
from datetime import datetime

start_ts, end_ts, message, print_seps = sys.argv[1:5]
# int(float(...)) accepts both "1700000000" and "1700000000.25".
st = datetime.fromtimestamp(int(float(start_ts)))
et = datetime.fromtimestamp(int(float(end_ts)))
msg = f"{message} Elapsed time: {et - st!s}"
if print_seps == "True":
    print("=" * 80)
print(f"{msg:>80s}")
if print_seps == "True":
    print("=" * 80)
EOF
}
#######################################
# Print a string centered within a field of the given width by
# right-justifying it in a field of (string length + width) / 2 columns.
# Arguments:
#   $1 - text to print (must be non-empty)
#   $2 - total width to center within
# Outputs:
#   Writes the padded text followed by a newline to stdout.
#######################################
justify_text() {
	# 'local' keeps the caller's own 'msg' / 'len' variables intact.
	local msg="${1:?Need a string}"
	local len="${2:?Need a length}"
	printf '%*s\n' "$(( (${#msg} + len) / 2 ))" "${msg}"
}
#######################################
# Load "name:count" lines from a file into an array owned by the caller.
# Lines that do not match ^([^:]+):([0-9-]+)$ are ignored.  A missing file
# leaves the array untouched.
# Arguments:
#   $1 - NAME of the caller's (associative) array to fill
#   $2 - path of the file to read
# Returns:
#   0 always.
#######################################
get_filecounts() {
	# 'local -n' creates a function-local name reference.  (The 'l' in
	# 'declare -l' is the lowercase attribute, not "local".)
	local -n __fc_counts=${1}
	local af=${2}
	local -a __fc_lines=()
	local __fc_line

	[ -f "${af}" ] || return 0
	# Best effort: an unreadable file is treated like an empty one.
	mapfile -t __fc_lines < "${af}" || return 0
	for __fc_line in "${__fc_lines[@]}" ; do
		if [[ "${__fc_line}" =~ ^([^:]+):([0-9-]+)$ ]] ; then
			__fc_counts["${BASH_REMATCH[1]}"]=${BASH_REMATCH[2]}
		fi
	done
	return 0
}
#######################################
# Print the sum of all values stored in an array owned by the caller.
# Arguments:
#   $1 - NAME of the caller's array holding integer counts
# Outputs:
#   Writes the total to stdout (0 for an empty array).
#######################################
get_total_filecount() {
	# 'local -n' / 'local -i': function-local nameref and integer.  (The 'l'
	# in 'declare -l' is the lowercase attribute, not "local".)
	local -n __fc_counts=${1}
	local -i total=0
	local __fc_value
	for __fc_value in "${__fc_counts[@]}" ; do
		# With the integer attribute '+=' performs arithmetic addition.
		total+=${__fc_value}
	done
	echo "${total}"
}
#######################################
# Replace a file with one "name:count" line per entry of an array owned by
# the caller.  Any existing file at the path is removed first; when the array
# is empty no file is created.
# Arguments:
#   $1 - NAME of the caller's (associative) array
#   $2 - path of the file to write
#######################################
write_filecounts() {
	# 'local -n' creates a function-local name reference.  (The 'l' in
	# 'declare -l' is the lowercase attribute, not "local".)
	local -n __fc_counts=${1}
	local af=${2}
	local __fc_key
	# '--' protects against a path that starts with a dash.
	rm -rf -- "${af}" || :
	for __fc_key in "${!__fc_counts[@]}" ; do
		printf '%s:%s\n' "${__fc_key}" "${__fc_counts[${__fc_key}]}" >> "${af}"
	done
}