mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
The files in the `cli` directory allow you to train wake words from the command line without needing to use the Jupyter notebook or a web browser. Basically, the logic from the notebook has been placed in separate shell scripts and python files wrapped by 3 high-level scripts that do the following: * setup_python_venv: Creates a Python virtual environment with all the packages needed to train. The venv is created in the container's /data directory and is therefore stored on the host, not in the container's root docker volume. * setup_training_datasets: Downloads, extracts and converts the MIT RIR, FMA, Audioset and Negative training reference datasets. Also stored in /data. * train_wake_word: Generates the wake word samples, augments them with the audio from the training datasets, and finally runs the microwakeword training. The resulting model tflite and json files are placed in the /data/output directory. See the README.md file for much more information.
176 lines
5.3 KiB
Bash
Executable File
176 lines
5.3 KiB
Bash
Executable File
#!/bin/bash
#
# Fetch the AudioSet "bal_train" tarballs, unpack their FLAC files and convert
# them to 16 kHz mono WAV files below <data_dir>/training_datasets.

# Abort on any error, on use of an unset variable and on a failure anywhere
# inside a pipeline.
set -euo pipefail

# Resolve the directory this script lives in so the helper library can be
# sourced regardless of the caller's working directory.
PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# HELP and DATA_DIR (used below) are not assigned anywhere in this script, so
# they are expected to come from shell.functions -- presumably from its
# command-line parsing; confirm there.
source "${PROGDIR}/shell.functions"

if [ "${HELP}" == "true" ] ; then
    # The usage text goes to stderr and the script exits non-zero.
    cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]

  --cleanup-archives : Automatically clean up any downloaded archives after
                       extraction.
  --cleanup-intermediate-files
                     : Automatically clean up the intermediate files after they've
                     : been converted to 16k.
  <data_dir>         : Path to the data directory.
                     : Default: ${DATA_DIR}

EOF
    exit 1
fi

# Work from inside <data_dir>/training_datasets so that every path used from
# here on can stay relative. A failing mkdir is tolerated; the cd right after
# it still stops the script if the directory is really missing.
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
cd "${DATA_DIR}/training_datasets"

echo "***** Checking audioset *****"

# Remote location and local bookkeeping for the dataset.
AUDIO_URL="https://huggingface.co/datasets/agkphysics/AudioSet/resolve"
AUDIO_FILECOUNT="./downloads/audioset_filecount"
AUDIO_IN_GLOB="*.flac"

# Extracted FLAC files land in AUDIO_DIR, converted WAV files in AUDIO16K_DIR.
AUDIO_DIR="./audioset"
AUDIO16K_DIR="./audioset_16k"
mkdir -p "${AUDIO_DIR}" "${AUDIO16K_DIR}"

# Per-tarball entry counts, keyed by tarball name and initialised to zero.
declare -A filecounts
for fname in bal_train0{0..9}.tar ; do
    filecounts[${fname}]=0
done

# get_filecounts comes from shell.functions; it presumably overlays counts
# recorded by an earlier run from AUDIO_FILECOUNT -- confirm there.
get_filecounts filecounts "${AUDIO_FILECOUNT}"

# Dataset revisions to probe, in the order they are tried: three pinned commit
# hashes first, then the moving "main" branch.
REV_CANDIDATES=()
REV_CANDIDATES+=( '6762f044d1c88619c7f2006486036192128fb07e' )
REV_CANDIDATES+=( '0049167e89f259a010c3f070fe3666d9e5242836' )
REV_CANDIDATES+=( 'ceb9eaaa7844c9ad7351e659c84a572e376ad06d' )
REV_CANDIDATES+=( 'main' )

# Path prefixes under which the tarballs may live inside a revision. The
# tarball index digit and ".tar" are appended to the prefix by the users of
# this list.
TAR_PATTERNS=()
TAR_PATTERNS+=( 'data/bal_train0' )
TAR_PATTERNS+=( 'data/bal_train/bal_train0' )

# find_rev
#
# Probe every revision / path-prefix combination with an HTTP HEAD request for
# the first tarball ("<prefix>0.tar") and print the FIRST combination that
# exists as "<rev>,<pattern>" on stdout. When nothing matches, an empty line
# is printed instead. The function always returns 0.
#
# Stopping at the first hit is deliberate: the caller splits the output with
# ${dl%%,*} and ${dl##*,}. If several "<rev>,<pattern>" lines were printed,
# that would pair the revision of the first match with the pattern of the
# last one and produce a download URL that may not exist.
find_rev() {
    local rev pattern url
    for rev in "${REV_CANDIDATES[@]}" ; do
        for pattern in "${TAR_PATTERNS[@]}" ; do
            url="${AUDIO_URL}/${rev}/${pattern}0.tar"
            # -I: HEAD only, -L: follow redirects, --fail: non-zero on HTTP errors.
            if curl -I -L --fail -s "${url}" > /dev/null ; then
                echo "${rev},${pattern}"
                return 0
            fi
        done
    done
    echo ""
}

# converter
#
# Activate the Python venv under DATA_DIR and convert every *.flac found
# (recursively) below AUDIO_DIR into a 16 kHz mono 16-bit WAV file in
# AUDIO16K_DIR. Files whose WAV already exists are skipped. Files that cannot
# be converted are listed, one per line, in
# AUDIO16K_DIR/audioset_corrupted_files.log; the log is only written when at
# least one file failed.
converter() {
    source "${DATA_DIR}/.venv/bin/activate"
    # The heredoc delimiter is quoted so the shell passes the Python text
    # through verbatim (no parameter or backslash processing).
    python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import sys
from pathlib import Path

import librosa
import numpy as np
import scipy.io.wavfile
import soundfile as sf  # not referenced directly; kept from the original import list
from tqdm import tqdm


def write_wav(dst: Path, data: np.ndarray, sr: int) -> None:
    """Clip *data* to [-1, 1] and write it to *dst* as 16-bit PCM WAV."""
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))


audioset_dir = Path(sys.argv[1])
audioset_out = Path(sys.argv[2])

# convert FLAC → 16k mono WAV
flacs = list(audioset_dir.rglob("*.flac"))
print(f" FLAC files: {len(flacs)}")
audioset_bad = []
ok = 0
for p in tqdm(flacs, desc=" AudioSet→WAV (resample 16k mono)"):
    try:
        outfile = audioset_out / (p.stem + ".wav")
        if outfile.exists():
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # Best effort per file: record the failure and carry on. The message
        # is collapsed onto a single line so that the log holds exactly one
        # line per failed file.
        reason = " ".join(str(e).split())
        audioset_bad.append(f"{p}:{reason}")

if audioset_bad:
    # Terminate the last entry with a newline as well; line-counting tools
    # such as "wc -l" count newline characters and would otherwise report one
    # failure too few.
    log_path = audioset_out / "audioset_corrupted_files.log"
    log_path.write_text("\n".join(audioset_bad) + "\n")
print(f" AudioSet complete ({ok} ok, {len(audioset_bad)} failed)")
EOF
}

# ---------------------------------------------------------------------------
# Main flow: decide whether the converted dataset is already complete and, if
# not, download / extract / convert and verify the result.
#
# get_total_filecount and write_filecounts come from shell.functions; the
# total is presumably the sum of all values in the filecounts table -- confirm
# there.
# ---------------------------------------------------------------------------
expected_filecount=$(get_total_filecount filecounts)
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
write_filecount=false

if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
    echo " Existing Audioset valid"
else
    dl=$(find_rev)
    [ -n "$dl" ] || { echo " Could not locate an AudioSet revision with FLAC tarballs still present on HF." ; exit 1 ; }
    # Only the first "<rev>,<pattern>" line is used. Splitting multi-line
    # output directly would combine the revision of the first match with the
    # pattern of the last one.
    dl=${dl%%$'\n'*}
    rev=${dl%%,*}
    pattern=${dl##*,}

    echo " Checking 10 tarballs"
    for i in {0..9} ; do
        fname="downloads/bal_train0${i}.tar"
        if [ ! -f "${fname}" ] ; then
            echo " Downloading bal_train0${i}.tar"
            url="${AUDIO_URL}/${rev}/${pattern}${i}.tar"
            # Download to a temporary name and rename on success, so that an
            # interrupted transfer never leaves a truncated file that a later
            # run would mistake for a complete tarball.
            if ! curl -L -s --fail "${url}" -o "${fname}.part" ; then
                rm -f "${fname}.part"
                echo "Could not fetch ${fname} at rev ${rev}; continuing."
                continue
            fi
            mv "${fname}.part" "${fname}"
        fi

        # Record how many entries the tarball lists; this feeds the expected
        # total checked after conversion.
        tarball_filecount=$(tar -tvf "${fname}" | wc -l )
        filecounts["bal_train0${i}.tar"]=${tarball_filecount}
        write_filecount=true

        echo " Untarring bal_train0${i}.tar"
        tar -xf "${fname}" -C "${AUDIO_DIR}"
        if "${CLEANUP_ARCHIVES}" && [ -f "${fname}" ] ; then
            echo " Cleaning up bal_train0${i}.tar"
            rm -rf "${fname}"
        fi
    done

    # Start from a clean failure log so that only this run's failures count.
    rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
    converter
    if [ -f "${AUDIO16K_DIR}/audioset_corrupted_files.log" ] ; then
        # grep -c '' counts lines even when the last one lacks a trailing
        # newline; "wc -l" would report one failure too few in that case and
        # trigger a spurious mismatch below. "|| :" covers grep's non-zero
        # exit status on an empty file (it still prints 0).
        failed=$(grep -c '' "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :)
        # Failed conversions are entered as a negative count so that they are
        # subtracted from the expected total.
        filecounts[failed]=-${failed}
    fi

    expected_filecount=$(get_total_filecount filecounts)
    actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
    if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
        echo " Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
        exit 1
    fi
fi

# Persist the counts only when at least one tarball was inspected in this run.
if ${write_filecount} ; then
    write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi

# Also runs when the existing dataset was already valid, so tarballs left
# over from an earlier run are removed too.
if "${CLEANUP_ARCHIVES}" ; then
    for i in {0..9} ; do
        fname="downloads/bal_train0${i}.tar"
        if [ -f "${fname}" ] ; then
            echo " Cleaning up bal_train0${i}.tar"
            rm -rf "${fname}"
        fi
    done
fi

# Optionally drop the extracted FLAC tree once the WAV files exist.
if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
    echo " Cleaning up ${AUDIO_DIR}"
    rm -rf "${AUDIO_DIR}"
fi

echo " Audioset complete"
exit 0