Files
microWakeWord-Trainer-Nvidi…/cli/setup_wham
2026-03-09 19:48:35 -05:00

143 lines
4.1 KiB
Bash
Executable File

#!/bin/bash
# Set up the WHAM noise dataset: download the archive, extract it, and convert
# the audio to 16 kHz mono WAV files under <data_dir>/training_datasets.
set -euo pipefail

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# NOTE(review): shell.functions is not visible here; it is presumed to parse
# the command line and define HELP, DATA_DIR, CLEANUP_ARCHIVES,
# CLEANUP_INTERMEDIATE_FILES, get_filecounts and write_filecounts, all of which
# this script uses without defining them.
source "${PROGDIR}/shell.functions"

if [ "${HELP}" == "true" ] ; then
  # The synopsis must name the same option that is described below it
  # (--cleanup-intermediate-files, matching CLEANUP_INTERMEDIATE_FILES).
  cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]
  --cleanup-archives
             : Automatically clean up any downloaded archives after
             : extraction.
  --cleanup-intermediate-files
             : Automatically clean up intermediate extracted files
             : after conversion to 16k.
  <data_dir> : Path to the data directory.
             : Default: ${DATA_DIR}
EOF
  exit 1
fi
# Everything below uses paths relative to the training_datasets directory.
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
cd "${DATA_DIR}/training_datasets"

echo "***** Checking WHAM *****"

# Where the archive comes from and where it is stored locally.
AUDIO_URL='https://my-bucket-a8b4b49c25c811ee9a7e8bba05fa24c7.s3.amazonaws.com/wham_noise.zip'
AUDIO_ZIPFILE='wham_noise.zip'
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"

# Extraction directory (original audio) and conversion directory (16 kHz audio).
AUDIO_DIR='./wham'
AUDIO16K_DIR='./wham_16k'
mkdir -p "${AUDIO_DIR}" "${AUDIO16K_DIR}" || :

# File holding the recorded file count, and the pattern used to count inputs.
AUDIO_FILECOUNT='./downloads/wham_filecount'
AUDIO_IN_GLOB='*.wav'

# Per-archive file counts, keyed by archive name and starting at 0.
# get_filecounts is defined outside this file; presumably it overwrites the
# entries with values previously saved in AUDIO_FILECOUNT — TODO confirm.
declare -A filecounts=( ["${AUDIO_ZIPFILE}"]=0 )
get_filecounts filecounts "${AUDIO_FILECOUNT}"
#######################################
# Convert every WAV file found under AUDIO_DIR into a 16 kHz mono 16-bit WAV
# in AUDIO16K_DIR, using the Python environment in DATA_DIR/.venv.
# Globals:   DATA_DIR, AUDIO_DIR, AUDIO16K_DIR (read)
# Arguments: none
# Outputs:   progress and a summary line on stdout; a list of files that
#            failed to convert in AUDIO16K_DIR/wham_corrupted_files.log
# Returns:   exit status of the Python interpreter
#######################################
converter() {
  # Activating the venv changes the environment of the calling shell too.
  source "${DATA_DIR}/.venv/bin/activate"
  # The heredoc delimiter is quoted so the shell never expands anything inside
  # the Python source; the terminator must therefore stay at column 0.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import os, sys
from pathlib import Path
import numpy as np
import scipy.io.wavfile
import librosa
from tqdm import tqdm


def write_wav(dst: Path, data: np.ndarray, sr: int):
    """Write float samples to dst as 16-bit PCM.

    The data is written to a sibling ".part" file and renamed into place, so
    an interrupted run never leaves a truncated file under the final name
    (which the exists() check below would then skip on every later run).
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    x = np.clip(data, -1.0, 1.0)
    tmp = dst.with_name(dst.name + ".part")
    try:
        scipy.io.wavfile.write(tmp, sr, (x * 32767).astype(np.int16))
        os.replace(tmp, dst)
    except BaseException:
        # Also covers Ctrl-C: drop the partial file, then re-raise.
        if tmp.exists():
            tmp.unlink()
        raise


def flatten_name(root: Path, src: Path) -> str:
    """Join the path of src relative to root into a single file name."""
    rel = src.relative_to(root)
    return "__".join(rel.parts)


wham_in = Path(sys.argv[1]).resolve()
wham_out = Path(sys.argv[2]).resolve()

wavs = list(wham_in.rglob("*.wav"))
print(f" WAV files: {len(wavs)}")
print(" Converting WHAM -> 16k mono WAV")

bad = []
ok = 0
skipped = 0
for p in tqdm(wavs, desc=" WHAM -> WAV (resample 16k mono)"):
    try:
        out_name = flatten_name(wham_in, p)
        outfile = wham_out / out_name
        # Files converted by an earlier run are not redone.
        if outfile.exists():
            skipped += 1
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # A bad input is recorded and does not stop the remaining files.
        bad.append(f"{p}:{e}")

if bad:
    (wham_out / "wham_corrupted_files.log").write_text("\n".join(bad))
print(f" WHAM complete ({ok} ok, {skipped} skipped, {len(bad)} failed)")
EOF
}
#######################################
# Delete the downloaded archive when archive cleanup was requested.
# Globals:   CLEANUP_ARCHIVES, AUDIO_ZIP, AUDIO_ZIPFILE (read)
# Arguments: none
#######################################
cleanup_archive() {
  # CLEANUP_ARCHIVES is executed as a command, so it must hold "true" or "false".
  if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
    echo " Cleaning up ${AUDIO_ZIPFILE}"
    rm -rf "${AUDIO_ZIP}"
  fi
}

# Count recorded for this archive; default to 0 so a missing entry does not
# abort the script under "set -u".
expected_filecount=${filecounts[${AUDIO_ZIPFILE}]:-0}
# "|| :" keeps a failing find (e.g. missing directory) from aborting the script.
actual_filecount=$(find "${AUDIO16K_DIR}" -name '*.wav' 2>/dev/null | wc -l) || :
write_filecount=false

if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
  # The converted directory already holds the recorded number of files.
  echo " Existing ${AUDIO16K_DIR} valid"
else
  # Re-extract unless the extracted directory already holds the recorded count.
  actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
  if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
    if [ ! -f "${AUDIO_ZIP}" ] ; then
      echo " Downloading ${AUDIO_ZIPFILE}"
      # Download to a temporary name and rename only on success. Otherwise a
      # failed transfer would leave a partial archive that later runs treat
      # as already downloaded and then fail to unzip.
      rm -f "${AUDIO_ZIP}.part"
      curl -sfL "${AUDIO_URL}" -o "${AUDIO_ZIP}.part" || {
        rc=$?
        rm -f "${AUDIO_ZIP}.part"
        echo " ERROR: download of ${AUDIO_ZIPFILE} failed (curl exit ${rc})" >&2
        exit "${rc}"
      }
      mv "${AUDIO_ZIP}.part" "${AUDIO_ZIP}"
    fi
    # Start from an empty directory so stale files cannot skew the counts.
    rm -rf "${AUDIO_DIR:?}" || :
    mkdir -p "${AUDIO_DIR}" || :
    echo " Unzipping ${AUDIO_ZIPFILE}"
    unzip -q -d "${AUDIO_DIR}" "${AUDIO_ZIP}"
  fi
  # The archive is no longer needed once extracted; remove it before converting.
  cleanup_archive
  converter
  # Record how many converted files now exist so the next run can validate them.
  actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
  filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
  write_filecount=true
fi

if ${write_filecount} ; then
  write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi

# Also covers the case where the converted data was already valid but an
# archive from an earlier run is still present.
cleanup_archive

if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
  echo " Cleaning up ${AUDIO_DIR}"
  rm -rf "${AUDIO_DIR:?}"
fi

echo " WHAM complete"
exit 0