mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
183 lines
5.3 KiB
Bash
Executable File
183 lines
5.3 KiB
Bash
Executable File
#!/bin/bash
#
# Downloads the FMA audio archive, extracts its MP3 files and converts them
# to 16 kHz mono WAV files below <data_dir>/training_datasets.
#
# HELP, DATA_DIR, CLEANUP_ARCHIVES and CLEANUP_INTERMEDIATE_FILES are not
# assigned in this file; they are presumably set by shell.functions (sourced
# below) while it parses the command line -- confirm against that file.

set -euo pipefail

# Resolve the directory this script lives in, so that shell.functions is found
# no matter what the caller's working directory is.
PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# shellcheck source=/dev/null
source "${PROGDIR}/shell.functions"

if [ "${HELP}" == "true" ] ; then
  # The usage line names --cleanup-intermediate-files so that it agrees with
  # the option description below and with the CLEANUP_INTERMEDIATE_FILES
  # variable this script tests.
  # NOTE(review): the option itself is parsed outside this file -- confirm the
  # spelling against shell.functions.
  cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]

  --cleanup-archives  : Automatically clean up any downloaded archives after
                        extraction.
  --cleanup-intermediate-files
                      : Automatically clean up the intermediate files after they've
                      : been converted to 16k.
  <data_dir>          : Path to the data directory.
                      : Default: ${DATA_DIR}

EOF
  exit 1
fi

# A mkdir failure is deliberately ignored here: if the directory really is
# missing, the cd below fails and `set -e` aborts the script.
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
cd "${DATA_DIR}/training_datasets"
|
|
|
printf '%s\n' "***** Checking FMA *****"

# Download sources, tried in the order listed by download_with_fallbacks.
AUDIO_URLS=(
  "https://os.unil.cloud.switch.ch/fma/fma_small.zip"
  "https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/fma_xs.zip"
)

# All paths are relative to the current directory
# (<data_dir>/training_datasets at this point of the script).
AUDIO_ZIPFILE="fma_small.zip"                 # archive name, also the filecounts key
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"      # where the archive is stored
AUDIO_DIR="fma"                               # extracted MP3 files
AUDIO16K_DIR="fma_16k"                        # converted 16k WAV files
AUDIO_FILECOUNT="./downloads/fma_filecount"   # file handed to get_filecounts/write_filecounts
AUDIO_IN_GLOB="*.mp3"                         # name pattern of the extracted input files

# Best effort: a failure to create either directory is ignored here.
mkdir -p "${AUDIO_DIR}" "${AUDIO16K_DIR}" || :

# Start from a count of 0 for the archive, then let get_filecounts update the
# array.
# NOTE(review): get_filecounts is defined outside this file; presumably it
# loads previously recorded counts from ${AUDIO_FILECOUNT} -- confirm.
declare -A filecounts=()
filecounts["${AUDIO_ZIPFILE}"]=0
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
|
|
|
#######################################
# Convert every MP3 found below ${AUDIO_DIR} into a 16 kHz mono 16-bit WAV
# file in ${AUDIO16K_DIR}. Inputs whose WAV already exists are skipped, so an
# interrupted run can be resumed.
# Globals:   DATA_DIR, AUDIO_DIR, AUDIO16K_DIR (read)
# Arguments: none
# Outputs:   progress and a summary on stdout/stderr; inputs that failed to
#            convert are listed in ${AUDIO16K_DIR}/fma_corrupted_files.log
# Returns:   exit status of the Python interpreter
#######################################
converter() {
  # shellcheck source=/dev/null
  source "${DATA_DIR}/.venv/bin/activate"

  # The delimiter is quoted so the shell passes the Python source through
  # verbatim (no parameter or command expansion). The body starts at column 0
  # because Python is indentation sensitive.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import sys
from pathlib import Path

import librosa
import numpy as np
import scipy.io.wavfile
from tqdm import tqdm


def write_wav(dst: Path, data: np.ndarray, sr: int):
    """Write float samples as 16-bit PCM, clipping to [-1.0, 1.0] first."""
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))


fma_dir = Path(sys.argv[1])
fma_out = Path(sys.argv[2])

# convert MP3 → 16k mono WAV
mp3s = list(fma_dir.rglob("*.mp3"))
print(f" MP3 files: {len(mp3s)}")
fma_bad = []
ok = 0
for p in tqdm(mp3s, desc=" FMA→WAV (resample 16k mono)"):
    outfile = fma_out / (p.stem + ".wav")
    # Write under a temporary name and rename when complete: an interrupted
    # run must not leave a truncated .wav behind, because existing .wav files
    # are skipped on the next run.
    partial = outfile.with_name(outfile.name + ".part")
    try:
        if outfile.exists():
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(partial, y, 16000)
        partial.replace(outfile)
        ok += 1
    except Exception as e:
        # Drop any half-written output; a missing file is fine here.
        try:
            partial.unlink()
        except OSError:
            pass
        fma_bad.append(f"{p}:{e}")

if fma_bad:
    (fma_out / "fma_corrupted_files.log").write_text("\n".join(fma_bad))
print(f" FMA complete ({ok} ok, {len(fma_bad)} failed)")
EOF
}
|
|
|
|
#######################################
# Unpack a zip archive using the Python interpreter of the data directory's
# virtualenv, showing a tqdm progress bar while extracting.
# Globals:   DATA_DIR (read)
# Arguments: $1 - path of the zip archive
#            $2 - directory to extract into
# Outputs:   a summary line and the progress bar
# Returns:   exit status of the Python interpreter (non-zero when the archive
#            is missing or empty)
#######################################
extract_zip_with_python() {
  local archive="$1"
  local target="$2"

  # Quoted delimiter: the Python source below is passed through verbatim.
  "${DATA_DIR}/.venv/bin/python" - "${archive}" "${target}" <<'PYEOF'
import sys
import zipfile
from pathlib import Path

from tqdm import tqdm

zip_path = Path(sys.argv[1])
dest_dir = Path(sys.argv[2])

# Refuse to continue when the archive is absent or has a size of zero bytes.
if not (zip_path.exists() and zip_path.stat().st_size != 0):
    sys.exit(f"Archive missing or empty: {zip_path}")

with zipfile.ZipFile(zip_path, "r") as zf:
    members = zf.infolist()
    size_gb = zip_path.stat().st_size / (1024 ** 3)
    print(f" Extracting {zip_path.name} ({len(members)} entries, {size_gb:.1f} GiB)...")
    # Extract entry by entry so the progress bar advances once per file.
    progress = tqdm(members, desc=" FMA zip extract", unit="file")
    for member in progress:
        zf.extract(member, dest_dir)
PYEOF
}
|
|
|
|
#######################################
# Download a file, trying each URL in turn with up to four attempts per URL
# and a growing pause (2s, 4s, 6s) between attempts.
#
# The data is written to "<output>.part" and only renamed to <output> once a
# non-empty download has completed, so an interrupted or failed transfer never
# leaves a truncated file at the final path.
# Arguments: $1   - destination path
#            $2.. - candidate URLs, tried in the given order
# Outputs:   retry notices on stdout, curl error messages on stderr
# Returns:   0 on success; otherwise the status of the last failed step
#            (1 when a download completed but was empty, or when no URL was
#            given)
#######################################
download_with_fallbacks() {
  local output="$1"
  shift
  local partial="${output}.part"
  local url attempt
  local rc=1

  for url in "$@" ; do
    for attempt in 1 2 3 4 ; do
      rc=0
      # -f: fail on HTTP errors, -sS: no progress meter but still print the
      # error message, -L: follow redirects.
      curl -fsSL "${url}" -o "${partial}" || rc=$?
      if [ "${rc}" -eq 0 ] ; then
        if [ -s "${partial}" ] ; then
          mv -f -- "${partial}" "${output}" && return 0
          rc=$?
        else
          # curl reported success but delivered nothing: treat as failure.
          rc=1
        fi
      fi
      rm -f -- "${partial}" || :
      if [ "${attempt}" -lt 4 ] ; then
        echo " Retry ${attempt}/3 after download failure"
        sleep $(( attempt * 2 ))
      fi
    done
  done

  return "${rc}"
}
|
|
|
|
# Count the entries below directory $1 whose name matches the pattern $2.
# Prints the count; the status is non-zero when find fails (for example when
# the directory does not exist), which callers deliberately ignore.
count_files() {
  find "$1" -name "$2" 2>/dev/null | wc -l
}

# Delete the downloaded archive when archive cleanup was requested.
# CLEANUP_ARCHIVES is run as a command, so it is expected to hold `true` or
# `false`.
cleanup_archive() {
  if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
    echo " Cleaning up ${AUDIO_ZIPFILE}"
    rm -f -- "${AUDIO_ZIP}"
  fi
}

# Number of converted WAV files recorded for this archive (0 if none yet).
expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
actual_filecount=$(count_files "${AUDIO16K_DIR}" '*.wav') || :
write_filecount=false

if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
  # The converted set matches the recorded count: nothing to do.
  echo " Existing FMA valid"
else
  # The extracted inputs are reused only when their number equals the recorded
  # count; otherwise the archive is fetched (if absent) and unpacked afresh.
  # NOTE(review): the recorded count is the number of converted WAV files, so
  # a run interrupted before the count is written, or one with failed
  # conversions, re-extracts on the next start -- confirm this is intended.
  input_filecount=$(count_files "${AUDIO_DIR}" "${AUDIO_IN_GLOB}") || :
  if [ "${input_filecount}" -eq 0 ] || [ "${input_filecount}" -ne "${expected_filecount}" ] ; then
    if [ ! -f "${AUDIO_ZIP}" ] ; then
      echo " Downloading ${AUDIO_ZIPFILE}"
      download_with_fallbacks "${AUDIO_ZIP}" "${AUDIO_URLS[@]}" || {
        echo " Failed to download ${AUDIO_ZIPFILE} from all configured sources." >&2
        exit 1
      }
    fi

    # :? aborts instead of running `rm -rf` on an empty path.
    rm -rf -- "${AUDIO_DIR:?}" || :
    mkdir "${AUDIO_DIR}"
    echo " Extracting ${AUDIO_ZIPFILE}"
    extract_zip_with_python "${AUDIO_ZIP}" "${AUDIO_DIR}"
  fi

  # The archive is removed before the conversion starts.
  cleanup_archive

  converter

  actual_filecount=$(count_files "${AUDIO16K_DIR}" '*.wav') || :
  filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
  write_filecount=true
fi

if ${write_filecount} ; then
  write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi

# Also covers the "Existing FMA valid" path, where an archive may still exist.
cleanup_archive

if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ]; then
  echo " Cleaning up ${AUDIO_DIR}"
  rm -rf -- "${AUDIO_DIR:?}"
fi

echo " FMA complete"
exit 0
|