mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
blackwell/wham & chime datasets
This commit is contained in:
142
cli/setup_chime
Executable file
142
cli/setup_chime
Executable file
@@ -0,0 +1,142 @@
|
||||
#!/bin/bash
# Set up the CHiME-Home dataset underneath the data directory.

# Stop on any failing command, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute path of this script and of the directory that contains it.
PROGPATH="$(realpath "$0")"
PROGDIR="$(dirname "${PROGPATH}")"

# Load the helper file that sits next to this script. It is expected to supply
# the variables and functions used further down that are not defined in this
# file -- NOTE(review): confirm against shell.functions.
. "${PROGDIR}/shell.functions"
|
||||
|
||||
# Print the usage text on stderr and exit with status 1 when help was requested.
# HELP and DATA_DIR are not assigned in this script; they are presumably set by
# shell.functions -- NOTE(review): confirm.
# The heredoc delimiter is intentionally unquoted so that $0 and ${DATA_DIR}
# are expanded in the message.
if [ "${HELP}" == "true" ] ; then
    cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]

    --cleanup-archives
               : Automatically clean up any downloaded archives after
               : extraction.
    --cleanup-intermediate-files
               : Automatically clean up intermediate extracted files
               : after conversion to 16k.
    <data_dir> : Path to the data directory.
               : Default: ${DATA_DIR}

EOF
    exit 1
fi
|
||||
|
||||
# All relative paths below are resolved from the training_datasets directory.
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
cd "${DATA_DIR}/training_datasets"

echo "***** Checking CHiME-Home *****"

# Remote archive and the local path it is downloaded to.
AUDIO_URL="https://archive.org/download/chime-home/chime_home.tar.gz"
AUDIO_TARFILE="chime_home.tar.gz"
AUDIO_TAR="./downloads/${AUDIO_TARFILE}"

# Extraction directory, 16k output directory, and the file-count record.
AUDIO_DIR="./chime"
AUDIO16K_DIR="./chime_16k"
AUDIO_FILECOUNT="./downloads/chime_filecount"

# Name pattern used to count the extracted source recordings.
AUDIO_IN_GLOB="*.48kHz.wav"

# Best-effort creation of both working directories; failures are ignored here.
mkdir -p "${AUDIO_DIR}" "${AUDIO16K_DIR}" || :

# File counts keyed by archive name, seeded with 0 and then handed to
# get_filecounts together with the record file. get_filecounts is defined
# outside this file; presumably it loads previously stored counts -- confirm.
declare -A filecounts=( ["${AUDIO_TARFILE}"]=0 )
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
# converter
#   Resample every source recording found under AUDIO_DIR to 16 kHz mono
#   16-bit WAV and store the results, with flattened names, in AUDIO16K_DIR.
#
#   Globals read: DATA_DIR, AUDIO_DIR, AUDIO16K_DIR, AUDIO_IN_GLOB
#   Side effects: activates ${DATA_DIR}/.venv in the current shell; writes WAV
#                 files and, when any file fails, chime_corrupted_files.log
#                 into AUDIO16K_DIR.
#   Files that already exist in the output directory are skipped, so the
#   function can be re-run after an interruption.
converter() {
    source "${DATA_DIR}/.venv/bin/activate"

    # The heredoc delimiter is quoted so the shell passes the Python source
    # through verbatim (no parameter expansion, no backslash processing).
    # The input glob is passed as an argument so it stays in sync with the
    # pattern the shell side uses when counting source files.
    python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" "${AUDIO_IN_GLOB:-*.48kHz.wav}" <<'EOF'
import sys
from pathlib import Path
import numpy as np
import scipy.io.wavfile
import librosa
from tqdm import tqdm

TARGET_SR = 16000


def write_wav(dst: Path, data: np.ndarray, sr: int) -> None:
    """Write float samples to *dst* as 16-bit PCM, clipping to [-1, 1].

    The data is written to a sibling ".tmp" file first and renamed afterwards,
    so an interrupted run never leaves a truncated file under the final name
    (which the skip-if-exists check below would otherwise keep forever).
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    tmp = dst.with_name(dst.name + ".tmp")
    scipy.io.wavfile.write(tmp, sr, pcm)
    tmp.replace(dst)


def flatten_name(root: Path, src: Path) -> str:
    """Return the path of *src* relative to *root* with its parts joined by "__"."""
    return "__".join(src.relative_to(root).parts)


chime_in = Path(sys.argv[1]).resolve()
chime_out = Path(sys.argv[2]).resolve()
pattern = sys.argv[3] if len(sys.argv) > 3 else "*.48kHz.wav"

wavs = list(chime_in.rglob(pattern))
print(f" WAV files: {len(wavs)}")
print(" Converting CHiME -> 16k mono WAV")

bad = []
ok = 0
skipped = 0
for p in tqdm(wavs, desc=" CHiME -> WAV (resample 16k mono)"):
    # Best effort per file: a failure is recorded and the run continues.
    try:
        outfile = chime_out / flatten_name(chime_in, p)
        if outfile.exists():
            skipped += 1
            continue
        y, _ = librosa.load(p, sr=TARGET_SR, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, TARGET_SR)
        ok += 1
    except Exception as e:
        bad.append(f"{p}:{e}")

if bad:
    (chime_out / "chime_corrupted_files.log").write_text("\n".join(bad))
print(f" CHiME complete ({ok} ok, {skipped} skipped, {len(bad)} failed)")
EOF
}
|
||||
|
||||
# Compare the number of converted files with the recorded count for this
# archive. When they match (and are non-zero) nothing needs to be done;
# otherwise make sure the extracted sources are present and run the converter.
expected_filecount=${filecounts[${AUDIO_TARFILE}]}
actual_filecount=$(find "${AUDIO16K_DIR}" -name '*.wav' 2>/dev/null | wc -l) || :
write_filecount=false

if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
    echo " Existing ${AUDIO16K_DIR} valid"
else
    # Count the extracted source recordings; re-extract when there are none or
    # their number differs from the recorded count.
    actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
    if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
        if [ ! -f "${AUDIO_TAR}" ] ; then
            echo " Downloading ${AUDIO_TARFILE}"
            # Download under a temporary name and rename only after curl
            # succeeds. An interrupted transfer then never leaves a partial
            # file at ${AUDIO_TAR} that a later run would mistake for a
            # complete archive. -S keeps curl's error message visible even
            # though progress output is silenced.
            rm -f "${AUDIO_TAR}.part"
            curl -sSfL "${AUDIO_URL}" -o "${AUDIO_TAR}.part"
            mv "${AUDIO_TAR}.part" "${AUDIO_TAR}"
        fi

        # Start from an empty extraction directory.
        rm -rf "${AUDIO_DIR}" || :
        mkdir -p "${AUDIO_DIR}" || :
        echo " Untarring ${AUDIO_TARFILE}"
        tar -xzf "${AUDIO_TAR}" -C "${AUDIO_DIR}"
    fi

    # Remove the archive before converting when archive cleanup was requested.
    if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_TAR}" ] ; then
        echo " Cleaning up ${AUDIO_TARFILE}"
        rm -rf "${AUDIO_TAR}"
    fi

    converter

    # Record how many converted files now exist so the next run can validate.
    actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
    filecounts[${AUDIO_TARFILE}]="${actual_filecount}"
    write_filecount=true
fi

# Persist the counts only when a conversion pass actually ran.
if ${write_filecount} ; then
    write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi
|
||||
|
||||
# Optional cleanup of the downloaded archive. CLEANUP_ARCHIVES is run as a
# command, so it is expected to hold "true" or "false".
if "${CLEANUP_ARCHIVES}" ; then
    if [ -f "${AUDIO_TAR}" ] ; then
        echo " Cleaning up ${AUDIO_TARFILE}"
        rm -rf "${AUDIO_TAR}"
    fi
fi

# Optional cleanup of the extracted source files once conversion is finished.
if "${CLEANUP_INTERMEDIATE_FILES}" ; then
    if [ -d "${AUDIO_DIR}" ] ; then
        echo " Cleaning up ${AUDIO_DIR}"
        rm -rf "${AUDIO_DIR}"
    fi
fi

echo " CHiME complete"
exit 0
|
||||
Reference in New Issue
Block a user