#!/bin/bash
#
# Fetch the WHAM noise archive, extract it, and convert every WAV it contains
# to 16 kHz mono 16-bit WAV files under
# ${DATA_DIR}/training_datasets/wham_16k.
#
# HELP, DATA_DIR, CLEANUP_ARCHIVES, CLEANUP_INTERMEDIATE_FILES, get_filecounts
# and write_filecounts are not defined in this script; they are expected to be
# provided by the sourced shell.functions (NOTE(review): presumably populated
# from the command-line flags shown in the usage text -- confirm there).

set -euo pipefail

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

source "${PROGDIR}/shell.functions"

if [ "${HELP}" == "true" ] ; then
    # Unquoted heredoc on purpose: $0 and ${DATA_DIR} are expanded in the text.
    cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<path> ]

--cleanup-archives           : Automatically clean up any downloaded archives after
                             : extraction.
--cleanup-intermediate-files : Automatically clean up intermediate extracted files
                             : after conversion to 16k.
--data-dir=<path>            : Path to the data directory.
                             : Default: ${DATA_DIR}
EOF
    exit 1
fi

mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
cd "${DATA_DIR}/training_datasets"

echo "***** Checking WHAM *****"

# All paths below are relative to ${DATA_DIR}/training_datasets.
AUDIO_URL="https://my-bucket-a8b4b49c25c811ee9a7e8bba05fa24c7.s3.amazonaws.com/wham_noise.zip"
AUDIO_ZIPFILE="wham_noise.zip"
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
AUDIO_DIR="./wham"
mkdir -p "${AUDIO_DIR}" || :
AUDIO16K_DIR="./wham_16k"
mkdir -p "${AUDIO16K_DIR}" || :
AUDIO_FILECOUNT="./downloads/wham_filecount"
AUDIO_IN_GLOB="*.wav"

# Expected number of files, keyed by archive name. The default of 0 is passed
# to get_filecounts together with the filecount file written by a prior run.
declare -A filecounts=( [${AUDIO_ZIPFILE}]=0 )
get_filecounts filecounts "${AUDIO_FILECOUNT}"

# converter
#   Resample every *.wav found (recursively) under ${AUDIO_DIR} to 16 kHz mono
#   and write it into ${AUDIO16K_DIR}. The output name is the input's relative
#   path with its components joined by "__". Outputs that already exist are
#   skipped; per-file failures are collected into wham_corrupted_files.log in
#   the output directory instead of aborting the run.
converter() {
    source "${DATA_DIR}/.venv/bin/activate"
    # The delimiter is quoted so the shell performs no expansion on the Python
    # source, and the body sits at column 0 so it does not depend on tab
    # stripping.
    python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'PYTHON_EOF'
import os, sys
from pathlib import Path
import numpy as np
import scipy.io.wavfile
import librosa
from tqdm import tqdm


def write_wav(dst: Path, data: np.ndarray, sr: int):
    """Clip data to [-1, 1] and write it to dst as int16 samples at rate sr."""
    dst.parent.mkdir(parents=True, exist_ok=True)
    x = np.clip(data, -1.0, 1.0)
    # Write to a sibling temp file and rename it into place, so an interrupted
    # run never leaves a truncated *.wav that a later run would count and skip.
    tmp = dst.with_name(dst.name + ".part")
    scipy.io.wavfile.write(tmp, sr, (x * 32767).astype(np.int16))
    os.replace(tmp, dst)


def flatten_name(root: Path, src: Path) -> str:
    """Return src's path relative to root with its parts joined by '__'."""
    rel = src.relative_to(root)
    return "__".join(rel.parts)


wham_in = Path(sys.argv[1]).resolve()
wham_out = Path(sys.argv[2]).resolve()

wavs = list(wham_in.rglob("*.wav"))
print(f" WAV files: {len(wavs)}")
print(" Converting WHAM -> 16k mono WAV")

bad = []
ok = 0
skipped = 0
for p in tqdm(wavs, desc=" WHAM -> WAV (resample 16k mono)"):
    try:
        out_name = flatten_name(wham_in, p)
        outfile = wham_out / out_name
        if outfile.exists():
            skipped += 1
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # Record the failure and keep going; one bad file must not stop the run.
        bad.append(f"{p}:{e}")

if bad:
    (wham_out / "wham_corrupted_files.log").write_text("\n".join(bad))

print(f" WHAM complete ({ok} ok, {skipped} skipped, {len(bad)} failed)")
PYTHON_EOF
}

expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
actual_filecount=$(find "${AUDIO16K_DIR}" -name '*.wav' 2>/dev/null | wc -l) || :

write_filecount=false
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
    # Converted output already matches the recorded count: nothing to do.
    echo " Existing ${AUDIO16K_DIR} valid"
else
    # Re-extract the archive unless the extracted input matches the recorded count.
    actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
    if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
        if [ ! -f "${AUDIO_ZIP}" ] ; then
            echo " Downloading ${AUDIO_ZIPFILE}"
            # Download to a temporary name and rename only on success. With
            # set -e a failed curl aborts the script before the mv, so a
            # partial archive never sits at ${AUDIO_ZIP} where the next run
            # would mistake it for a complete download.
            curl -sfL "${AUDIO_URL}" -o "${AUDIO_ZIP}.part"
            mv "${AUDIO_ZIP}.part" "${AUDIO_ZIP}"
        fi

        rm -rf "${AUDIO_DIR}" || :
        mkdir -p "${AUDIO_DIR}" || :

        echo " Unzipping ${AUDIO_ZIPFILE}"
        unzip -q -d "${AUDIO_DIR}" "${AUDIO_ZIP}"
    fi

    # CLEANUP_ARCHIVES is executed as a command, so it must hold "true" or
    # "false". Removing the archive here happens before the conversion starts.
    if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
        echo " Cleaning up ${AUDIO_ZIPFILE}"
        rm -rf "${AUDIO_ZIP}"
    fi

    converter

    # Record how many converted files exist now as the new expected count.
    actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
    filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
    write_filecount=true
fi

if ${write_filecount} ; then
    write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi

# Also covers the "already valid" path above, where the archive may still exist.
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
    echo " Cleaning up ${AUDIO_ZIPFILE}"
    rm -rf "${AUDIO_ZIP}"
fi

if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
    echo " Cleaning up ${AUDIO_DIR}"
    rm -rf "${AUDIO_DIR}"
fi

echo " WHAM complete"
exit 0