#!/bin/bash
#
# Fetch the FMA audio archive, extract its MP3 files, and convert them to
# 16 kHz mono WAV files under ${DATA_DIR}/training_datasets/fma_16k.
#
# HELP, DATA_DIR, CLEANUP_ARCHIVES, CLEANUP_INTERMEDIATE_FILES and the
# get_filecounts / write_filecounts helpers are not defined in this file;
# presumably they are provided by the sourced shell.functions (including
# command-line parsing) -- confirm against that file.
set -euo pipefail

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")
source "${PROGDIR}/shell.functions"

if [ "${HELP}" == "true" ] ; then
  # Unquoted delimiter on purpose: $0 and ${DATA_DIR} must be expanded.
  cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<path> ]
  --cleanup-archives           : Automatically clean up any downloaded archives after extraction.
  --cleanup-intermediate-files : Automatically clean up the intermediate files after they've
                               : been converted to 16k.
  --data-dir=<path>            : Path to the data directory.
                               : Default: ${DATA_DIR}
EOF
  exit 1
fi

mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
cd "${DATA_DIR}/training_datasets"

echo "***** Checking FMA *****"

# Download sources, tried in order. Whichever one succeeds is stored under
# the single local name ${AUDIO_ZIPFILE}.
AUDIO_URLS=(
  "https://os.unil.cloud.switch.ch/fma/fma_small.zip"
  "https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/fma_xs.zip"
)
AUDIO_ZIPFILE="fma_small.zip"
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
AUDIO_DIR="fma"            # extracted MP3 files (intermediate)
mkdir -p "${AUDIO_DIR}" || :
AUDIO16K_DIR="fma_16k"     # converted 16 kHz WAV files (final output)
mkdir -p "${AUDIO16K_DIR}" || :
AUDIO_FILECOUNT="./downloads/fma_filecount"
AUDIO_IN_GLOB="*.mp3"

# Expected number of converted files, keyed by archive name. 0 means
# "unknown"; get_filecounts presumably overwrites it with the value recorded
# in ${AUDIO_FILECOUNT} by a previous run -- confirm in shell.functions.
declare -A filecounts=( ["${AUDIO_ZIPFILE}"]=0 )
get_filecounts filecounts "${AUDIO_FILECOUNT}"

#######################################
# Convert every *.mp3 found recursively under ${AUDIO_DIR} into a 16 kHz mono
# 16-bit WAV file in ${AUDIO16K_DIR}, skipping outputs that already exist.
# Files that fail to convert are listed in
# ${AUDIO16K_DIR}/fma_corrupted_files.log and do not abort the run.
# Globals:   DATA_DIR, AUDIO_DIR, AUDIO16K_DIR (read)
# Arguments: none
# Outputs:   progress and a summary line on stdout/stderr
#######################################
converter() {
  source "${DATA_DIR}/.venv/bin/activate"
  # The delimiter is quoted so the shell never expands anything inside the
  # Python source. The body sits at column 0 (plain <<, not <<-) so it does
  # not depend on tab indentation being preserved.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import os, sys, subprocess, scipy.io.wavfile, numpy as np
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm

def write_wav(dst: Path, data: np.ndarray, sr: int):
    # Clamp to [-1, 1] before scaling so out-of-range samples cannot wrap
    # around when cast to int16.
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))

fma_dir = Path(sys.argv[1])
fma_out = Path(sys.argv[2])

# convert MP3 → 16k mono WAV
mp3s = list(fma_dir.rglob("*.mp3"))
print(f" MP3 files: {len(mp3s)}")
fma_bad = []
ok = 0
for p in tqdm(mp3s, desc=" FMA→WAV (resample 16k mono)"):
    try:
        # Output names are flattened to <stem>.wav directly under fma_out.
        outfile = Path(fma_out / (p.stem + ".wav"))
        if outfile.exists():
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # Best effort: record the failure and keep going.
        fma_bad.append(f"{p}:{e}")
if fma_bad:
    (fma_out / "fma_corrupted_files.log").write_text("\n".join(fma_bad))
print(f" FMA complete ({ok} ok, {len(fma_bad)} failed)")
EOF
}

#######################################
# Extract a zip archive using the virtualenv's Python, with a progress bar.
# Globals:   DATA_DIR (read)
# Arguments: $1 - path to the zip archive
#            $2 - destination directory
# Returns:   non-zero if the archive is missing, empty, or extraction fails
#######################################
extract_zip_with_python() {
  local zip_path="$1"
  local dest_dir="$2"

  "${DATA_DIR}/.venv/bin/python" - "${zip_path}" "${dest_dir}" <<'EOF'
import sys
import zipfile
from pathlib import Path
from tqdm import tqdm

zip_path = Path(sys.argv[1])
dest_dir = Path(sys.argv[2])

if (not zip_path.exists()) or zip_path.stat().st_size == 0:
    raise SystemExit(f"Archive missing or empty: {zip_path}")

with zipfile.ZipFile(zip_path, "r") as zf:
    members = zf.infolist()
    size_gb = zip_path.stat().st_size / (1024 ** 3)
    print(f" Extracting {zip_path.name} ({len(members)} entries, {size_gb:.1f} GiB)...")
    for member in tqdm(members, desc=" FMA zip extract", unit="file"):
        zf.extract(member, dest_dir)
EOF
}

#######################################
# Download a file, trying each URL in order with up to 4 attempts per URL.
# The transfer goes to "<output>.part" and is renamed to <output> only once a
# non-empty file has been fully received, so an interrupted run never leaves
# a truncated file at <output>.
# Arguments: $1    - output path
#            $2... - candidate URLs, in priority order
# Outputs:   retry notices on stdout
# Returns:   0 on success, otherwise the status of the last failed step
#######################################
download_with_fallbacks() {
  local output="$1"
  shift
  local urls=( "$@" )
  local partial="${output}.part"
  local rc=1
  local url attempt

  for url in "${urls[@]}" ; do
    for attempt in 1 2 3 4 ; do
      rc=0
      curl -sfL "${url}" -o "${partial}" || rc=$?
      if [ "${rc}" -eq 0 ] && [ ! -s "${partial}" ] ; then
        rc=1  # curl reported success but produced an empty file
      fi
      if [ "${rc}" -eq 0 ] ; then
        mv -f "${partial}" "${output}" || rc=$?
      fi
      if [ "${rc}" -eq 0 ] ; then
        return 0
      fi
      rm -f "${partial}" || :
      if [ "${attempt}" -lt 4 ] ; then
        echo " Retry ${attempt}/3 after download failure"
        sleep $(( attempt * 2 ))  # linear back-off: 2s, 4s, 6s
      fi
    done
  done
  return "${rc}"
}

expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
actual_filecount=$(find "${AUDIO16K_DIR}" -name '*.wav' 2>/dev/null | wc -l) || :
write_filecount=false

if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
  # The converted output already matches the recorded count: nothing to do.
  echo " Existing FMA valid"
else
  # NOTE(review): this compares the number of extracted MP3 files with the
  # recorded number of converted WAV files, so any mismatch (including an
  # unrecorded count of 0) triggers a fresh extraction -- confirm intended.
  actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
  if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
    if [ ! -f "${AUDIO_ZIP}" ] ; then
      echo " Downloading ${AUDIO_ZIPFILE}"
      download_with_fallbacks "${AUDIO_ZIP}" "${AUDIO_URLS[@]}" || {
        echo " Failed to download ${AUDIO_ZIPFILE} from all configured sources." >&2
        exit 1
      }
    fi
    # Start from an empty directory so stale files cannot skew the counts.
    rm -rf "${AUDIO_DIR:?}" || :
    mkdir "${AUDIO_DIR}"
    echo " Extracting ${AUDIO_ZIPFILE}"
    extract_zip_with_python "${AUDIO_ZIP}" "${AUDIO_DIR}"
  fi

  # Drop the archive before converting so its disk space is available.
  if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
    echo " Cleaning up ${AUDIO_ZIPFILE}"
    rm -rf "${AUDIO_ZIP:?}"
  fi

  converter

  # Record how many WAV files exist now as the new expected count.
  actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
  filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
  write_filecount=true
fi

if ${write_filecount} ; then
  write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi

# Also covers the "Existing FMA valid" path, where the branch above is skipped.
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
  echo " Cleaning up ${AUDIO_ZIPFILE}"
  rm -rf "${AUDIO_ZIP:?}"
fi

if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ]; then
  echo " Cleaning up ${AUDIO_DIR}"
  rm -rf "${AUDIO_DIR:?}"
fi

echo " FMA complete"
exit 0