Automatic Calibration

2026-06-12 20:10:19 -06:00 · 2026-04-18 09:01:40 -05:00
parent 2da9f7a686
commit 6e7396455a
10 changed files with 656 additions and 78 deletions
--- a/cli/setup_audioset
+++ b/cli/setup_audioset
@@ -130,6 +130,73 @@ print(f"   AudioSet complete ({ok} ok, {skipped} skipped, {len(audioset_bad)} fa
 EOF
 }

+converter_from_dataset_api() {  # Fallback converter: stream AudioSet through the Hugging Face datasets API and write 16 kHz WAVs into AUDIO16K_DIR
+    # shellcheck source=/dev/null
+    source "${DATA_DIR}/.venv/bin/activate"  # activate the venv under DATA_DIR before invoking python
+
+    python - "${AUDIO16K_DIR}" <<-'EOF'  # script is read from stdin ("-"); the quoted 'EOF' disables shell expansion in the body
+import sys
+from pathlib import Path
+
+import librosa
+import numpy as np
+import scipy.io.wavfile
+from datasets import load_dataset
+
+def write_wav(dst: Path, data: np.ndarray, sr: int):  # write `data` to `dst` as int16 samples at rate `sr`
+    dst.parent.mkdir(parents=True, exist_ok=True)  # create the output directory on first use
+    x = np.clip(data, -1.0, 1.0)  # clamp to [-1, 1] so the int16 scaling below cannot overflow
+    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))  # scale floats to the int16 range
+
+audioset_out = Path(sys.argv[1])  # output directory, passed by the shell wrapper as argv[1]
+
+print("   AudioSet FLAC tarballs are unavailable; using Hugging Face datasets API instead.")
+dataset = load_dataset(
+    "agkphysics/AudioSet",
+    "balanced",
+    split="train",
+    streaming=True,  # iterate samples lazily rather than materializing the split first
+)
+
+audioset_bad = []  # one "<id>:<error>" entry per clip that failed
+ok = 0  # clips converted in this run
+skipped = 0  # clips whose output file already existed
+heartbeat_every = 250  # print a progress line every N clips
+
+for idx, sample in enumerate(dataset, start=1):  # idx is 1-based
+    try:
+        video_id = str(sample.get("video_id") or f"audioset_{idx:06d}")  # synthetic name when video_id is missing or empty
+        outfile = audioset_out / f"{video_id}.wav"
+        if outfile.exists():  # do not re-convert clips already on disk
+            skipped += 1
+            continue  # NOTE: this also bypasses the progress print at the bottom of the loop
+
+        audio = sample.get("audio") or {}  # NOTE(review): assumes the decoded audio is dict-like with .get() - confirm for the installed datasets version
+        y = np.asarray(audio.get("array"))  # NOTE(review): np.asarray(None) has size 1, so a missing "array" is not caught by the size check below
+        sr = int(audio.get("sampling_rate") or 0)  # 0 marks a missing sampling rate
+        if y.size == 0 or sr <= 0:
+            raise ValueError("missing decoded audio")
+        if y.ndim > 1:
+            y = np.mean(y, axis=-1)  # downmix; NOTE(review): assumes channels are on the last axis - a channels-first layout would need axis=0, confirm
+        if sr != 16000:
+            y = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=16000)  # resample to 16 kHz
+        if y.size == 0:
+            raise ValueError("empty audio")
+        write_wav(outfile, y, 16000)
+        ok += 1
+    except Exception as exc:  # best effort: record the failure and move on to the next clip
+        audioset_bad.append(f"{sample.get('video_id', idx)}:{exc}")
+
+    if idx == 1 or (idx % heartbeat_every) == 0:  # heartbeat on the first clip and then every heartbeat_every clips
+        print(f"   AudioSet API progress: {idx} clips processed (ok={ok}, skipped={skipped}, failed={len(audioset_bad)})")
+
+if audioset_bad:
+    (audioset_out / "audioset_corrupted_files.log").write_text("\n".join(audioset_bad))  # NOTE(review): the output dir is only created inside write_wav - confirm it exists when every clip failed
+
+print(f"   AudioSet complete via datasets API ({ok} ok, {skipped} skipped, {len(audioset_bad)} failed)")
+EOF
+}
+
 expected_filecount=$(get_total_filecount filecounts)
 actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
 write_filecount=false
@@ -139,40 +206,44 @@ if [ "${actual_filecount}" -ne 0 ] ; then
    echo "   Existing ${AUDIO16K_DIR} present (${actual_filecount} wav); skipping extract/convert"
 else
    dl=$(find_rev)
-    [ -n "$dl" ] || { echo "   Could not locate an AudioSet revision with FLAC tarballs still present on HF." ; exit 1 ; }
-    rev=${dl%%,*}
-    pattern=${dl##*,}
+    if [ -z "$dl" ] ; then
+        rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
+        converter_from_dataset_api
+    else
+        rev=${dl%%,*}
+        pattern=${dl##*,}

-    echo "   Checking 10 tarballs"
-    for i in {0..9} ; do
-        fname="downloads/bal_train0${i}.tar"
-        if [ ! -f "${fname}" ] ; then
-            echo "   Downloading bal_train0${i}.tar"
-            url="${AUDIO_URL}/${rev}/${pattern}${i}.tar"
-            curl -L -s --fail "${url}" -o "${fname}" || { echo "Could not fetch ${fname} at rev ${rev}; continuing." ; continue ; }
+        echo "   Checking 10 tarballs"
+        for i in {0..9} ; do
+            fname="downloads/bal_train0${i}.tar"
+            if [ ! -f "${fname}" ] ; then
+                echo "   Downloading bal_train0${i}.tar"
+                url="${AUDIO_URL}/${rev}/${pattern}${i}.tar"
+                curl -L -s --fail "${url}" -o "${fname}" || { echo "Could not fetch ${fname} at rev ${rev}; continuing." ; continue ; }
+            fi
+
+            tarball_filecount=$(tar -tvf "${fname}" | wc -l )
+            filecounts["bal_train0${i}.tar"]=${tarball_filecount}
+            write_filecount=true
+
+            echo "   Untarring bal_train0${i}.tar"
+            tar -xf "${fname}" -C "${AUDIO_DIR}"
+            if "${CLEANUP_ARCHIVES}" && [ -f "${fname}" ] ; then
+                echo "   Cleaning up bal_train0${i}.tar"
+                rm -rf "${fname}"
+            fi
+        done
+
+        rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
+        converter
+
+        # Recompute counts and warn (but do not fail)
+        expected_filecount=$(get_total_filecount filecounts)
+        actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
+        if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
+            echo "   Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
+            echo "   WARNING: mismatch is expected if some AudioSet files are corrupted; continuing." >&2
        fi
-
-        tarball_filecount=$(tar -tvf "${fname}" | wc -l )
-        filecounts["bal_train0${i}.tar"]=${tarball_filecount}
-        write_filecount=true
-
-        echo "   Untarring bal_train0${i}.tar"
-        tar -xf "${fname}" -C "${AUDIO_DIR}"
-        if "${CLEANUP_ARCHIVES}" && [ -f "${fname}" ] ; then
-            echo "   Cleaning up bal_train0${i}.tar"
-            rm -rf "${fname}"
-        fi
-    done
-
-    rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
-    converter
-
-    # Recompute counts and warn (but do not fail)
-    expected_filecount=$(get_total_filecount filecounts)
-    actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
-    if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
-        echo "   Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
-        echo "   WARNING: mismatch is expected if some AudioSet files are corrupted; continuing." >&2
    fi
 fi

@@ -196,4 +267,4 @@ if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
 fi

 echo "   Audioset complete"
-exit 0
+exit 0