Automatic Calibration

This commit is contained in:
MasterPhooey
2026-04-18 09:01:40 -05:00
parent 2da9f7a686
commit 6e7396455a
10 changed files with 656 additions and 78 deletions

View File

@@ -130,6 +130,73 @@ print(f" AudioSet complete ({ok} ok, {skipped} skipped, {len(audioset_bad)} fa
EOF
}
# Fallback converter: stream the AudioSet "balanced" train split through the
# Hugging Face `datasets` API and write each clip as a 16 kHz, 16-bit PCM WAV
# under ${AUDIO16K_DIR}. Clips whose WAV already exists are skipped, so the
# function can be re-run to resume. Per-clip failures are collected and written
# to ${AUDIO16K_DIR}/audioset_corrupted_files.log instead of aborting the run.
#
# Reads:   DATA_DIR (virtualenv location), AUDIO16K_DIR (output directory).
# Returns: the exit status of the embedded Python program.
#
# NOTE: the heredoc body is kept at column 0 and indented with spaces only;
# `<<-` strips leading tabs, which would destroy tab-based Python indentation.
converter_from_dataset_api() {
  # shellcheck source=/dev/null
  source "${DATA_DIR}/.venv/bin/activate"
  python - "${AUDIO16K_DIR}" <<-'EOF'
import sys
from pathlib import Path

import librosa
import numpy as np
import scipy.io.wavfile
from datasets import load_dataset

TARGET_SR = 16000
HEARTBEAT_EVERY = 250


def write_wav(dst: Path, data: np.ndarray, sr: int) -> None:
    """Write float audio (nominally in [-1, 1]) to *dst* as 16-bit PCM."""
    dst.parent.mkdir(parents=True, exist_ok=True)
    # NaN has no defined int16 value; zero it before clipping to full scale.
    x = np.clip(np.nan_to_num(data), -1.0, 1.0)
    # Write to a sibling temp file and rename it into place, so an interrupted
    # run never leaves a truncated .wav that the exists() check would skip.
    tmp = dst.with_name(dst.name + ".part")
    scipy.io.wavfile.write(tmp, sr, (x * 32767).astype(np.int16))
    tmp.replace(dst)


def to_mono(y: np.ndarray) -> np.ndarray:
    """Average a multi-channel signal down to one channel."""
    if y.ndim <= 1:
        return y
    # NOTE(review): the channel layout of decoded audio is not established in
    # this script. For 2-D input the shorter axis is treated as the channel
    # axis, so (channels, samples) and (samples, channels) both downmix
    # correctly; anything else keeps the previous last-axis behaviour.
    if y.ndim == 2 and y.shape[0] < y.shape[1]:
        return np.mean(y, axis=0)
    return np.mean(y, axis=-1)


def convert_sample(sample: dict, idx: int, out_dir: Path) -> bool:
    """Convert one dataset sample to a 16 kHz WAV.

    Returns True if a file was written and False if the target already
    existed. Raises ValueError when the sample has no usable audio.
    """
    video_id = str(sample.get("video_id") or f"audioset_{idx:06d}")
    outfile = out_dir / f"{video_id}.wav"
    if outfile.exists():
        return False

    audio = sample.get("audio") or {}
    raw = audio.get("array")
    sr = int(audio.get("sampling_rate") or 0)
    # np.asarray(None) is a size-1 object array, so test for None explicitly.
    if raw is None or sr <= 0:
        raise ValueError("missing decoded audio")
    y = np.asarray(raw)
    if y.size == 0:
        raise ValueError("missing decoded audio")

    y = to_mono(y)
    if sr != TARGET_SR:
        y = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=TARGET_SR)
    if y.size == 0:
        raise ValueError("empty audio")

    write_wav(outfile, y, TARGET_SR)
    return True


audioset_out = Path(sys.argv[1])
# Create the output directory up front: the failure log below is written here
# even when no clip was converted successfully.
audioset_out.mkdir(parents=True, exist_ok=True)

print(" AudioSet FLAC tarballs are unavailable; using Hugging Face datasets API instead.")
dataset = load_dataset(
    "agkphysics/AudioSet",
    "balanced",
    split="train",
    streaming=True,
)

audioset_bad = []
ok = 0
skipped = 0
for idx, sample in enumerate(dataset, start=1):
    try:
        if convert_sample(sample, idx, audioset_out):
            ok += 1
        else:
            skipped += 1
    except Exception as exc:
        # Best effort by design: record the failure and move on to the next clip.
        audioset_bad.append(f"{sample.get('video_id', idx)}:{exc}")
    # Reported for every outcome (including skips) so resumed runs show progress.
    if idx == 1 or (idx % HEARTBEAT_EVERY) == 0:
        print(f" AudioSet API progress: {idx} clips processed (ok={ok}, skipped={skipped}, failed={len(audioset_bad)})")

if audioset_bad:
    (audioset_out / "audioset_corrupted_files.log").write_text("\n".join(audioset_bad))
print(f" AudioSet complete via datasets API ({ok} ok, {skipped} skipped, {len(audioset_bad)} failed)")
EOF
}
# Expected number of converted files, as computed by the get_total_filecount
# helper from the `filecounts` name passed to it (both defined outside this view).
expected_filecount=$(get_total_filecount filecounts)
# Number of .wav files currently under AUDIO16K_DIR. find's error output is
# discarded, and `|| :` keeps a non-zero status from this assignment from propagating.
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
# Flag flipped to true further down once per-tarball file counts are recorded;
# presumably it controls whether the counts are persisted — confirm against its consumer.
write_filecount=false
@@ -139,40 +206,44 @@ if [ "${actual_filecount}" -ne 0 ] ; then
echo " Existing ${AUDIO16K_DIR} present (${actual_filecount} wav); skipping extract/convert"
else
dl=$(find_rev)
[ -n "$dl" ] || { echo " Could not locate an AudioSet revision with FLAC tarballs still present on HF." ; exit 1 ; }
rev=${dl%%,*}
pattern=${dl##*,}
if [ -z "$dl" ] ; then
rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
converter_from_dataset_api
else
rev=${dl%%,*}
pattern=${dl##*,}
echo " Checking 10 tarballs"
for i in {0..9} ; do
fname="downloads/bal_train0${i}.tar"
if [ ! -f "${fname}" ] ; then
echo " Downloading bal_train0${i}.tar"
url="${AUDIO_URL}/${rev}/${pattern}${i}.tar"
curl -L -s --fail "${url}" -o "${fname}" || { echo "Could not fetch ${fname} at rev ${rev}; continuing." ; continue ; }
echo " Checking 10 tarballs"
for i in {0..9} ; do
fname="downloads/bal_train0${i}.tar"
if [ ! -f "${fname}" ] ; then
echo " Downloading bal_train0${i}.tar"
url="${AUDIO_URL}/${rev}/${pattern}${i}.tar"
curl -L -s --fail "${url}" -o "${fname}" || { echo "Could not fetch ${fname} at rev ${rev}; continuing." ; continue ; }
fi
tarball_filecount=$(tar -tvf "${fname}" | wc -l )
filecounts["bal_train0${i}.tar"]=${tarball_filecount}
write_filecount=true
echo " Untarring bal_train0${i}.tar"
tar -xf "${fname}" -C "${AUDIO_DIR}"
if "${CLEANUP_ARCHIVES}" && [ -f "${fname}" ] ; then
echo " Cleaning up bal_train0${i}.tar"
rm -rf "${fname}"
fi
done
rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
converter
# Recompute counts and warn (but do not fail)
expected_filecount=$(get_total_filecount filecounts)
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
echo " Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
echo " WARNING: mismatch is expected if some AudioSet files are corrupted; continuing." >&2
fi
tarball_filecount=$(tar -tvf "${fname}" | wc -l )
filecounts["bal_train0${i}.tar"]=${tarball_filecount}
write_filecount=true
echo " Untarring bal_train0${i}.tar"
tar -xf "${fname}" -C "${AUDIO_DIR}"
if "${CLEANUP_ARCHIVES}" && [ -f "${fname}" ] ; then
echo " Cleaning up bal_train0${i}.tar"
rm -rf "${fname}"
fi
done
rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
converter
# Recompute counts and warn (but do not fail)
expected_filecount=$(get_total_filecount filecounts)
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
echo " Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
echo " WARNING: mismatch is expected if some AudioSet files are corrupted; continuing." >&2
fi
fi
@@ -196,4 +267,4 @@ if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
fi
echo " Audioset complete"
exit 0
exit 0