thanks @ mitrokun
This commit is contained in:
Tater Totterson
2025-11-02 04:44:24 -06:00
committed by GitHub
parent c9aeb17091
commit 31677fa74b

View File

@@ -32,7 +32,7 @@
"# 🔗 https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n",
"\n",
"# --- Set your wake word here ---\n",
"TARGET_WORD = \"hey_tater\" # 🗣️ Change this to whatever phrase you want!\n",
"TARGET_WORD = \"tater\" # 🗣️ Change this to whatever phrase you want!\n",
"print(f\"🥔 Tater Totterson is listening for: '{TARGET_WORD}'\")"
]
},
@@ -56,18 +56,21 @@
"# `audio-metadata` is installed from a fork to unpin `attrs` from a version that breaks Jupyter\n",
"!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' --root-user-action=ignore\n",
"\n",
"# Clone the microWakeWord repository\n",
"# Clone the microWakeWord repository (your fork)\n",
"repo_path = \"./microWakeWord\"\n",
"if not os.path.exists(repo_path):\n",
" print(\"Cloning microWakeWord repository...\")\n",
" !git clone https://github.com/kahrendt/microWakeWord.git {repo_path}\n",
" print(\"⬇️ Cloning microWakeWord repository...\")\n",
" !git clone https://github.com/TaterTotterson/micro-wake-word.git {repo_path}\n",
"\n",
"# Ensure the repository exists before attempting to install\n",
"# Pin the clone to a specific commit for reproducibility.\n",
"PINNED_COMMIT = \"ac6502bf48b5e372c47ed509f5f5ca181e6d50bb\"\n",
"checkout_rc = os.system(f\"cd {repo_path} && git checkout {PINNED_COMMIT}\")\n",
"if checkout_rc != 0:\n",
"    # Do not fail silently: a nonzero status means the working tree is NOT at the pinned\n",
"    # commit (for example, a pre-existing clone that does not contain it).\n",
"    print(f\"⚠️ Could not check out {PINNED_COMMIT} in {repo_path}; the install below will use whatever revision is present.\")\n",
"\n",
"# Install editable\n",
"if os.path.exists(repo_path):\n",
" print(\"Installing microWakeWord...\")\n",
" print(\"📦 Installing microWakeWord...\")\n",
" !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n",
"else:\n",
" print(f\"Repository not found at {repo_path}. Cloning might have failed.\")"
" print(f\"Repository not found at {repo_path}. Clone might have failed.\")"
]
},
{
@@ -261,18 +264,28 @@
},
"outputs": [],
"source": [
"# NVIDIA/Linux dataset prep to match the Apple behavior, but without datasets.Audio (no TorchCodec)\n",
"# NVIDIA/Linux dataset prep to match the Apple behavior, but with pinned AudioSet\n",
"# MIT RIR -> resample to 16 kHz\n",
"# AudioSet -> NO resample\n",
"# AudioSet -> fetch from a working HF revision, convert to 16 kHz mono, skip bad files\n",
"# FMA -> resample to 16 kHz mono\n",
"\n",
"import os, sys, scipy.io.wavfile, numpy as np\n",
"import os, sys, subprocess, scipy.io.wavfile, numpy as np\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import soundfile as sf\n",
"import librosa\n",
"from datasets import load_dataset\n",
"\n",
"# -------------------------------------------------\n",
"# small shell helpers (for curl/tar probing)\n",
"# -------------------------------------------------\n",
"def sh(cmd: str) -> int:\n",
"    \"\"\"Run cmd through the system shell and return its exit status.\"\"\"\n",
"    completed = subprocess.run(cmd, shell=True)\n",
"    return completed.returncode\n",
"\n",
"def curl(url: str, out: Path) -> int:\n",
"    \"\"\"Download url to out with curl and return curl's exit status (0 on success).\n",
"\n",
"    curl is invoked with an argument list instead of a shell string, so quotes or\n",
"    other shell metacharacters in url or out cannot break or alter the command.\n",
"    The transfer is written to a sibling '.part' file that is renamed onto out only\n",
"    when curl succeeds; callers that skip the download when out already exists\n",
"    therefore never mistake a truncated transfer for a complete file.\n",
"    \"\"\"\n",
"    part = Path(f\"{out}.part\")\n",
"    try:\n",
"        # -L follow redirects, -s silent, --fail to get nonzero on HTTP errors such as 404\n",
"        rc = subprocess.call([\"curl\", \"-L\", \"-s\", \"--fail\", url, \"-o\", str(part)])\n",
"    except FileNotFoundError:\n",
"        return 127  # curl is not installed; the exit status a shell would have reported\n",
"    if rc == 0 and part.exists():\n",
"        part.replace(out)\n",
"    else:\n",
"        part.unlink(missing_ok=True)\n",
"    return rc\n",
"\n",
"def write_wav(dst: Path, data: np.ndarray, sr: int):\n",
" x = np.clip(data, -1.0, 1.0)\n",
" scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n",
@@ -287,10 +300,13 @@
" ok = 0\n",
" try:\n",
" # Avoid datasets.Audio to keep TorchCodec out:\n",
" # Use streaming=True + Audio(decode=False)-equivalent: access raw file path and decode with librosa\n",
" # Use streaming=True + manual decode with librosa\n",
" print(\"⬇️ MIT RIR (streaming + manual decode)…\")\n",
" ds = load_dataset(\"davidscripka/MIT_environmental_impulse_responses\",\n",
" split=\"train\", streaming=True)\n",
" ds = load_dataset(\n",
" \"davidscripka/MIT_environmental_impulse_responses\",\n",
" split=\"train\",\n",
" streaming=True\n",
" )\n",
" for i, row in enumerate(tqdm(ds)):\n",
" try:\n",
" audio_path = row[\"audio\"][\"path\"]\n",
@@ -302,7 +318,7 @@
" print(f\"✅ MIT RIR saved: {ok} files\")\n",
" except Exception as e:\n",
" print(f\"⚠️ MIT RIR download failed: {e}\")\n",
" # Fallback to official ZIP if needed (rare)\n",
" # Fallback ZIP route\n",
" try:\n",
" print(\"⬇️ MIT RIR (fallback ZIP)…\")\n",
" zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n",
@@ -313,7 +329,8 @@
" # Normalize to 16k mono\n",
" for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n",
" a, sr = sf.read(p, always_2d=False)\n",
" if a.ndim > 1: a = a[:,0]\n",
" if a.ndim > 1:\n",
" a = a[:, 0]\n",
" if sr != 16000:\n",
" a, _ = librosa.load(p, sr=16000, mono=True)\n",
" write_wav(p, a, 16000)\n",
@@ -323,41 +340,80 @@
"else:\n",
" print(\"✅ mit_rirs exists; skipping.\")\n",
"\n",
"# -----------------------------\n",
"# AudioSet (NO resample — fast)\n",
"# -----------------------------\n",
"print(\"\\n=== AudioSet subset ===\")\n",
"# ============================================================\n",
"# AudioSet (pinned FLAC .tar → 16k mono, skip bad files)\n",
"# ============================================================\n",
"print(\"\\n=== AudioSet subset (pinned FLAC .tar → 16k mono) ===\")\n",
"audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n",
"audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n",
"\n",
"links = [f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/bal_train0{i}.tar\"\n",
" for i in range(10)]\n",
"for link in links:\n",
" fname = link.split(\"/\")[-1]\n",
"if any(audioset_out.rglob(\"*.wav\")):\n",
" print(\"✅ audioset_16k exists; skipping.\")\n",
"else:\n",
" # commits / refs we know about — we'll probe them\n",
" REV_CANDIDATES = [\n",
" \"6762f044d1c88619c7f2006486036192128fb07e\",\n",
" \"0049167e89f259a010c3f070fe3666d9e5242836\",\n",
" \"ceb9eaaa7844c9ad7351e659c84a572e376ad06d\",\n",
" \"main\", # last resort\n",
" ]\n",
" # possible folder layouts\n",
" TAR_PATTERNS = [\n",
" \"data/bal_train0{idx}.tar\",\n",
" \"data/bal_train/bal_train0{idx}.tar\",\n",
" ]\n",
"\n",
" def find_working_rev():\n",
"     \"\"\"Find the first revision/layout pair whose first bal_train tarball is reachable.\n",
"\n",
"     Tries every entry of REV_CANDIDATES in order and, for each, every entry of\n",
"     TAR_PATTERNS, probing the idx=0 tarball URL with a headers-only curl request.\n",
"     Returns (rev, pattern) for the first probe that exits 0, else (None, None).\n",
"     \"\"\"\n",
"     candidates = ((rev, pat) for rev in REV_CANDIDATES for pat in TAR_PATTERNS)\n",
"     for rev, pat in candidates:\n",
"         probe = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{pat.format(idx=0)}\"\n",
"         # -I asks for headers only; --fail turns HTTP errors into a nonzero exit status\n",
"         if sh(f\"curl -I -L --fail -s '{probe}' > /dev/null\") != 0:\n",
"             continue\n",
"         return rev, pat\n",
"     return None, None\n",
"\n",
" rev, pattern = find_working_rev()\n",
" if rev is None:\n",
" raise RuntimeError(\"Could not locate an AudioSet revision with FLAC tarballs still present on HF.\")\n",
"\n",
" print(f\"📌 Using AudioSet revision: {rev}\")\n",
" print(f\"🗂️ Tar layout pattern: {pattern}\")\n",
"\n",
" # download + extract bal_train00..09\n",
" for i in range(10):\n",
" rel = pattern.format(idx=i)\n",
" url = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{rel}\"\n",
" fname = rel.split(\"/\")[-1]\n",
" out_tar = audioset_dir / fname\n",
" if not out_tar.exists():\n",
" print(f\"⬇️ {fname}\")\n",
" os.system(f\"wget -q -O '{out_tar}' '{link}'\")\n",
" rc = curl(url, out_tar)\n",
" if rc != 0:\n",
" print(f\"⚠️ Could not fetch {fname} at rev {rev}; continuing.\")\n",
" continue\n",
" print(f\"📦 Extract {fname}\")\n",
" os.system(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n",
" rc = sh(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n",
" if rc != 0:\n",
" print(f\"⚠️ tar extract failed for {fname}; continuing.\")\n",
"\n",
"flacs = list(audioset_dir.rglob(\"*.flac\"))\n",
"print(f\"🔎 FLAC files: {len(flacs)}\")\n",
"corrupt = []\n",
"for p in tqdm(flacs, desc=\"AudioSet→WAV (no resample)\"):\n",
" # convert FLAC → 16k mono WAV\n",
" flacs = list(audioset_dir.rglob(\"*.flac\"))\n",
" print(f\"🔎 FLAC files: {len(flacs)}\")\n",
" audioset_bad = []\n",
" ok = 0\n",
" for p in tqdm(flacs, desc=\"AudioSet→WAV (resample 16k mono)\"):\n",
" try:\n",
" a, sr = sf.read(p, always_2d=False)\n",
" if a is None or len(a) == 0:\n",
" y, _ = librosa.load(p, sr=16000, mono=True)\n",
" if y.size == 0:\n",
" raise ValueError(\"empty audio\")\n",
" if a.ndim > 1:\n",
" a = a[:,0]\n",
" # Apple behavior: write as 16-bit and label 16 kHz (no resample)\n",
" write_wav(audioset_out / (p.stem + \".wav\"), a, 16000)\n",
" write_wav(audioset_out / (p.stem + \".wav\"), y, 16000)\n",
" ok += 1\n",
" except Exception as e:\n",
" corrupt.append(f\"{p}:{e}\")\n",
"if corrupt:\n",
" (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
"print(\"✅ AudioSet processing complete!\")\n",
" audioset_bad.append(f\"{p}:{e}\")\n",
"\n",
" if audioset_bad:\n",
" (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(audioset_bad))\n",
" print(f\"✅ AudioSet complete ({ok} ok, {len(audioset_bad)} failed)\")\n",
"\n",
"# -----------------------------\n",
"# FMA xsmall (resample to 16 kHz mono)\n",
@@ -378,7 +434,7 @@
"corrupt = []\n",
"for p in tqdm(mp3s, desc=\"FMA→16k WAV\"):\n",
" try:\n",
" y, sr = librosa.load(p, sr=16000, mono=True) # proper decode+resample\n",
" y, sr = librosa.load(p, sr=16000, mono=True)\n",
" if y.size == 0:\n",
" raise ValueError(\"empty audio\")\n",
" write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n",
@@ -386,6 +442,7 @@
" corrupt.append(f\"{p}:{e}\")\n",
"if corrupt:\n",
" Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
"\n",
"print(\"\\n✅ Dataset prep complete!\")"
]
},