thanks @ mitrokun
This commit is contained in:
Tater Totterson
2025-11-02 04:44:24 -06:00
committed by GitHub
parent c9aeb17091
commit 31677fa74b

View File

@@ -32,7 +32,7 @@
"# 🔗 https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n", "# 🔗 https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n",
"\n", "\n",
"# --- Set your wake word here ---\n", "# --- Set your wake word here ---\n",
"TARGET_WORD = \"hey_tater\" # 🗣️ Change this to whatever phrase you want!\n", "TARGET_WORD = \"tater\" # 🗣️ Change this to whatever phrase you want!\n",
"print(f\"🥔 Tater Totterson is listening for: '{TARGET_WORD}'\")" "print(f\"🥔 Tater Totterson is listening for: '{TARGET_WORD}'\")"
] ]
}, },
@@ -56,18 +56,21 @@
"# `audio-metadata` is installed from a fork to unpin `attrs` from a version that breaks Jupyter\n", "# `audio-metadata` is installed from a fork to unpin `attrs` from a version that breaks Jupyter\n",
"!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' --root-user-action=ignore\n", "!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' --root-user-action=ignore\n",
"\n", "\n",
"# Clone the microWakeWord repository\n", "# Clone the microWakeWord repository (your fork)\n",
"repo_path = \"./microWakeWord\"\n", "repo_path = \"./microWakeWord\"\n",
"if not os.path.exists(repo_path):\n", "if not os.path.exists(repo_path):\n",
" print(\"Cloning microWakeWord repository...\")\n", " print(\"⬇️ Cloning microWakeWord repository...\")\n",
" !git clone https://github.com/kahrendt/microWakeWord.git {repo_path}\n", " !git clone https://github.com/TaterTotterson/micro-wake-word.git {repo_path}\n",
"\n", "\n",
"# Ensure the repository exists before attempting to install\n", "# Pin to a specific commit for reproducibility\n",
"os.system(f\"cd {repo_path} && git checkout ac6502bf48b5e372c47ed509f5f5ca181e6d50bb\")\n",
"\n",
"# Install editable\n",
"if os.path.exists(repo_path):\n", "if os.path.exists(repo_path):\n",
" print(\"Installing microWakeWord...\")\n", " print(\"📦 Installing microWakeWord...\")\n",
" !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n", " !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n",
"else:\n", "else:\n",
" print(f\"Repository not found at {repo_path}. Cloning might have failed.\")" " print(f\"Repository not found at {repo_path}. Clone might have failed.\")"
] ]
}, },
{ {
@@ -261,18 +264,28 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# NVIDIA/Linux dataset prep to match the Apple behavior, but without datasets.Audio (no TorchCodec)\n", "# NVIDIA/Linux dataset prep to match the Apple behavior, but with pinned AudioSet\n",
"# MIT RIR -> resample to 16 kHz\n", "# MIT RIR -> resample to 16 kHz\n",
"# AudioSet -> NO resample\n", "# AudioSet -> fetch from a working HF revision, convert to 16 kHz mono, skip bad\n",
"# FMA -> resample to 16 kHz mono\n", "# FMA -> resample to 16 kHz mono\n",
"\n", "\n",
"import os, sys, scipy.io.wavfile, numpy as np\n", "import os, sys, subprocess, scipy.io.wavfile, numpy as np\n",
"from pathlib import Path\n", "from pathlib import Path\n",
"from tqdm import tqdm\n", "from tqdm import tqdm\n",
"import soundfile as sf\n", "import soundfile as sf\n",
"import librosa\n", "import librosa\n",
"from datasets import load_dataset\n", "from datasets import load_dataset\n",
"\n", "\n",
"# -------------------------------------------------\n",
"# small shell helpers (for curl/tar probing)\n",
"# -------------------------------------------------\n",
"def sh(cmd: str) -> int:\n",
" return subprocess.call(cmd, shell=True)\n",
"\n",
"def curl(url: str, out: Path) -> int:\n",
" # -L follow, -s silent, --fail to get nonzero on 404\n",
" return subprocess.call(f\"curl -L -s --fail '{url}' -o '{out}'\", shell=True)\n",
"\n",
"def write_wav(dst: Path, data: np.ndarray, sr: int):\n", "def write_wav(dst: Path, data: np.ndarray, sr: int):\n",
" x = np.clip(data, -1.0, 1.0)\n", " x = np.clip(data, -1.0, 1.0)\n",
" scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n", " scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n",
@@ -287,10 +300,13 @@
" ok = 0\n", " ok = 0\n",
" try:\n", " try:\n",
" # Avoid datasets.Audio to keep TorchCodec out:\n", " # Avoid datasets.Audio to keep TorchCodec out:\n",
" # Use streaming=True + Audio(decode=False)-equivalent: access raw file path and decode with librosa\n", " # Use streaming=True + manual decode with librosa\n",
" print(\"⬇️ MIT RIR (streaming + manual decode)…\")\n", " print(\"⬇️ MIT RIR (streaming + manual decode)…\")\n",
" ds = load_dataset(\"davidscripka/MIT_environmental_impulse_responses\",\n", " ds = load_dataset(\n",
" split=\"train\", streaming=True)\n", " \"davidscripka/MIT_environmental_impulse_responses\",\n",
" split=\"train\",\n",
" streaming=True\n",
" )\n",
" for i, row in enumerate(tqdm(ds)):\n", " for i, row in enumerate(tqdm(ds)):\n",
" try:\n", " try:\n",
" audio_path = row[\"audio\"][\"path\"]\n", " audio_path = row[\"audio\"][\"path\"]\n",
@@ -302,7 +318,7 @@
" print(f\"✅ MIT RIR saved: {ok} files\")\n", " print(f\"✅ MIT RIR saved: {ok} files\")\n",
" except Exception as e:\n", " except Exception as e:\n",
" print(f\"⚠️ MIT RIR download failed: {e}\")\n", " print(f\"⚠️ MIT RIR download failed: {e}\")\n",
" # Fallback to official ZIP if needed (rare)\n", " # Fallback ZIP route\n",
" try:\n", " try:\n",
" print(\"⬇️ MIT RIR (fallback ZIP)…\")\n", " print(\"⬇️ MIT RIR (fallback ZIP)…\")\n",
" zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n", " zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n",
@@ -313,7 +329,8 @@
" # Normalize to 16k mono\n", " # Normalize to 16k mono\n",
" for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n", " for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n",
" a, sr = sf.read(p, always_2d=False)\n", " a, sr = sf.read(p, always_2d=False)\n",
" if a.ndim > 1: a = a[:,0]\n", " if a.ndim > 1:\n",
" a = a[:, 0]\n",
" if sr != 16000:\n", " if sr != 16000:\n",
" a, _ = librosa.load(p, sr=16000, mono=True)\n", " a, _ = librosa.load(p, sr=16000, mono=True)\n",
" write_wav(p, a, 16000)\n", " write_wav(p, a, 16000)\n",
@@ -323,41 +340,80 @@
"else:\n", "else:\n",
" print(\"✅ mit_rirs exists; skipping.\")\n", " print(\"✅ mit_rirs exists; skipping.\")\n",
"\n", "\n",
"# -----------------------------\n", "# ============================================================\n",
"# AudioSet (NO resample — fast)\n", "# AudioSet (pinned FLAC .tar → 16k mono, skip bad files)\n",
"# -----------------------------\n", "# ============================================================\n",
"print(\"\\n=== AudioSet subset ===\")\n", "print(\"\\n=== AudioSet subset (pinned FLAC .tar → 16k mono) ===\")\n",
"audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n", "audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n",
"audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n", "audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n",
"\n", "\n",
"links = [f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/bal_train0{i}.tar\"\n", "if any(audioset_out.rglob(\"*.wav\")):\n",
" for i in range(10)]\n", " print(\"✅ audioset_16k exists; skipping.\")\n",
"for link in links:\n", "else:\n",
" fname = link.split(\"/\")[-1]\n", " # commits / refs we know about — we'll probe them\n",
" REV_CANDIDATES = [\n",
" \"6762f044d1c88619c7f2006486036192128fb07e\",\n",
" \"0049167e89f259a010c3f070fe3666d9e5242836\",\n",
" \"ceb9eaaa7844c9ad7351e659c84a572e376ad06d\",\n",
" \"main\", # last resort\n",
" ]\n",
" # possible folder layouts\n",
" TAR_PATTERNS = [\n",
" \"data/bal_train0{idx}.tar\",\n",
" \"data/bal_train/bal_train0{idx}.tar\",\n",
" ]\n",
"\n",
" def find_working_rev():\n",
" for rev in REV_CANDIDATES:\n",
" for pat in TAR_PATTERNS:\n",
" probe = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{pat.format(idx=0)}\"\n",
" rc = sh(f\"curl -I -L --fail -s '{probe}' > /dev/null\")\n",
" if rc == 0:\n",
" return rev, pat\n",
" return None, None\n",
"\n",
" rev, pattern = find_working_rev()\n",
" if rev is None:\n",
" raise RuntimeError(\"Could not locate an AudioSet revision with FLAC tarballs still present on HF.\")\n",
"\n",
" print(f\"📌 Using AudioSet revision: {rev}\")\n",
" print(f\"🗂️ Tar layout pattern: {pattern}\")\n",
"\n",
" # download + extract bal_train00..09\n",
" for i in range(10):\n",
" rel = pattern.format(idx=i)\n",
" url = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{rel}\"\n",
" fname = rel.split(\"/\")[-1]\n",
" out_tar = audioset_dir / fname\n", " out_tar = audioset_dir / fname\n",
" if not out_tar.exists():\n", " if not out_tar.exists():\n",
" print(f\"⬇️ {fname}\")\n", " print(f\"⬇️ {fname}\")\n",
" os.system(f\"wget -q -O '{out_tar}' '{link}'\")\n", " rc = curl(url, out_tar)\n",
" if rc != 0:\n",
" print(f\"⚠️ Could not fetch {fname} at rev {rev}; continuing.\")\n",
" continue\n",
" print(f\"📦 Extract {fname}\")\n", " print(f\"📦 Extract {fname}\")\n",
" os.system(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n", " rc = sh(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n",
" if rc != 0:\n",
" print(f\"⚠️ tar extract failed for {fname}; continuing.\")\n",
"\n", "\n",
"flacs = list(audioset_dir.rglob(\"*.flac\"))\n", " # convert FLAC → 16k mono WAV\n",
"print(f\"🔎 FLAC files: {len(flacs)}\")\n", " flacs = list(audioset_dir.rglob(\"*.flac\"))\n",
"corrupt = []\n", " print(f\"🔎 FLAC files: {len(flacs)}\")\n",
"for p in tqdm(flacs, desc=\"AudioSet→WAV (no resample)\"):\n", " audioset_bad = []\n",
" ok = 0\n",
" for p in tqdm(flacs, desc=\"AudioSet→WAV (resample 16k mono)\"):\n",
" try:\n", " try:\n",
" a, sr = sf.read(p, always_2d=False)\n", " y, _ = librosa.load(p, sr=16000, mono=True)\n",
" if a is None or len(a) == 0:\n", " if y.size == 0:\n",
" raise ValueError(\"empty audio\")\n", " raise ValueError(\"empty audio\")\n",
" if a.ndim > 1:\n", " write_wav(audioset_out / (p.stem + \".wav\"), y, 16000)\n",
" a = a[:,0]\n", " ok += 1\n",
" # Apple behavior: write as 16-bit and label 16 kHz (no resample)\n",
" write_wav(audioset_out / (p.stem + \".wav\"), a, 16000)\n",
" except Exception as e:\n", " except Exception as e:\n",
" corrupt.append(f\"{p}:{e}\")\n", " audioset_bad.append(f\"{p}:{e}\")\n",
"if corrupt:\n", "\n",
" (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n", " if audioset_bad:\n",
"print(\"✅ AudioSet processing complete!\")\n", " (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(audioset_bad))\n",
" print(f\"✅ AudioSet complete ({ok} ok, {len(audioset_bad)} failed)\")\n",
"\n", "\n",
"# -----------------------------\n", "# -----------------------------\n",
"# FMA xsmall (resample to 16 kHz mono)\n", "# FMA xsmall (resample to 16 kHz mono)\n",
@@ -378,7 +434,7 @@
"corrupt = []\n", "corrupt = []\n",
"for p in tqdm(mp3s, desc=\"FMA→16k WAV\"):\n", "for p in tqdm(mp3s, desc=\"FMA→16k WAV\"):\n",
" try:\n", " try:\n",
" y, sr = librosa.load(p, sr=16000, mono=True) # proper decode+resample\n", " y, sr = librosa.load(p, sr=16000, mono=True)\n",
" if y.size == 0:\n", " if y.size == 0:\n",
" raise ValueError(\"empty audio\")\n", " raise ValueError(\"empty audio\")\n",
" write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n", " write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n",
@@ -386,6 +442,7 @@
" corrupt.append(f\"{p}:{e}\")\n", " corrupt.append(f\"{p}:{e}\")\n",
"if corrupt:\n", "if corrupt:\n",
" Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n", " Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
"\n",
"print(\"\\n✅ Dataset prep complete!\")" "print(\"\\n✅ Dataset prep complete!\")"
] ]
}, },