mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
fixes
thanks @ mitrokun
This commit is contained in:
@@ -32,7 +32,7 @@
|
||||
"# 🔗 https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n",
|
||||
"\n",
|
||||
"# --- Set your wake word here ---\n",
|
||||
"TARGET_WORD = \"hey_tater\" # 🗣️ Change this to whatever phrase you want!\n",
|
||||
"TARGET_WORD = \"tater\" # 🗣️ Change this to whatever phrase you want!\n",
|
||||
"print(f\"🥔 Tater Totterson is listening for: '{TARGET_WORD}'\")"
|
||||
]
|
||||
},
|
||||
@@ -56,18 +56,21 @@
|
||||
"# `audio-metadata` is installed from a fork to unpin `attrs` from a version that breaks Jupyter\n",
|
||||
"!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' --root-user-action=ignore\n",
|
||||
"\n",
|
||||
"# Clone the microWakeWord repository\n",
|
||||
"# Clone the microWakeWord repository (your fork)\n",
|
||||
"repo_path = \"./microWakeWord\"\n",
|
||||
"if not os.path.exists(repo_path):\n",
|
||||
" print(\"Cloning microWakeWord repository...\")\n",
|
||||
" !git clone https://github.com/kahrendt/microWakeWord.git {repo_path}\n",
|
||||
" print(\"⬇️ Cloning microWakeWord repository...\")\n",
|
||||
" !git clone https://github.com/TaterTotterson/micro-wake-word.git {repo_path}\n",
|
||||
"\n",
|
||||
"# Ensure the repository exists before attempting to install\n",
|
||||
"# Optionally pin to a specific commit for reproducibility\n",
|
||||
"os.system(f\"cd {repo_path} && git checkout ac6502bf48b5e372c47ed509f5f5ca181e6d50bb\")\n",
|
||||
"\n",
|
||||
"# Install editable\n",
|
||||
"if os.path.exists(repo_path):\n",
|
||||
" print(\"Installing microWakeWord...\")\n",
|
||||
" print(\"📦 Installing microWakeWord...\")\n",
|
||||
" !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n",
|
||||
"else:\n",
|
||||
" print(f\"Repository not found at {repo_path}. Cloning might have failed.\")"
|
||||
" print(f\"❌ Repository not found at {repo_path}. Clone might have failed.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -261,18 +264,28 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# NVIDIA/Linux dataset prep to match the Apple behavior, but without datasets.Audio (no TorchCodec)\n",
|
||||
"# NVIDIA/Linux dataset prep to match the Apple behavior, but with pinned AudioSet\n",
|
||||
"# MIT RIR -> resample to 16 kHz\n",
|
||||
"# AudioSet -> NO resample\n",
|
||||
"# AudioSet -> fetch from a working HF revision, convert to 16 kHz mono, skip bad\n",
|
||||
"# FMA -> resample to 16 kHz mono\n",
|
||||
"\n",
|
||||
"import os, sys, scipy.io.wavfile, numpy as np\n",
|
||||
"import os, sys, subprocess, scipy.io.wavfile, numpy as np\n",
|
||||
"from pathlib import Path\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import soundfile as sf\n",
|
||||
"import librosa\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"\n",
|
||||
"# -------------------------------------------------\n",
|
||||
"# small shell helpers (for curl/tar probing)\n",
|
||||
"# -------------------------------------------------\n",
|
||||
"def sh(cmd: str) -> int:\n",
|
||||
" return subprocess.call(cmd, shell=True)\n",
|
||||
"\n",
|
||||
"def curl(url: str, out: Path) -> int:\n",
|
||||
" # -L follow, -s silent, --fail to get nonzero on 404\n",
|
||||
" return subprocess.call(f\"curl -L -s --fail '{url}' -o '{out}'\", shell=True)\n",
|
||||
"\n",
|
||||
"def write_wav(dst: Path, data: np.ndarray, sr: int):\n",
|
||||
" x = np.clip(data, -1.0, 1.0)\n",
|
||||
" scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n",
|
||||
@@ -287,10 +300,13 @@
|
||||
" ok = 0\n",
|
||||
" try:\n",
|
||||
" # Avoid datasets.Audio to keep TorchCodec out:\n",
|
||||
" # Use streaming=True + Audio(decode=False)-equivalent: access raw file path and decode with librosa\n",
|
||||
" # Use streaming=True + manual decode with librosa\n",
|
||||
" print(\"⬇️ MIT RIR (streaming + manual decode)…\")\n",
|
||||
" ds = load_dataset(\"davidscripka/MIT_environmental_impulse_responses\",\n",
|
||||
" split=\"train\", streaming=True)\n",
|
||||
" ds = load_dataset(\n",
|
||||
" \"davidscripka/MIT_environmental_impulse_responses\",\n",
|
||||
" split=\"train\",\n",
|
||||
" streaming=True\n",
|
||||
" )\n",
|
||||
" for i, row in enumerate(tqdm(ds)):\n",
|
||||
" try:\n",
|
||||
" audio_path = row[\"audio\"][\"path\"]\n",
|
||||
@@ -302,7 +318,7 @@
|
||||
" print(f\"✅ MIT RIR saved: {ok} files\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"⚠️ MIT RIR download failed: {e}\")\n",
|
||||
" # Fallback to official ZIP if needed (rare)\n",
|
||||
" # Fallback ZIP route\n",
|
||||
" try:\n",
|
||||
" print(\"⬇️ MIT RIR (fallback ZIP)…\")\n",
|
||||
" zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n",
|
||||
@@ -313,7 +329,8 @@
|
||||
" # Normalize to 16k mono\n",
|
||||
" for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n",
|
||||
" a, sr = sf.read(p, always_2d=False)\n",
|
||||
" if a.ndim > 1: a = a[:,0]\n",
|
||||
" if a.ndim > 1:\n",
|
||||
" a = a[:, 0]\n",
|
||||
" if sr != 16000:\n",
|
||||
" a, _ = librosa.load(p, sr=16000, mono=True)\n",
|
||||
" write_wav(p, a, 16000)\n",
|
||||
@@ -323,41 +340,80 @@
|
||||
"else:\n",
|
||||
" print(\"✅ mit_rirs exists; skipping.\")\n",
|
||||
"\n",
|
||||
"# -----------------------------\n",
|
||||
"# AudioSet (NO resample — fast)\n",
|
||||
"# -----------------------------\n",
|
||||
"print(\"\\n=== AudioSet subset ===\")\n",
|
||||
"# ============================================================\n",
|
||||
"# AudioSet (pinned FLAC .tar → 16k mono, skip bad files)\n",
|
||||
"# ============================================================\n",
|
||||
"print(\"\\n=== AudioSet subset (pinned FLAC .tar → 16k mono) ===\")\n",
|
||||
"audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n",
|
||||
"audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n",
|
||||
"\n",
|
||||
"links = [f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/bal_train0{i}.tar\"\n",
|
||||
" for i in range(10)]\n",
|
||||
"for link in links:\n",
|
||||
" fname = link.split(\"/\")[-1]\n",
|
||||
" out_tar = audioset_dir / fname\n",
|
||||
" if not out_tar.exists():\n",
|
||||
" print(f\"⬇️ {fname}\")\n",
|
||||
" os.system(f\"wget -q -O '{out_tar}' '{link}'\")\n",
|
||||
" print(f\"📦 Extract {fname}\")\n",
|
||||
" os.system(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n",
|
||||
"if any(audioset_out.rglob(\"*.wav\")):\n",
|
||||
" print(\"✅ audioset_16k exists; skipping.\")\n",
|
||||
"else:\n",
|
||||
" # commits / refs we know about — we’ll probe them\n",
|
||||
" REV_CANDIDATES = [\n",
|
||||
" \"6762f044d1c88619c7f2006486036192128fb07e\",\n",
|
||||
" \"0049167e89f259a010c3f070fe3666d9e5242836\",\n",
|
||||
" \"ceb9eaaa7844c9ad7351e659c84a572e376ad06d\",\n",
|
||||
" \"main\", # last resort\n",
|
||||
" ]\n",
|
||||
" # possible folder layouts\n",
|
||||
" TAR_PATTERNS = [\n",
|
||||
" \"data/bal_train0{idx}.tar\",\n",
|
||||
" \"data/bal_train/bal_train0{idx}.tar\",\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"flacs = list(audioset_dir.rglob(\"*.flac\"))\n",
|
||||
"print(f\"🔎 FLAC files: {len(flacs)}\")\n",
|
||||
"corrupt = []\n",
|
||||
"for p in tqdm(flacs, desc=\"AudioSet→WAV (no resample)\"):\n",
|
||||
" try:\n",
|
||||
" a, sr = sf.read(p, always_2d=False)\n",
|
||||
" if a is None or len(a) == 0:\n",
|
||||
" raise ValueError(\"empty audio\")\n",
|
||||
" if a.ndim > 1:\n",
|
||||
" a = a[:,0]\n",
|
||||
" # Apple behavior: write as 16-bit and label 16 kHz (no resample)\n",
|
||||
" write_wav(audioset_out / (p.stem + \".wav\"), a, 16000)\n",
|
||||
" except Exception as e:\n",
|
||||
" corrupt.append(f\"{p}:{e}\")\n",
|
||||
"if corrupt:\n",
|
||||
" (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
|
||||
"print(\"✅ AudioSet processing complete!\")\n",
|
||||
" def find_working_rev():\n",
|
||||
" for rev in REV_CANDIDATES:\n",
|
||||
" for pat in TAR_PATTERNS:\n",
|
||||
" probe = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{pat.format(idx=0)}\"\n",
|
||||
" rc = sh(f\"curl -I -L --fail -s '{probe}' > /dev/null\")\n",
|
||||
" if rc == 0:\n",
|
||||
" return rev, pat\n",
|
||||
" return None, None\n",
|
||||
"\n",
|
||||
" rev, pattern = find_working_rev()\n",
|
||||
" if rev is None:\n",
|
||||
" raise RuntimeError(\"Could not locate an AudioSet revision with FLAC tarballs still present on HF.\")\n",
|
||||
"\n",
|
||||
" print(f\"📌 Using AudioSet revision: {rev}\")\n",
|
||||
" print(f\"🗂️ Tar layout pattern: {pattern}\")\n",
|
||||
"\n",
|
||||
" # download + extract bal_train00..09\n",
|
||||
" for i in range(10):\n",
|
||||
" rel = pattern.format(idx=i)\n",
|
||||
" url = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{rel}\"\n",
|
||||
" fname = rel.split(\"/\")[-1]\n",
|
||||
" out_tar = audioset_dir / fname\n",
|
||||
" if not out_tar.exists():\n",
|
||||
" print(f\"⬇️ {fname}\")\n",
|
||||
" rc = curl(url, out_tar)\n",
|
||||
" if rc != 0:\n",
|
||||
" print(f\"⚠️ Could not fetch {fname} at rev {rev}; continuing.\")\n",
|
||||
" continue\n",
|
||||
" print(f\"📦 Extract {fname}\")\n",
|
||||
" rc = sh(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n",
|
||||
" if rc != 0:\n",
|
||||
" print(f\"⚠️ tar extract failed for {fname}; continuing.\")\n",
|
||||
"\n",
|
||||
" # convert FLAC → 16k mono WAV\n",
|
||||
" flacs = list(audioset_dir.rglob(\"*.flac\"))\n",
|
||||
" print(f\"🔎 FLAC files: {len(flacs)}\")\n",
|
||||
" audioset_bad = []\n",
|
||||
" ok = 0\n",
|
||||
" for p in tqdm(flacs, desc=\"AudioSet→WAV (resample 16k mono)\"):\n",
|
||||
" try:\n",
|
||||
" y, _ = librosa.load(p, sr=16000, mono=True)\n",
|
||||
" if y.size == 0:\n",
|
||||
" raise ValueError(\"empty audio\")\n",
|
||||
" write_wav(audioset_out / (p.stem + \".wav\"), y, 16000)\n",
|
||||
" ok += 1\n",
|
||||
" except Exception as e:\n",
|
||||
" audioset_bad.append(f\"{p}:{e}\")\n",
|
||||
"\n",
|
||||
" if audioset_bad:\n",
|
||||
" (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(audioset_bad))\n",
|
||||
" print(f\"✅ AudioSet complete ({ok} ok, {len(audioset_bad)} failed)\")\n",
|
||||
"\n",
|
||||
"# -----------------------------\n",
|
||||
"# FMA xsmall (resample to 16 kHz mono)\n",
|
||||
@@ -378,7 +434,7 @@
|
||||
"corrupt = []\n",
|
||||
"for p in tqdm(mp3s, desc=\"FMA→16k WAV\"):\n",
|
||||
" try:\n",
|
||||
" y, sr = librosa.load(p, sr=16000, mono=True) # proper decode+resample\n",
|
||||
" y, sr = librosa.load(p, sr=16000, mono=True)\n",
|
||||
" if y.size == 0:\n",
|
||||
" raise ValueError(\"empty audio\")\n",
|
||||
" write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n",
|
||||
@@ -386,6 +442,7 @@
|
||||
" corrupt.append(f\"{p}:{e}\")\n",
|
||||
"if corrupt:\n",
|
||||
" Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
|
||||
"\n",
|
||||
"print(\"\\n✅ Dataset prep complete!\")"
|
||||
]
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user