mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
fixes
thanks @mitrokun
This commit is contained in:
@@ -32,7 +32,7 @@
|
|||||||
"# 🔗 https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n",
|
"# 🔗 https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# --- Set your wake word here ---\n",
|
"# --- Set your wake word here ---\n",
|
||||||
"TARGET_WORD = \"hey_tater\" # 🗣️ Change this to whatever phrase you want!\n",
|
"TARGET_WORD = \"tater\" # 🗣️ Change this to whatever phrase you want!\n",
|
||||||
"print(f\"🥔 Tater Totterson is listening for: '{TARGET_WORD}'\")"
|
"print(f\"🥔 Tater Totterson is listening for: '{TARGET_WORD}'\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -56,18 +56,21 @@
|
|||||||
"# `audio-metadata` is installed from a fork to unpin `attrs` from a version that breaks Jupyter\n",
|
"# `audio-metadata` is installed from a fork to unpin `attrs` from a version that breaks Jupyter\n",
|
||||||
"!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' --root-user-action=ignore\n",
|
"!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' --root-user-action=ignore\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Clone the microWakeWord repository\n",
|
"# Clone the microWakeWord repository (your fork)\n",
|
||||||
"repo_path = \"./microWakeWord\"\n",
|
"repo_path = \"./microWakeWord\"\n",
|
||||||
"if not os.path.exists(repo_path):\n",
|
"if not os.path.exists(repo_path):\n",
|
||||||
" print(\"Cloning microWakeWord repository...\")\n",
|
" print(\"⬇️ Cloning microWakeWord repository...\")\n",
|
||||||
" !git clone https://github.com/kahrendt/microWakeWord.git {repo_path}\n",
|
" !git clone https://github.com/TaterTotterson/micro-wake-word.git {repo_path}\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Ensure the repository exists before attempting to install\n",
|
"# Optionally pin to a specific commit for reproducibility\n",
|
||||||
|
"os.system(f\"cd {repo_path} && git checkout ac6502bf48b5e372c47ed509f5f5ca181e6d50bb\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Install editable\n",
|
||||||
"if os.path.exists(repo_path):\n",
|
"if os.path.exists(repo_path):\n",
|
||||||
" print(\"Installing microWakeWord...\")\n",
|
" print(\"📦 Installing microWakeWord...\")\n",
|
||||||
" !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n",
|
" !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n",
|
||||||
"else:\n",
|
"else:\n",
|
||||||
" print(f\"Repository not found at {repo_path}. Cloning might have failed.\")"
|
" print(f\"❌ Repository not found at {repo_path}. Clone might have failed.\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -261,18 +264,28 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# NVIDIA/Linux dataset prep to match the Apple behavior, but without datasets.Audio (no TorchCodec)\n",
|
"# NVIDIA/Linux dataset prep to match the Apple behavior, but with pinned AudioSet\n",
|
||||||
"# MIT RIR -> resample to 16 kHz\n",
|
"# MIT RIR -> resample to 16 kHz\n",
|
||||||
"# AudioSet -> NO resample\n",
|
"# AudioSet -> fetch from a working HF revision, convert to 16 kHz mono, skip bad\n",
|
||||||
"# FMA -> resample to 16 kHz mono\n",
|
"# FMA -> resample to 16 kHz mono\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import os, sys, scipy.io.wavfile, numpy as np\n",
|
"import os, sys, subprocess, scipy.io.wavfile, numpy as np\n",
|
||||||
"from pathlib import Path\n",
|
"from pathlib import Path\n",
|
||||||
"from tqdm import tqdm\n",
|
"from tqdm import tqdm\n",
|
||||||
"import soundfile as sf\n",
|
"import soundfile as sf\n",
|
||||||
"import librosa\n",
|
"import librosa\n",
|
||||||
"from datasets import load_dataset\n",
|
"from datasets import load_dataset\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"# -------------------------------------------------\n",
|
||||||
|
"# small shell helpers (for curl/tar probing)\n",
|
||||||
|
"# -------------------------------------------------\n",
|
||||||
|
"def sh(cmd: str) -> int:\n",
|
||||||
|
" return subprocess.call(cmd, shell=True)\n",
|
||||||
|
"\n",
|
||||||
|
"def curl(url: str, out: Path) -> int:\n",
|
||||||
|
" # -L follow, -s silent, --fail to get nonzero on 404\n",
|
||||||
|
" return subprocess.call(f\"curl -L -s --fail '{url}' -o '{out}'\", shell=True)\n",
|
||||||
|
"\n",
|
||||||
"def write_wav(dst: Path, data: np.ndarray, sr: int):\n",
|
"def write_wav(dst: Path, data: np.ndarray, sr: int):\n",
|
||||||
" x = np.clip(data, -1.0, 1.0)\n",
|
" x = np.clip(data, -1.0, 1.0)\n",
|
||||||
" scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n",
|
" scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n",
|
||||||
@@ -287,10 +300,13 @@
|
|||||||
" ok = 0\n",
|
" ok = 0\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" # Avoid datasets.Audio to keep TorchCodec out:\n",
|
" # Avoid datasets.Audio to keep TorchCodec out:\n",
|
||||||
" # Use streaming=True + Audio(decode=False)-equivalent: access raw file path and decode with librosa\n",
|
" # Use streaming=True + manual decode with librosa\n",
|
||||||
" print(\"⬇️ MIT RIR (streaming + manual decode)…\")\n",
|
" print(\"⬇️ MIT RIR (streaming + manual decode)…\")\n",
|
||||||
" ds = load_dataset(\"davidscripka/MIT_environmental_impulse_responses\",\n",
|
" ds = load_dataset(\n",
|
||||||
" split=\"train\", streaming=True)\n",
|
" \"davidscripka/MIT_environmental_impulse_responses\",\n",
|
||||||
|
" split=\"train\",\n",
|
||||||
|
" streaming=True\n",
|
||||||
|
" )\n",
|
||||||
" for i, row in enumerate(tqdm(ds)):\n",
|
" for i, row in enumerate(tqdm(ds)):\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" audio_path = row[\"audio\"][\"path\"]\n",
|
" audio_path = row[\"audio\"][\"path\"]\n",
|
||||||
@@ -302,7 +318,7 @@
|
|||||||
" print(f\"✅ MIT RIR saved: {ok} files\")\n",
|
" print(f\"✅ MIT RIR saved: {ok} files\")\n",
|
||||||
" except Exception as e:\n",
|
" except Exception as e:\n",
|
||||||
" print(f\"⚠️ MIT RIR download failed: {e}\")\n",
|
" print(f\"⚠️ MIT RIR download failed: {e}\")\n",
|
||||||
" # Fallback to official ZIP if needed (rare)\n",
|
" # Fallback ZIP route\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" print(\"⬇️ MIT RIR (fallback ZIP)…\")\n",
|
" print(\"⬇️ MIT RIR (fallback ZIP)…\")\n",
|
||||||
" zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n",
|
" zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n",
|
||||||
@@ -313,7 +329,8 @@
|
|||||||
" # Normalize to 16k mono\n",
|
" # Normalize to 16k mono\n",
|
||||||
" for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n",
|
" for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n",
|
||||||
" a, sr = sf.read(p, always_2d=False)\n",
|
" a, sr = sf.read(p, always_2d=False)\n",
|
||||||
" if a.ndim > 1: a = a[:,0]\n",
|
" if a.ndim > 1:\n",
|
||||||
|
" a = a[:, 0]\n",
|
||||||
" if sr != 16000:\n",
|
" if sr != 16000:\n",
|
||||||
" a, _ = librosa.load(p, sr=16000, mono=True)\n",
|
" a, _ = librosa.load(p, sr=16000, mono=True)\n",
|
||||||
" write_wav(p, a, 16000)\n",
|
" write_wav(p, a, 16000)\n",
|
||||||
@@ -323,41 +340,80 @@
|
|||||||
"else:\n",
|
"else:\n",
|
||||||
" print(\"✅ mit_rirs exists; skipping.\")\n",
|
" print(\"✅ mit_rirs exists; skipping.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# -----------------------------\n",
|
"# ============================================================\n",
|
||||||
"# AudioSet (NO resample — fast)\n",
|
"# AudioSet (pinned FLAC .tar → 16k mono, skip bad files)\n",
|
||||||
"# -----------------------------\n",
|
"# ============================================================\n",
|
||||||
"print(\"\\n=== AudioSet subset ===\")\n",
|
"print(\"\\n=== AudioSet subset (pinned FLAC .tar → 16k mono) ===\")\n",
|
||||||
"audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n",
|
"audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n",
|
||||||
"audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n",
|
"audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"links = [f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/bal_train0{i}.tar\"\n",
|
"if any(audioset_out.rglob(\"*.wav\")):\n",
|
||||||
" for i in range(10)]\n",
|
" print(\"✅ audioset_16k exists; skipping.\")\n",
|
||||||
"for link in links:\n",
|
"else:\n",
|
||||||
" fname = link.split(\"/\")[-1]\n",
|
" # commits / refs we know about — we’ll probe them\n",
|
||||||
|
" REV_CANDIDATES = [\n",
|
||||||
|
" \"6762f044d1c88619c7f2006486036192128fb07e\",\n",
|
||||||
|
" \"0049167e89f259a010c3f070fe3666d9e5242836\",\n",
|
||||||
|
" \"ceb9eaaa7844c9ad7351e659c84a572e376ad06d\",\n",
|
||||||
|
" \"main\", # last resort\n",
|
||||||
|
" ]\n",
|
||||||
|
" # possible folder layouts\n",
|
||||||
|
" TAR_PATTERNS = [\n",
|
||||||
|
" \"data/bal_train0{idx}.tar\",\n",
|
||||||
|
" \"data/bal_train/bal_train0{idx}.tar\",\n",
|
||||||
|
" ]\n",
|
||||||
|
"\n",
|
||||||
|
" def find_working_rev():\n",
|
||||||
|
" for rev in REV_CANDIDATES:\n",
|
||||||
|
" for pat in TAR_PATTERNS:\n",
|
||||||
|
" probe = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{pat.format(idx=0)}\"\n",
|
||||||
|
" rc = sh(f\"curl -I -L --fail -s '{probe}' > /dev/null\")\n",
|
||||||
|
" if rc == 0:\n",
|
||||||
|
" return rev, pat\n",
|
||||||
|
" return None, None\n",
|
||||||
|
"\n",
|
||||||
|
" rev, pattern = find_working_rev()\n",
|
||||||
|
" if rev is None:\n",
|
||||||
|
" raise RuntimeError(\"Could not locate an AudioSet revision with FLAC tarballs still present on HF.\")\n",
|
||||||
|
"\n",
|
||||||
|
" print(f\"📌 Using AudioSet revision: {rev}\")\n",
|
||||||
|
" print(f\"🗂️ Tar layout pattern: {pattern}\")\n",
|
||||||
|
"\n",
|
||||||
|
" # download + extract bal_train00..09\n",
|
||||||
|
" for i in range(10):\n",
|
||||||
|
" rel = pattern.format(idx=i)\n",
|
||||||
|
" url = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{rel}\"\n",
|
||||||
|
" fname = rel.split(\"/\")[-1]\n",
|
||||||
" out_tar = audioset_dir / fname\n",
|
" out_tar = audioset_dir / fname\n",
|
||||||
" if not out_tar.exists():\n",
|
" if not out_tar.exists():\n",
|
||||||
" print(f\"⬇️ {fname}\")\n",
|
" print(f\"⬇️ {fname}\")\n",
|
||||||
" os.system(f\"wget -q -O '{out_tar}' '{link}'\")\n",
|
" rc = curl(url, out_tar)\n",
|
||||||
|
" if rc != 0:\n",
|
||||||
|
" print(f\"⚠️ Could not fetch {fname} at rev {rev}; continuing.\")\n",
|
||||||
|
" continue\n",
|
||||||
" print(f\"📦 Extract {fname}\")\n",
|
" print(f\"📦 Extract {fname}\")\n",
|
||||||
" os.system(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n",
|
" rc = sh(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n",
|
||||||
|
" if rc != 0:\n",
|
||||||
|
" print(f\"⚠️ tar extract failed for {fname}; continuing.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
" # convert FLAC → 16k mono WAV\n",
|
||||||
" flacs = list(audioset_dir.rglob(\"*.flac\"))\n",
|
" flacs = list(audioset_dir.rglob(\"*.flac\"))\n",
|
||||||
" print(f\"🔎 FLAC files: {len(flacs)}\")\n",
|
" print(f\"🔎 FLAC files: {len(flacs)}\")\n",
|
||||||
"corrupt = []\n",
|
" audioset_bad = []\n",
|
||||||
"for p in tqdm(flacs, desc=\"AudioSet→WAV (no resample)\"):\n",
|
" ok = 0\n",
|
||||||
|
" for p in tqdm(flacs, desc=\"AudioSet→WAV (resample 16k mono)\"):\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" a, sr = sf.read(p, always_2d=False)\n",
|
" y, _ = librosa.load(p, sr=16000, mono=True)\n",
|
||||||
" if a is None or len(a) == 0:\n",
|
" if y.size == 0:\n",
|
||||||
" raise ValueError(\"empty audio\")\n",
|
" raise ValueError(\"empty audio\")\n",
|
||||||
" if a.ndim > 1:\n",
|
" write_wav(audioset_out / (p.stem + \".wav\"), y, 16000)\n",
|
||||||
" a = a[:,0]\n",
|
" ok += 1\n",
|
||||||
" # Apple behavior: write as 16-bit and label 16 kHz (no resample)\n",
|
|
||||||
" write_wav(audioset_out / (p.stem + \".wav\"), a, 16000)\n",
|
|
||||||
" except Exception as e:\n",
|
" except Exception as e:\n",
|
||||||
" corrupt.append(f\"{p}:{e}\")\n",
|
" audioset_bad.append(f\"{p}:{e}\")\n",
|
||||||
"if corrupt:\n",
|
"\n",
|
||||||
" (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
|
" if audioset_bad:\n",
|
||||||
"print(\"✅ AudioSet processing complete!\")\n",
|
" (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(audioset_bad))\n",
|
||||||
|
" print(f\"✅ AudioSet complete ({ok} ok, {len(audioset_bad)} failed)\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# -----------------------------\n",
|
"# -----------------------------\n",
|
||||||
"# FMA xsmall (resample to 16 kHz mono)\n",
|
"# FMA xsmall (resample to 16 kHz mono)\n",
|
||||||
@@ -378,7 +434,7 @@
|
|||||||
"corrupt = []\n",
|
"corrupt = []\n",
|
||||||
"for p in tqdm(mp3s, desc=\"FMA→16k WAV\"):\n",
|
"for p in tqdm(mp3s, desc=\"FMA→16k WAV\"):\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" y, sr = librosa.load(p, sr=16000, mono=True) # proper decode+resample\n",
|
" y, sr = librosa.load(p, sr=16000, mono=True)\n",
|
||||||
" if y.size == 0:\n",
|
" if y.size == 0:\n",
|
||||||
" raise ValueError(\"empty audio\")\n",
|
" raise ValueError(\"empty audio\")\n",
|
||||||
" write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n",
|
" write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n",
|
||||||
@@ -386,6 +442,7 @@
|
|||||||
" corrupt.append(f\"{p}:{e}\")\n",
|
" corrupt.append(f\"{p}:{e}\")\n",
|
||||||
"if corrupt:\n",
|
"if corrupt:\n",
|
||||||
" Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
|
" Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
|
||||||
|
"\n",
|
||||||
"print(\"\\n✅ Dataset prep complete!\")"
|
"print(\"\\n✅ Dataset prep complete!\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user