diff --git a/microWakeWord_training_notebook.ipynb b/microWakeWord_training_notebook.ipynb index e3d2d56..9c36327 100644 --- a/microWakeWord_training_notebook.ipynb +++ b/microWakeWord_training_notebook.ipynb @@ -32,7 +32,7 @@ "# 🔗 https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n", "\n", "# --- Set your wake word here ---\n", - "TARGET_WORD = \"hey_tater\" # 🗣️ Change this to whatever phrase you want!\n", + "TARGET_WORD = \"tater\" # 🗣️ Change this to whatever phrase you want!\n", "print(f\"🥔 Tater Totterson is listening for: '{TARGET_WORD}'\")" ] }, @@ -56,18 +56,21 @@ "# `audio-metadata` is installed from a fork to unpin `attrs` from a version that breaks Jupyter\n", "!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' --root-user-action=ignore\n", "\n", - "# Clone the microWakeWord repository\n", + "# Clone the microWakeWord repository (your fork)\n", "repo_path = \"./microWakeWord\"\n", "if not os.path.exists(repo_path):\n", - " print(\"Cloning microWakeWord repository...\")\n", - " !git clone https://github.com/kahrendt/microWakeWord.git {repo_path}\n", + " print(\"⬇️ Cloning microWakeWord repository...\")\n", + " !git clone https://github.com/TaterTotterson/micro-wake-word.git {repo_path}\n", "\n", - "# Ensure the repository exists before attempting to install\n", + "# Optionally pin to a specific commit for reproducibility\n", + "os.system(f\"cd {repo_path} && git checkout ac6502bf48b5e372c47ed509f5f5ca181e6d50bb\")\n", + "\n", + "# Install editable\n", "if os.path.exists(repo_path):\n", - " print(\"Installing microWakeWord...\")\n", + " print(\"📦 Installing microWakeWord...\")\n", " !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n", "else:\n", - " print(f\"Repository not found at {repo_path}. Cloning might have failed.\")" + " print(f\"❌ Repository not found at {repo_path}. Clone might have failed.\")" ] }, { @@ -261,18 +264,28 @@ }, "outputs": [], "source": [ - "# NVIDIA/Linux dataset prep to match the Apple behavior, but without datasets.Audio (no TorchCodec)\n", + "# NVIDIA/Linux dataset prep to match the Apple behavior, but with pinned AudioSet\n", "# MIT RIR -> resample to 16 kHz\n", - "# AudioSet -> NO resample\n", + "# AudioSet -> fetch from a working HF revision, convert to 16 kHz mono, skip bad\n", "# FMA -> resample to 16 kHz mono\n", "\n", - "import os, sys, scipy.io.wavfile, numpy as np\n", + "import os, sys, subprocess, scipy.io.wavfile, numpy as np\n", "from pathlib import Path\n", "from tqdm import tqdm\n", "import soundfile as sf\n", "import librosa\n", "from datasets import load_dataset\n", "\n", + "# -------------------------------------------------\n", + "# small shell helpers (for curl/tar probing)\n", + "# -------------------------------------------------\n", + "def sh(cmd: str) -> int:\n", + " return subprocess.call(cmd, shell=True)\n", + "\n", + "def curl(url: str, out: Path) -> int:\n", + " # -L follow, -s silent, --fail to get nonzero on 404\n", + " return subprocess.call(f\"curl -L -s --fail '{url}' -o '{out}'\", shell=True)\n", + "\n", "def write_wav(dst: Path, data: np.ndarray, sr: int):\n", " x = np.clip(data, -1.0, 1.0)\n", " scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n", @@ -287,10 +300,13 @@ " ok = 0\n", " try:\n", " # Avoid datasets.Audio to keep TorchCodec out:\n", - " # Use streaming=True + Audio(decode=False)-equivalent: access raw file path and decode with librosa\n", + " # Use streaming=True + manual decode with librosa\n", " print(\"⬇️ MIT RIR (streaming + manual decode)…\")\n", - " ds = load_dataset(\"davidscripka/MIT_environmental_impulse_responses\",\n", - " split=\"train\", streaming=True)\n", + " ds = load_dataset(\n", + " \"davidscripka/MIT_environmental_impulse_responses\",\n", + " split=\"train\",\n", + " streaming=True\n", + " )\n", " for i, row in enumerate(tqdm(ds)):\n", " try:\n", " audio_path = row[\"audio\"][\"path\"]\n", @@ -302,7 +318,7 @@ " print(f\"✅ MIT RIR saved: {ok} files\")\n", " except Exception as e:\n", " print(f\"⚠️ MIT RIR download failed: {e}\")\n", - " # Fallback to official ZIP if needed (rare)\n", + " # Fallback ZIP route\n", " try:\n", " print(\"⬇️ MIT RIR (fallback ZIP)…\")\n", " zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n", @@ -313,7 +329,8 @@ " # Normalize to 16k mono\n", " for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n", " a, sr = sf.read(p, always_2d=False)\n", - " if a.ndim > 1: a = a[:,0]\n", + " if a.ndim > 1:\n", + " a = a[:, 0]\n", " if sr != 16000:\n", " a, _ = librosa.load(p, sr=16000, mono=True)\n", " write_wav(p, a, 16000)\n", @@ -323,41 +340,80 @@ "else:\n", " print(\"✅ mit_rirs exists; skipping.\")\n", "\n", - "# -----------------------------\n", - "# AudioSet (NO resample — fast)\n", - "# -----------------------------\n", - "print(\"\\n=== AudioSet subset ===\")\n", + "# ============================================================\n", + "# AudioSet (pinned FLAC .tar → 16k mono, skip bad files)\n", + "# ============================================================\n", + "print(\"\\n=== AudioSet subset (pinned FLAC .tar → 16k mono) ===\")\n", "audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n", "audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n", "\n", - "links = [f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/bal_train0{i}.tar\"\n", - " for i in range(10)]\n", - "for link in links:\n", - " fname = link.split(\"/\")[-1]\n", - " out_tar = audioset_dir / fname\n", - " if not out_tar.exists():\n", - " print(f\"⬇️ {fname}\")\n", - " os.system(f\"wget -q -O '{out_tar}' '{link}'\")\n", - " print(f\"📦 Extract {fname}\")\n", - " os.system(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n", + "if any(audioset_out.rglob(\"*.wav\")):\n", + " print(\"✅ audioset_16k exists; skipping.\")\n", + "else:\n", + " # commits / refs we know about — we’ll probe them\n", + " REV_CANDIDATES = [\n", + " \"6762f044d1c88619c7f2006486036192128fb07e\",\n", + " \"0049167e89f259a010c3f070fe3666d9e5242836\",\n", + " \"ceb9eaaa7844c9ad7351e659c84a572e376ad06d\",\n", + " \"main\", # last resort\n", + " ]\n", + " # possible folder layouts\n", + " TAR_PATTERNS = [\n", + " \"data/bal_train0{idx}.tar\",\n", + " \"data/bal_train/bal_train0{idx}.tar\",\n", + " ]\n", "\n", - "flacs = list(audioset_dir.rglob(\"*.flac\"))\n", - "print(f\"🔎 FLAC files: {len(flacs)}\")\n", - "corrupt = []\n", - "for p in tqdm(flacs, desc=\"AudioSet→WAV (no resample)\"):\n", - " try:\n", - " a, sr = sf.read(p, always_2d=False)\n", - " if a is None or len(a) == 0:\n", - " raise ValueError(\"empty audio\")\n", - " if a.ndim > 1:\n", - " a = a[:,0]\n", - " # Apple behavior: write as 16-bit and label 16 kHz (no resample)\n", - " write_wav(audioset_out / (p.stem + \".wav\"), a, 16000)\n", - " except Exception as e:\n", - " corrupt.append(f\"{p}:{e}\")\n", - "if corrupt:\n", - " (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n", - "print(\"✅ AudioSet processing complete!\")\n", + " def find_working_rev():\n", + " for rev in REV_CANDIDATES:\n", + " for pat in TAR_PATTERNS:\n", + " probe = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{pat.format(idx=0)}\"\n", + " rc = sh(f\"curl -I -L --fail -s '{probe}' > /dev/null\")\n", + " if rc == 0:\n", + " return rev, pat\n", + " return None, None\n", + "\n", + " rev, pattern = find_working_rev()\n", + " if rev is None:\n", + " raise RuntimeError(\"Could not locate an AudioSet revision with FLAC tarballs still present on HF.\")\n", + "\n", + " print(f\"📌 Using AudioSet revision: {rev}\")\n", + " print(f\"🗂️ Tar layout pattern: {pattern}\")\n", + "\n", + " # download + extract bal_train00..09\n", + " for i in range(10):\n", + " rel = pattern.format(idx=i)\n", + " url = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{rel}\"\n", + " fname = rel.split(\"/\")[-1]\n", + " out_tar = audioset_dir / fname\n", + " if not out_tar.exists():\n", + " print(f\"⬇️ {fname}\")\n", + " rc = curl(url, out_tar)\n", + " if rc != 0:\n", + " print(f\"⚠️ Could not fetch {fname} at rev {rev}; continuing.\")\n", + " continue\n", + " print(f\"📦 Extract {fname}\")\n", + " rc = sh(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n", + " if rc != 0:\n", + " print(f\"⚠️ tar extract failed for {fname}; continuing.\")\n", + "\n", + " # convert FLAC → 16k mono WAV\n", + " flacs = list(audioset_dir.rglob(\"*.flac\"))\n", + " print(f\"🔎 FLAC files: {len(flacs)}\")\n", + " audioset_bad = []\n", + " ok = 0\n", + " for p in tqdm(flacs, desc=\"AudioSet→WAV (resample 16k mono)\"):\n", + " try:\n", + " y, _ = librosa.load(p, sr=16000, mono=True)\n", + " if y.size == 0:\n", + " raise ValueError(\"empty audio\")\n", + " write_wav(audioset_out / (p.stem + \".wav\"), y, 16000)\n", + " ok += 1\n", + " except Exception as e:\n", + " audioset_bad.append(f\"{p}:{e}\")\n", + "\n", + " if audioset_bad:\n", + " (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(audioset_bad))\n", + " print(f\"✅ AudioSet complete ({ok} ok, {len(audioset_bad)} failed)\")\n", "\n", "# -----------------------------\n", "# FMA xsmall (resample to 16 kHz mono)\n", @@ -378,7 +434,7 @@ "corrupt = []\n", "for p in tqdm(mp3s, desc=\"FMA→16k WAV\"):\n", " try:\n", - " y, sr = librosa.load(p, sr=16000, mono=True) # proper decode+resample\n", + " y, sr = librosa.load(p, sr=16000, mono=True)\n", " if y.size == 0:\n", " raise ValueError(\"empty audio\")\n", " write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n", @@ -386,6 +442,7 @@ " corrupt.append(f\"{p}:{e}\")\n", "if corrupt:\n", " Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n", + "\n", "print(\"\\n✅ Dataset prep complete!\")" ] },