diff --git a/advanced_training_notebook.ipynb b/advanced_training_notebook.ipynb index 66c9c92..51c3046 100644 --- a/advanced_training_notebook.ipynb +++ b/advanced_training_notebook.ipynb @@ -136,7 +136,7 @@ "id": "YJRG4Qvo9nXG" }, "outputs": [], - "source": [ + "source": [ "# Downloads audio data for augmentation. This can be slow!\n", "# Borrowed from openWakeWord's automatic_model_training.ipynb, accessed March 4, 2024\n", "#\n", @@ -205,8 +205,8 @@ " audioset_dataset = audioset_dataset.cast_column(\"audio\", Audio(sampling_rate=16000))\n", "\n", " corrupted_files = []\n", - " print(\"Converting Audioset files to 16kHz WAV...\")\n", "\n", + " print(\"Converting Audioset files to 16kHz WAV...\")\n", " for row in tqdm(audioset_dataset, desc=\"Processing Audioset files\"):\n", " try:\n", " # Define output file name and path\n", @@ -215,6 +215,8 @@ " \n", " # Read and convert audio\n", " data = row[\"audio\"][\"array\"]\n", + " if data is None or len(data) == 0:\n", + " raise ValueError(f\"Empty audio data in file: {row['audio']['path']}\")\n", " scipy.io.wavfile.write(output_path, 16000, (data * 32767).astype(np.int16))\n", " \n", " except Exception as e:\n",