diff --git a/advanced_training_notebook.ipynb b/advanced_training_notebook.ipynb index fbf49c3..f598fda 100644 --- a/advanced_training_notebook.ipynb +++ b/advanced_training_notebook.ipynb @@ -202,31 +202,28 @@ "print(f\"Number of FLAC files found: {len(audioset_files)}\")\n", "\n", "if audioset_files:\n", - " audioset_dataset = Dataset.from_dict({\"audio\": [str(file) for file in audioset_files]})\n", - " audioset_dataset = audioset_dataset.cast_column(\"audio\", Audio(sampling_rate=16000))\n", - "\n", " corrupted_files = []\n", "\n", " print(\"Converting Audioset files to 16kHz WAV...\")\n", - " for row in tqdm(audioset_dataset, desc=\"Processing Audioset files\"):\n", + " for file_path in tqdm(audioset_files, desc=\"Processing Audioset files\"):\n", " try:\n", - " # Define output file name and path\n", - " name = Path(row[\"audio\"][\"path\"]).stem + \".wav\"\n", - " output_path = Path(output_dir) / name\n", - "\n", - " # Check if audio data is valid before writing\n", - " if row[\"audio\"][\"array\"] is None or len(row[\"audio\"][\"array\"]) == 0:\n", - " raise ValueError(f\"Empty or invalid audio data in file: {row['audio']['path']}\")\n", + " # Attempt to load the file and handle any errors\n", + " audio, sampling_rate = sf.read(file_path)\n", + " \n", + " if audio is None or len(audio) == 0:\n", + " raise ValueError(f\"Empty or invalid audio data in file: {file_path}\")\n", "\n", + " # Write int16 WAV with the rate set to 16000 (NOTE(review): no resampling is done here; sampling_rate from sf.read is unused)\n", + " output_path = Path(output_dir) / (file_path.stem + \".wav\")\n", " scipy.io.wavfile.write(\n", " output_path,\n", " 16000,\n", - " (row[\"audio\"][\"array\"] * 32767).astype(np.int16),\n", + " (audio * 32767).astype(np.int16),\n", " )\n", " except (sf.LibsndfileError, ValueError, Exception) as e:\n", " # Log the error and skip the file\n", - " print(f\"Error converting {row['audio']['path']}: {e}\")\n", - " corrupted_files.append(row[\"audio\"][\"path\"])\n", + " print(f\"Error converting {file_path}: {e}\")\n", + " 
corrupted_files.append(str(file_path))\n", "\n", " # Log corrupted files\n", " if corrupted_files:\n",