# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors.
#
# Training runs in a child interpreter so that each attempt gets its own
# environment: the retry hides the GPU through CUDA_VISIBLE_DEVICES.
import os
import re
import shlex
import subprocess
import sys
import textwrap
from typing import Dict, List, Optional

# ---- Common TF env (applies to BOTH attempts) ----
# setdefault() everywhere: a value already exported by the user wins.
base_env = os.environ.copy()
base_env.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
base_env.setdefault("TF_XLA_FLAGS", "--tf_xla_auto_jit=0")  # disable XLA JIT (more stable mem)
base_env.setdefault("NVIDIA_TF32_OVERRIDE", "1")  # allow TF32 (perf/VRAM win on Ampere+)
# The child's stdout is a pipe, which Python block-buffers by default;
# unbuffered output keeps the streamed training log live.
base_env.setdefault("PYTHONUNBUFFERED", "1")

# These only matter when a GPU is visible:
base_env.setdefault("TF_FORCE_GPU_ALLOW_GROWTH", "true")
base_env.setdefault("TF_GPU_ALLOCATOR", "cuda_malloc_async")
# Optional (uncomment if you want a smaller cuDNN workspace):
# base_env.setdefault("TF_CUDNN_WORKSPACE_LIMIT_IN_MB", "256")

# ---- Arguments passed to `python -m microwakeword.model_train_eval` ----
train_args = [
    "-m", "microwakeword.model_train_eval",
    "--training_config", "training_parameters.yaml",
    "--train", "1",
    "--restore_checkpoint", "1",
    "--test_tf_nonstreaming", "0",
    "--test_tflite_nonstreaming", "0",
    "--test_tflite_nonstreaming_quantized", "0",
    "--test_tflite_streaming", "0",
    "--test_tflite_streaming_quantized", "1",
    "--use_weights", "best_weights",
    "mixednet",
    "--pointwise_filters", "64,64,64,64",
    "--repeat_in_block", "1,1,1,1",
    "--mixconv_kernel_sizes", "[5], [7,11], [9,15], [23]",
    "--residual_connection", "0,0,0,0",
    "--first_conv_filters", "32",
    "--first_conv_kernel_size", "5",
    "--stride", "2",
]

# Lower-case substrings whose presence in the training log marks the failure
# as GPU-side and therefore worth retrying on CPU.
# NOTE(review): "cudnn" and "cublas" are deliberately broad and may also match
# ordinary start-up log lines -- confirm against real logs before narrowing.
OOM_MARKERS = (
    "resourceexhaustederror",
    "resource exhausted",
    "out of memory",
    "cuda_error_out_of_memory",
    "cudnn",
    "failed to allocate",
    "blas xgemm",
    "cublas",
    "internalerror: cuda",
    "failed call to cuinit",
)

# "oom" must not be a plain substring test: it would also hit "room", "zoom",
# "boom", ... in paths or messages and trigger a pointless CPU retrain.
_OOM_WORD = re.compile(r"(?<![a-z])oom(?![a-z])")

# How many trailing log lines to embed in the exception raised on failure.
_ERROR_TAIL_LINES = 40


def looks_like_gpu_failure(output: Optional[str]) -> bool:
    """Return True when *output* contains a GPU/VRAM failure marker.

    Matching is case-insensitive. ``None`` and the empty string yield False.
    """
    text = (output or "").lower()
    if any(marker in text for marker in OOM_MARKERS):
        return True
    return _OOM_WORD.search(text) is not None


def _tail(text: Optional[str], max_lines: int = _ERROR_TAIL_LINES) -> str:
    """Return the last *max_lines* lines of *text*, or a placeholder if empty."""
    lines = (text or "").splitlines()
    if not lines:
        return "(no output)"
    return "\n".join(lines[-max_lines:])


def run_training(
    label: str,
    extra_env: Optional[Dict[str, str]] = None,
    args: Optional[List[str]] = None,
) -> subprocess.CompletedProcess:
    """Run one training attempt in a child interpreter, streaming its output.

    Args:
        label: Banner printed before the attempt starts.
        extra_env: Environment overrides layered on top of ``base_env``.
        args: Interpreter arguments; defaults to the module-level ``train_args``.

    Returns:
        A ``CompletedProcess`` whose ``stdout`` holds the combined
        stdout/stderr text of the child and whose ``returncode`` is its
        exit status.
    """
    env = base_env.copy()
    env.update(extra_env or {})
    cmd = [sys.executable] + list(train_args if args is None else args)

    print(f"\n🚀 {label}")
    # shlex.join quotes arguments containing spaces/brackets so the echoed
    # command shows the real argument boundaries.
    print("→", shlex.join(cmd))

    captured = []
    with subprocess.Popen(
        cmd,
        env=env,
        text=True,
        errors="replace",  # stray bytes in the log must not crash the wrapper
        bufsize=1,  # line-buffered reads for live progress
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    ) as proc:
        for line in proc.stdout:
            print(line, end="")
            captured.append(line)
    # Leaving the `with` block waits for the child, so returncode is set here.
    return subprocess.CompletedProcess(cmd, proc.returncode, stdout="".join(captured))


def train_with_cpu_fallback() -> subprocess.CompletedProcess:
    """Train on the GPU and retry once on CPU if the failure looks GPU-related.

    Returns:
        The ``CompletedProcess`` of the attempt that succeeded.

    Raises:
        RuntimeError: If the first attempt fails without any GPU marker in its
            output, or if both attempts fail. The message carries the tail of
            the failing attempt's log.
    """
    # Attempt 1: GPU (normal visibility)
    gpu_run = run_training(
        "Attempt 1/2: GPU training (with allow_growth + cuda_malloc_async)",
        extra_env={},  # no override
    )
    if gpu_run.returncode == 0:
        print("✅ Training and testing complete (GPU path).")
        return gpu_run

    if not looks_like_gpu_failure(gpu_run.stdout):
        # Not an OOM-style failure: surface the original error
        raise RuntimeError(
            "Training failed (does not look like a VRAM/OOM issue).\n\n"
            + textwrap.indent(_tail(gpu_run.stdout), prefix="    ")
        )

    # Attempt 2: CPU fallback (hide GPUs completely)
    cpu_run = run_training(
        "Attempt 2/2: CPU fallback (GPU hidden via CUDA_VISIBLE_DEVICES='')",
        extra_env={
            "CUDA_VISIBLE_DEVICES": "",  # hard-disable GPU
            # (Optional) makes TF less chatty about GPU init on some builds:
            "TF_CPP_MIN_LOG_LEVEL": "2",
        },
    )
    if cpu_run.returncode != 0:
        raise RuntimeError(
            "Training failed on BOTH GPU and CPU.\n\n"
            + textwrap.indent(_tail(cpu_run.stdout), prefix="    ")
        )
    print("✅ Training and testing complete (CPU fallback).")
    return cpu_run


# True inside a notebook kernel, so the cell still trains when executed, while
# importing this code elsewhere does not launch a training run.
if __name__ == "__main__":
    cp = train_with_cpu_fallback()