training cpu fallback

2026-06-12 20:10:19 -06:00 · 2025-12-31 07:51:06 -06:00
parent 4be423b00b
commit e47b6c11c2
1 changed files with 102 additions and 49 deletions
--- a/microWakeWord_training_notebook.ipynb
+++ b/microWakeWord_training_notebook.ipynb
@@ -852,61 +852,107 @@
   },
   "outputs": [],
   "source": [
-    "# Train + export (GPU-friendly env + stable flags)\n",
+    "# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n",
    "import os, sys, subprocess, textwrap\n",
    "\n",
-    "import os, sys, gc, runpy\n",
+    "# ---- Common TF env (applies to BOTH attempts) ----\n",
    "base_env = os.environ.copy()\n",
    "base_env.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\")\n",
    "base_env.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\")          # disable XLA JIT (more stable mem)\n",
    "base_env.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\")                    # allow TF32 (perf/VRAM win on Ampere+)\n",
    "\n",
-    "if \"tensorflow\" not in sys.modules:\n",
+    "# These only matter when a GPU is visible:\n",
-    "    os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\"              # grow as needed\n",
+    "base_env.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\")\n",
-    "    os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\"          # modern CUDA allocator\n",
+    "base_env.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
-    "    os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\"            # disable XLA JIT (more stable mem)\n",
+    "# Optional (uncomment if you want a smaller cuDNN workspace):\n",
-    "    os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\"                      # quieter logs\n",
+    "# base_env.setdefault(\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\", \"256\")\n",
-    "    os.environ[\"NVIDIA_TF32_OVERRIDE\"] = \"1\"                      # allow TF32 (perf/VRAM win on Ampere+)\n",
-    "# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n",
-    "#  os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
    "\n",
-    "import tensorflow as tf\n",
+    "# ---- Training argv (same as your runpy args) ----\n",
    "train_args = [\n",
    "    \"-m\", \"microwakeword.model_train_eval\",\n",
    "    \"--training_config\", \"training_parameters.yaml\",\n",
    "    \"--train\", \"1\",\n",
    "    \"--restore_checkpoint\", \"1\",\n",
    "    \"--test_tf_nonstreaming\", \"0\",\n",
    "    \"--test_tflite_nonstreaming\", \"0\",\n",
    "    \"--test_tflite_nonstreaming_quantized\", \"0\",\n",
    "    \"--test_tflite_streaming\", \"0\",\n",
    "    \"--test_tflite_streaming_quantized\", \"1\",\n",
    "    \"--use_weights\", \"best_weights\",\n",
    "    \"mixednet\",\n",
    "    \"--pointwise_filters\", \"64,64,64,64\",\n",
    "    \"--repeat_in_block\", \"1,1,1,1\",\n",
    "    \"--mixconv_kernel_sizes\", \"[5], [7,11], [9,15], [23]\",\n",
    "    \"--residual_connection\", \"0,0,0,0\",\n",
    "    \"--first_conv_filters\", \"32\",\n",
    "    \"--first_conv_kernel_size\", \"5\",\n",
    "    \"--stride\", \"2\",\n",
    "]\n",
    "\n",
-    "allow_growth = \"\"\n",
+    "# Lowercase substrings searched for in the failed GPU run's (lowercased) output.\n",
    "# Any hit is treated as a GPU/VRAM problem and triggers the CPU retry.\n",
+    "OOM_MARKERS = (\n",
-    "# Per-device memory growth (belt + suspenders)\n",
+    "    \"resourceexhaustederror\",\n",
-    "for g in tf.config.list_physical_devices(\"GPU\"):\n",
+    "    \"resource exhausted\",\n",
-    "    try:\n",
    "    # Not a bare \"oom\": as a substring that also matches \"room\", \"zoom\", ...\n",
    "    # in file paths or tracebacks and would start a pointless CPU retrain.\n",
+    "    \"oom when allocating\",\n",
-    "        tf.config.experimental.set_memory_growth(g, True)\n",
+    "    \"out of memory\",\n",
-    "        allow_growth = \"gpu_allow_growth, \"\n",
+    "    \"cuda_error_out_of_memory\",\n",
-    "    except Exception:\n",
+    "    \"cudnn\",\n",
-    "        pass\n",
+    "    \"failed to allocate\",\n",
-    "print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
+    "    \"blas xgemm\",\n",
-    "gc.collect()\n",
+    "    \"cublas\",\n",
    "    \"internalerror: cuda\",\n",
    "    \"failed call to cuinit\",\n",
    ")\n",
    "\n",
-    "print(f\"✅ Set environment with {allow_growth}cuda_malloc_async, xla_auto_jit=0, min_log_level=2, nvidia_tf2_override\")\n",
+    "def run_training(label: str, extra_env: dict) -> subprocess.CompletedProcess:\n",
-    "print(\"   Starting training...\")\n",
    "    \"\"\"Run one training attempt in a child Python process.\n",
    "\n",
    "    Output is echoed line by line while the child runs, so a long training\n",
    "    job shows progress in the notebook instead of printing everything at the\n",
    "    end. The same text is also kept so the caller can scan it afterwards.\n",
    "\n",
    "    Args:\n",
    "        label: Heading printed before the attempt starts.\n",
    "        extra_env: Environment overrides layered on top of base_env\n",
    "            (may be empty or None).\n",
    "\n",
    "    Returns:\n",
    "        CompletedProcess whose stdout holds the merged stdout/stderr text.\n",
    "    \"\"\"\n",
+    "    env = base_env.copy()\n",
    "    env.update(extra_env or {})\n",
    "    # Ask the child interpreter not to block-buffer its stdout into the pipe.\n",
    "    env.setdefault(\"PYTHONUNBUFFERED\", \"1\")\n",
    "    cmd = [sys.executable] + train_args\n",
    "    print(f\"\\n🚀 {label}\")\n",
    "    print(\"→\", \" \".join(cmd))\n",
    "    captured = []\n",
    "    # Popen + line iteration streams output live; subprocess.run(stdout=PIPE)\n",
    "    # would hold everything back until the child exits.\n",
    "    with subprocess.Popen(\n",
    "        cmd,\n",
    "        env=env,\n",
    "        text=True,\n",
    "        bufsize=1,  # line-buffered on our side of the pipe\n",
    "        stdout=subprocess.PIPE,\n",
    "        stderr=subprocess.STDOUT,\n",
    "    ) as proc:\n",
    "        for line in proc.stdout:\n",
    "            print(line, end=\"\", flush=True)\n",
    "            captured.append(line)\n",
    "    # Leaving the with-block waits for the child, so returncode is set here.\n",
    "    return subprocess.CompletedProcess(cmd, proc.returncode, stdout=\"\".join(captured))\n",
    "\n",
-    "original_argv = list(sys.argv)\n",
+    "# Attempt 1: GPU (normal visibility)\n",
-    "try:\n",
+    "cp = run_training(\n",
-    "    sys.argv = [\n",
+    "    \"Attempt 1/2: GPU training (with allow_growth + cuda_malloc_async)\",\n",
-    "        'model_train_eval.py',\n",
+    "    extra_env={},  # no override\n",
-    "        '--training_config', 'training_parameters.yaml',\n",
+    ")\n",
-    "        '--train', '1',\n",
+    "\n",
-    "        '--restore_checkpoint', '1',\n",
+    "if cp.returncode == 0:\n",
-    "        '--test_tf_nonstreaming', '0',\n",
+    "    print(\"✅ Training and testing complete (GPU path).\")\n",
-    "        '--test_tflite_nonstreaming', '0',\n",
+    "else:\n",
-    "        '--test_tflite_nonstreaming_quantized', '0',\n",
+    "    out_l = (cp.stdout or \"\").lower()\n",
-    "        '--test_tflite_streaming', '0',\n",
+    "    looks_like_gpu_oom = any(m in out_l for m in OOM_MARKERS)\n",
-    "        '--test_tflite_streaming_quantized', '1',\n",
+    "\n",
-    "        '--use_weights', 'best_weights',\n",
+    "    if looks_like_gpu_oom:\n",
-    "        'mixednet',\n",
+    "        # Attempt 2: CPU fallback (hide GPUs completely)\n",
-    "        '--pointwise_filters', '64,64,64,64',\n",
+    "        cp2 = run_training(\n",
-    "        '--repeat_in_block', '1,1,1,1',\n",
+    "            \"Attempt 2/2: CPU fallback (GPU hidden via CUDA_VISIBLE_DEVICES='')\",\n",
-    "        '--mixconv_kernel_sizes', '[5], [7,11], [9,15], [23]',\n",
+    "            extra_env={\n",
-    "        '--residual_connection', '0,0,0,0',\n",
+    "                \"CUDA_VISIBLE_DEVICES\": \"\",   # hard-disable GPU\n",
-    "        '--first_conv_filters', '32',\n",
+    "                # (Optional) makes TF less chatty about GPU init on some builds:\n",
-    "        '--first_conv_kernel_size', '5',\n",
+    "                \"TF_CPP_MIN_LOG_LEVEL\": \"2\",\n",
-    "        '--stride', '2'\n",
+    "            },\n",
-    "    ]\n",
+    "        )\n",
-    "    runpy.run_module(\"microwakeword.model_train_eval\", run_name=\"__main__\", alter_sys=True)\n",
+    "        if cp2.returncode == 0:\n",
-    "finally:\n",
+    "            print(\"✅ Training and testing complete (CPU fallback).\")\n",
-    "    sys.argv = original_argv\n",
+    "        else:\n",
-    "print(\"✅ Training and testing complete.\")\n"
+    "            raise RuntimeError(\n",
    "                \"Training failed on BOTH GPU and CPU.\\n\\n\"\n",
    "                + textwrap.indent(cp2.stdout or \"(no output)\", prefix=\"  \")\n",
    "            )\n",
    "    else:\n",
    "        # Not an OOM-style failure: surface the original error\n",
    "        raise RuntimeError(\n",
    "            \"Training failed (does not look like a VRAM/OOM issue).\\n\\n\"\n",
    "            + textwrap.indent(cp.stdout or \"(no output)\", prefix=\"  \")\n",
    "        )"
   ]
  },
  {
@@ -962,6 +1008,13 @@
    "\"\"\"\n",
    "display(HTML(html))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {