From dc92dc7d8b1ff2f2e739f75afc00683337d1e65d Mon Sep 17 00:00:00 2001
From: George Joseph <g.devel@wxy78.net>
Date: Sat, 20 Dec 2025 10:22:33 -0700
Subject: [PATCH] Update notebook to fix issues with environment inheritance.

Two issues:

* The notebook cell that actually runs model_train_eval was running it in a
  subprocess, so while it inherited environment variables from the running
  python kernel, it couldn't inherit the tensorflow environment from it.
  As a result, the `set_memory_growth(g, True)` and
  `mixed_precision.set_global_policy("mixed_float16")` calls in the previous
  cell were lost.

* TFLite doesn't support "mixed_float16" anyway, and enabling it causes the
  model export to fail spectacularly, so it's kind of a good thing it wasn't
  being applied.

So..

* The tensorflow environment variable and memory_growth setting code was moved
  from the notebook cell that also wrote the config yaml to the next cell
  which does the train and test.  This leaves the "config" cell to just write
  the yaml.  This is really just a cosmetic change to group functionality
  better.

* The code that tried to set "mixed_float16" has been removed, but since
  setting memory_growth to true is a good thing, model_train_eval is now run
  using runpy instead of in a subprocess.  This way it's run in the same
  python kernel instance and tensorflow environment as the rest of the
  notebook and inherits the memory_growth setting.

Resolves: #14
---
 microWakeWord_training_notebook.ipynb | 116 ++++++++++++--------------
 1 file changed, 52 insertions(+), 64 deletions(-)

diff --git a/microWakeWord_training_notebook.ipynb b/microWakeWord_training_notebook.ipynb
index 9b5d728..5fe2110 100644
--- a/microWakeWord_training_notebook.ipynb
+++ b/microWakeWord_training_notebook.ipynb
@@ -742,36 +742,9 @@
    },
    "outputs": [],
    "source": [
-    "# GPU memory config (set env BEFORE importing TF)\n",
-    "import os, sys, gc\n",
-    "\n",
-    "if \"tensorflow\" not in sys.modules:\n",
-    "    os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\"              # grow as needed\n",
-    "    os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\"          # modern CUDA allocator\n",
-    "    os.environ[\"XLA_FLAGS\"] = \"--xla_gpu_cuda_data_dir=/usr/local/cuda\"\n",
-    "    os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\"            # disable XLA JIT (more stable mem)\n",
-    "import tensorflow as tf\n",
-    "\n",
-    "# Per-device memory growth (belt + suspenders)\n",
-    "for g in tf.config.list_physical_devices(\"GPU\"):\n",
-    "    try:\n",
-    "        tf.config.experimental.set_memory_growth(g, True)\n",
-    "    except Exception:\n",
-    "        pass\n",
-    "print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
-    "gc.collect()\n",
-    "\n",
-    "# Optional but recommended: mixed precision halves activation memory\n",
-    "try:\n",
-    "    from tensorflow.keras import mixed_precision\n",
-    "    mixed_precision.set_global_policy(\"mixed_float16\")\n",
-    "    print(\"Mixed precision policy:\", mixed_precision.global_policy())\n",
-    "except Exception as e:\n",
-    "    print(\"Mixed precision not enabled:\", e)\n",
-    "\n",
     "# --- Save a yaml config that controls the training process ---\n",
     "\n",
-    "import yaml\n",
+    "import os, sys, yaml\n",
     "\n",
     "config = {}\n",
     "\n",
@@ -809,7 +782,7 @@
     "with open(\"training_parameters.yaml\", \"w\") as f:\n",
     "    yaml.dump(config, f)\n",
     "\n",
-    "print(\"✅ Wrote training_parameters.yaml (batch_size=16) with allow_growth, cuda_malloc_async, XLA JIT OFF, mixed precision ON.\")"
+    "print(\"✅ Wrote training_parameters.yaml (batch_size=16)\")"
    ]
   },
   {
@@ -822,44 +795,59 @@
    "source": [
     "# Train + export (GPU-friendly env + stable flags)\n",
     "\n",
-    "import os, sys\n",
-    "\n",
-    "# --- Runtime env (inherited by the subprocess we're about to launch) ---\n",
-    "os.environ.setdefault(\"LD_LIBRARY_PATH\",\n",
-    "    \"/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu:\" +\n",
-    "    os.environ.get(\"LD_LIBRARY_PATH\",\"\")\n",
-    ")\n",
-    "os.environ.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\")            # quieter logs\n",
-    "os.environ.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\")    # grow VRAM as needed\n",
-    "os.environ.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")# modern allocator\n",
-    "os.environ.setdefault(\"XLA_FLAGS\", \"--xla_gpu_cuda_data_dir=/usr/local/cuda\")\n",
-    "os.environ.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\")  # disable XLA JIT (more stable)\n",
-    "os.environ.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\")            # allow TF32 (perf/VRAM win on Ampere+)\n",
+    "import os, sys, gc, runpy\n",
     "\n",
+    "if \"tensorflow\" not in sys.modules:\n",
+    "    os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\"              # grow as needed\n",
+    "    os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\"          # modern CUDA allocator\n",
+    "    os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\"            # disable XLA JIT (more stable mem)\n",
+    "    os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\"                      # quieter logs\n",
+    "    os.environ[\"NVIDIA_TF32_OVERRIDE\"] = \"1\"                      # allow TF32 (perf/VRAM win on Ampere+)\n",
     "# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n",
-    "# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
+    "#  os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
     "\n",
-    "# --- Kick off training ---\n",
-    "cmd = f'''\"{sys.executable}\" -m microwakeword.model_train_eval \\\n",
-    "  --training_config=\"training_parameters.yaml\" \\\n",
-    "  --train 1 \\\n",
-    "  --restore_checkpoint 1 \\\n",
-    "  --test_tf_nonstreaming 0 \\\n",
-    "  --test_tflite_nonstreaming 0 \\\n",
-    "  --test_tflite_nonstreaming_quantized 0 \\\n",
-    "  --test_tflite_streaming 0 \\\n",
-    "  --test_tflite_streaming_quantized 1 \\\n",
-    "  --use_weights \"best_weights\" \\\n",
-    "  mixednet \\\n",
-    "  --pointwise_filters \"64,64,64,64\" \\\n",
-    "  --repeat_in_block \"1,1,1,1\" \\\n",
-    "  --mixconv_kernel_sizes \"[5], [7,11], [9,15], [23]\" \\\n",
-    "  --residual_connection \"0,0,0,0\" \\\n",
-    "  --first_conv_filters 32 \\\n",
-    "  --first_conv_kernel_size 5 \\\n",
-    "  --stride 2'''\n",
-    "print(\"Running:\\n\", cmd)\n",
-    "!$cmd"
+    "import tensorflow as tf\n",
+    "\n",
+    "allow_growth = \"\"\n",
+    "# Per-device memory growth (belt + suspenders)\n",
+    "for g in tf.config.list_physical_devices(\"GPU\"):\n",
+    "    try:\n",
+    "        tf.config.experimental.set_memory_growth(g, True)\n",
+    "        allow_growth = \"gpu_allow_growth, \"\n",
+    "    except Exception:\n",
+    "        pass\n",
+    "print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
+    "gc.collect()\n",
+    "\n",
+    "print(f\"✅ Set environment with {allow_growth}cuda_malloc_async, xla_auto_jit=0, min_log_level=2, nvidia_tf32_override\")\n",
+    "print(\"   Starting training...\")\n",
+    "\n",
+    "original_argv = list(sys.argv)\n",
+    "try:\n",
+    "    sys.argv = [\n",
+    "        'model_train_eval.py',\n",
+    "        '--training_config', 'training_parameters.yaml',\n",
+    "        '--train', '1',\n",
+    "        '--restore_checkpoint', '1',\n",
+    "        '--test_tf_nonstreaming', '0',\n",
+    "        '--test_tflite_nonstreaming', '0',\n",
+    "        '--test_tflite_nonstreaming_quantized', '0',\n",
+    "        '--test_tflite_streaming', '0',\n",
+    "        '--test_tflite_streaming_quantized', '1',\n",
+    "        '--use_weights', 'best_weights',\n",
+    "        'mixednet',\n",
+    "        '--pointwise_filters', '64,64,64,64',\n",
+    "        '--repeat_in_block', '1,1,1,1',\n",
+    "        '--mixconv_kernel_sizes', '[5], [7,11], [9,15], [23]',\n",
+    "        '--residual_connection', '0,0,0,0',\n",
+    "        '--first_conv_filters', '32',\n",
+    "        '--first_conv_kernel_size', '5',\n",
+    "        '--stride', '2'\n",
+    "    ]\n",
+    "    runpy.run_module(\"microwakeword.model_train_eval\", run_name=\"__main__\", alter_sys=True)\n",
+    "finally:\n",
+    "    sys.argv = original_argv\n",
+    "print(\"✅ Training and testing complete.\")\n"
    ]
   },
   {