Update notebook to fix issues with environment inheritance.

Two issues:

* The notebook cell that actually runs model_train_eval was running it in a
  subprocess, so while it inherited environment variables from the running
  python kernel, it couldn't inherit the tensorflow environment from it.
  This resulted in the `set_memory_growth(g, True)` and
  `mixed_precision.set_global_policy("mixed_float16")` calls in the previous
  cell being lost.

* TFLite doesn't support "mixed_float16" anyway, and enabling it causes the
  model export to fail spectacularly, so it's kind of a good thing it wasn't
  being applied.

So..

* The code that sets the tensorflow environment variables and memory_growth
  was moved from the notebook cell that also wrote the config yaml to the
  next cell, which does the training and testing.  This leaves the "config"
  cell to just write the yaml.  This is really just a cosmetic change to
  group functionality better.

* The code that tried to set "mixed_float16" has been removed, but since
  setting memory_growth to true is a good thing, model_train_eval is now run
  using runpy instead of in a subprocess.  This way it's run in the same
  python kernel instance and tensorflow environment as the rest of the
  notebook and inherits the memory_growth setting.

Resolves: #14
This commit is contained in:
George Joseph
2025-12-20 10:22:33 -07:00
parent 5487f0869e
commit dc92dc7d8b

View File

@@ -742,36 +742,9 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# GPU memory config (set env BEFORE importing TF)\n",
"import os, sys, gc\n",
"\n",
"if \"tensorflow\" not in sys.modules:\n",
" os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n",
" os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n",
" os.environ[\"XLA_FLAGS\"] = \"--xla_gpu_cuda_data_dir=/usr/local/cuda\"\n",
" os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n",
"import tensorflow as tf\n",
"\n",
"# Per-device memory growth (belt + suspenders)\n",
"for g in tf.config.list_physical_devices(\"GPU\"):\n",
" try:\n",
" tf.config.experimental.set_memory_growth(g, True)\n",
" except Exception:\n",
" pass\n",
"print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
"gc.collect()\n",
"\n",
"# Optional but recommended: mixed precision halves activation memory\n",
"try:\n",
" from tensorflow.keras import mixed_precision\n",
" mixed_precision.set_global_policy(\"mixed_float16\")\n",
" print(\"Mixed precision policy:\", mixed_precision.global_policy())\n",
"except Exception as e:\n",
" print(\"Mixed precision not enabled:\", e)\n",
"\n",
"# --- Save a yaml config that controls the training process ---\n", "# --- Save a yaml config that controls the training process ---\n",
"\n", "\n",
"import yaml\n", "import os, sys, yaml\n",
"\n", "\n",
"config = {}\n", "config = {}\n",
"\n", "\n",
@@ -809,7 +782,7 @@
"with open(\"training_parameters.yaml\", \"w\") as f:\n", "with open(\"training_parameters.yaml\", \"w\") as f:\n",
" yaml.dump(config, f)\n", " yaml.dump(config, f)\n",
"\n", "\n",
"print(\"✅ Wrote training_parameters.yaml (batch_size=16) with allow_growth, cuda_malloc_async, XLA JIT OFF, mixed precision ON.\")" "print(\"✅ Wrote training_parameters.yaml (batch_size=16)\")"
] ]
}, },
{ {
@@ -822,44 +795,59 @@
"source": [ "source": [
"# Train + export (GPU-friendly env + stable flags)\n", "# Train + export (GPU-friendly env + stable flags)\n",
"\n", "\n",
"import os, sys\n", "import os, sys, gc, runpy\n",
"\n",
"# --- Runtime env (inherited by the subprocess we're about to launch) ---\n",
"os.environ.setdefault(\"LD_LIBRARY_PATH\",\n",
" \"/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu:\" +\n",
" os.environ.get(\"LD_LIBRARY_PATH\",\"\")\n",
")\n",
"os.environ.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\") # quieter logs\n",
"os.environ.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\") # grow VRAM as needed\n",
"os.environ.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")# modern allocator\n",
"os.environ.setdefault(\"XLA_FLAGS\", \"--xla_gpu_cuda_data_dir=/usr/local/cuda\")\n",
"os.environ.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\") # disable XLA JIT (more stable)\n",
"os.environ.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\") # allow TF32 (perf/VRAM win on Ampere+)\n",
"\n", "\n",
"if \"tensorflow\" not in sys.modules:\n",
" os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n",
" os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n",
" os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n",
" os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\" # quieter logs\n",
" os.environ[\"NVIDIA_TF32_OVERRIDE\"] = \"1\" # allow TF32 (perf/VRAM win on Ampere+)\n",
"# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n", "# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n",
"# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n", "# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
"\n", "\n",
"# --- Kick off training ---\n", "import tensorflow as tf\n",
"cmd = f'''\"{sys.executable}\" -m microwakeword.model_train_eval \\\n", "\n",
" --training_config=\"training_parameters.yaml\" \\\n", "allow_growth = \"\"\n",
" --train 1 \\\n", "# Per-device memory growth (belt + suspenders)\n",
" --restore_checkpoint 1 \\\n", "for g in tf.config.list_physical_devices(\"GPU\"):\n",
" --test_tf_nonstreaming 0 \\\n", " try:\n",
" --test_tflite_nonstreaming 0 \\\n", " tf.config.experimental.set_memory_growth(g, True)\n",
" --test_tflite_nonstreaming_quantized 0 \\\n", " allow_growth = \"gpu_allow_growth, \"\n",
" --test_tflite_streaming 0 \\\n", " except Exception:\n",
" --test_tflite_streaming_quantized 1 \\\n", " pass\n",
" --use_weights \"best_weights\" \\\n", "print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
" mixednet \\\n", "gc.collect()\n",
" --pointwise_filters \"64,64,64,64\" \\\n", "\n",
" --repeat_in_block \"1,1,1,1\" \\\n", "print(f\"✅ Set environment with {allow_growth}cuda_malloc_async, xla_auto_jit=0, min_log_level=2, nvidia_tf2_override\")\n",
" --mixconv_kernel_sizes \"[5], [7,11], [9,15], [23]\" \\\n", "print(\" Starting training...\")\n",
" --residual_connection \"0,0,0,0\" \\\n", "\n",
" --first_conv_filters 32 \\\n", "original_argv = list(sys.argv)\n",
" --first_conv_kernel_size 5 \\\n", "try:\n",
" --stride 2'''\n", " sys.argv = [\n",
"print(\"Running:\\n\", cmd)\n", " 'model_train_eval.py',\n",
"!$cmd" " '--training_config', 'training_parameters.yaml',\n",
" '--train', '1',\n",
" '--restore_checkpoint', '1',\n",
" '--test_tf_nonstreaming', '0',\n",
" '--test_tflite_nonstreaming', '0',\n",
" '--test_tflite_nonstreaming_quantized', '0',\n",
" '--test_tflite_streaming', '0',\n",
" '--test_tflite_streaming_quantized', '1',\n",
" '--use_weights', 'best_weights',\n",
" 'mixednet',\n",
" '--pointwise_filters', '64,64,64,64',\n",
" '--repeat_in_block', '1,1,1,1',\n",
" '--mixconv_kernel_sizes', '[5], [7,11], [9,15], [23]',\n",
" '--residual_connection', '0,0,0,0',\n",
" '--first_conv_filters', '32',\n",
" '--first_conv_kernel_size', '5',\n",
" '--stride', '2'\n",
" ]\n",
" runpy.run_module(\"microwakeword.model_train_eval\", run_name=\"__main__\", alter_sys=True)\n",
"finally:\n",
" sys.argv = original_argv\n",
"print(\"✅ Training and testing complete.\")\n"
] ]
}, },
{ {