mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
Update notebook to fix issues with environment inheritance.
Two issues:
* The notebook cell that actually runs model_train_eval was running it in a
subprocess, so while it inherited environment variables from the running
python kernel, it couldn't inherit the tensorflow environment from it.
As a result, the `set_memory_growth(g, True)` and
`mixed_precision.set_global_policy("mixed_float16")` calls in the previous
cell were lost.
* TFLite doesn't support "mixed_float16" anyway, and applying it causes the model
export to fail spectacularly, so it's kind of a good thing it wasn't being applied.
So..
* The tensorflow environment variable and memory_growth setting code was moved
from the notebook cell that also wrote the config yaml to the next cell
which does the train and test. This leaves the "config" cell to just write
the yaml. This is really just a cosmetic change to group functionality
better.
* The code that tried to set "mixed_float16" has been removed, but since setting
memory_growth to true is a good thing, model_train_eval is now run using
runpy instead of in a subprocess. This way it's run in the same python kernel
instance and tensorflow environment as the rest of the notebook and inherits
the memory_growth setting.
Resolves: #14
This commit is contained in:
@@ -742,36 +742,9 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# GPU memory config (set env BEFORE importing TF)\n",
|
|
||||||
"import os, sys, gc\n",
|
|
||||||
"\n",
|
|
||||||
"if \"tensorflow\" not in sys.modules:\n",
|
|
||||||
" os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n",
|
|
||||||
" os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n",
|
|
||||||
" os.environ[\"XLA_FLAGS\"] = \"--xla_gpu_cuda_data_dir=/usr/local/cuda\"\n",
|
|
||||||
" os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n",
|
|
||||||
"import tensorflow as tf\n",
|
|
||||||
"\n",
|
|
||||||
"# Per-device memory growth (belt + suspenders)\n",
|
|
||||||
"for g in tf.config.list_physical_devices(\"GPU\"):\n",
|
|
||||||
" try:\n",
|
|
||||||
" tf.config.experimental.set_memory_growth(g, True)\n",
|
|
||||||
" except Exception:\n",
|
|
||||||
" pass\n",
|
|
||||||
"print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
|
|
||||||
"gc.collect()\n",
|
|
||||||
"\n",
|
|
||||||
"# Optional but recommended: mixed precision halves activation memory\n",
|
|
||||||
"try:\n",
|
|
||||||
" from tensorflow.keras import mixed_precision\n",
|
|
||||||
" mixed_precision.set_global_policy(\"mixed_float16\")\n",
|
|
||||||
" print(\"Mixed precision policy:\", mixed_precision.global_policy())\n",
|
|
||||||
"except Exception as e:\n",
|
|
||||||
" print(\"Mixed precision not enabled:\", e)\n",
|
|
||||||
"\n",
|
|
||||||
"# --- Save a yaml config that controls the training process ---\n",
|
"# --- Save a yaml config that controls the training process ---\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import yaml\n",
|
"import os, sys, yaml\n",
|
||||||
"\n",
|
"\n",
|
||||||
"config = {}\n",
|
"config = {}\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -809,7 +782,7 @@
|
|||||||
"with open(\"training_parameters.yaml\", \"w\") as f:\n",
|
"with open(\"training_parameters.yaml\", \"w\") as f:\n",
|
||||||
" yaml.dump(config, f)\n",
|
" yaml.dump(config, f)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"✅ Wrote training_parameters.yaml (batch_size=16) with allow_growth, cuda_malloc_async, XLA JIT OFF, mixed precision ON.\")"
|
"print(\"✅ Wrote training_parameters.yaml (batch_size=16)\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -822,44 +795,59 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# Train + export (GPU-friendly env + stable flags)\n",
|
"# Train + export (GPU-friendly env + stable flags)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import os, sys\n",
|
"import os, sys, gc, runpy\n",
|
||||||
"\n",
|
|
||||||
"# --- Runtime env (inherited by the subprocess we're about to launch) ---\n",
|
|
||||||
"os.environ.setdefault(\"LD_LIBRARY_PATH\",\n",
|
|
||||||
" \"/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu:\" +\n",
|
|
||||||
" os.environ.get(\"LD_LIBRARY_PATH\",\"\")\n",
|
|
||||||
")\n",
|
|
||||||
"os.environ.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\") # quieter logs\n",
|
|
||||||
"os.environ.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\") # grow VRAM as needed\n",
|
|
||||||
"os.environ.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")# modern allocator\n",
|
|
||||||
"os.environ.setdefault(\"XLA_FLAGS\", \"--xla_gpu_cuda_data_dir=/usr/local/cuda\")\n",
|
|
||||||
"os.environ.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\") # disable XLA JIT (more stable)\n",
|
|
||||||
"os.environ.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\") # allow TF32 (perf/VRAM win on Ampere+)\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
|
"if \"tensorflow\" not in sys.modules:\n",
|
||||||
|
" os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n",
|
||||||
|
" os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n",
|
||||||
|
" os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n",
|
||||||
|
" os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\" # quieter logs\n",
|
||||||
|
" os.environ[\"NVIDIA_TF32_OVERRIDE\"] = \"1\" # allow TF32 (perf/VRAM win on Ampere+)\n",
|
||||||
"# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n",
|
"# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n",
|
||||||
"# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
|
"# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# --- Kick off training ---\n",
|
"import tensorflow as tf\n",
|
||||||
"cmd = f'''\"{sys.executable}\" -m microwakeword.model_train_eval \\\n",
|
"\n",
|
||||||
" --training_config=\"training_parameters.yaml\" \\\n",
|
"allow_growth = \"\"\n",
|
||||||
" --train 1 \\\n",
|
"# Per-device memory growth (belt + suspenders)\n",
|
||||||
" --restore_checkpoint 1 \\\n",
|
"for g in tf.config.list_physical_devices(\"GPU\"):\n",
|
||||||
" --test_tf_nonstreaming 0 \\\n",
|
" try:\n",
|
||||||
" --test_tflite_nonstreaming 0 \\\n",
|
" tf.config.experimental.set_memory_growth(g, True)\n",
|
||||||
" --test_tflite_nonstreaming_quantized 0 \\\n",
|
" allow_growth = \"gpu_allow_growth, \"\n",
|
||||||
" --test_tflite_streaming 0 \\\n",
|
" except Exception:\n",
|
||||||
" --test_tflite_streaming_quantized 1 \\\n",
|
" pass\n",
|
||||||
" --use_weights \"best_weights\" \\\n",
|
"print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
|
||||||
" mixednet \\\n",
|
"gc.collect()\n",
|
||||||
" --pointwise_filters \"64,64,64,64\" \\\n",
|
"\n",
|
||||||
" --repeat_in_block \"1,1,1,1\" \\\n",
|
"print(f\"✅ Set environment with {allow_growth}cuda_malloc_async, xla_auto_jit=0, min_log_level=2, nvidia_tf2_override\")\n",
|
||||||
" --mixconv_kernel_sizes \"[5], [7,11], [9,15], [23]\" \\\n",
|
"print(\" Starting training...\")\n",
|
||||||
" --residual_connection \"0,0,0,0\" \\\n",
|
"\n",
|
||||||
" --first_conv_filters 32 \\\n",
|
"original_argv = list(sys.argv)\n",
|
||||||
" --first_conv_kernel_size 5 \\\n",
|
"try:\n",
|
||||||
" --stride 2'''\n",
|
" sys.argv = [\n",
|
||||||
"print(\"Running:\\n\", cmd)\n",
|
" 'model_train_eval.py',\n",
|
||||||
"!$cmd"
|
" '--training_config', 'training_parameters.yaml',\n",
|
||||||
|
" '--train', '1',\n",
|
||||||
|
" '--restore_checkpoint', '1',\n",
|
||||||
|
" '--test_tf_nonstreaming', '0',\n",
|
||||||
|
" '--test_tflite_nonstreaming', '0',\n",
|
||||||
|
" '--test_tflite_nonstreaming_quantized', '0',\n",
|
||||||
|
" '--test_tflite_streaming', '0',\n",
|
||||||
|
" '--test_tflite_streaming_quantized', '1',\n",
|
||||||
|
" '--use_weights', 'best_weights',\n",
|
||||||
|
" 'mixednet',\n",
|
||||||
|
" '--pointwise_filters', '64,64,64,64',\n",
|
||||||
|
" '--repeat_in_block', '1,1,1,1',\n",
|
||||||
|
" '--mixconv_kernel_sizes', '[5], [7,11], [9,15], [23]',\n",
|
||||||
|
" '--residual_connection', '0,0,0,0',\n",
|
||||||
|
" '--first_conv_filters', '32',\n",
|
||||||
|
" '--first_conv_kernel_size', '5',\n",
|
||||||
|
" '--stride', '2'\n",
|
||||||
|
" ]\n",
|
||||||
|
" runpy.run_module(\"microwakeword.model_train_eval\", run_name=\"__main__\", alter_sys=True)\n",
|
||||||
|
"finally:\n",
|
||||||
|
" sys.argv = original_argv\n",
|
||||||
|
"print(\"✅ Training and testing complete.\")\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user