mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
Merge pull request #15 from gtjoseph/main-fix-notebook
Update notebook to fix issues with environment inheritance.
This commit is contained in:
@@ -742,36 +742,9 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# GPU memory config (set env BEFORE importing TF)\n",
|
||||
"import os, sys, gc\n",
|
||||
"\n",
|
||||
"if \"tensorflow\" not in sys.modules:\n",
|
||||
" os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n",
|
||||
" os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n",
|
||||
" os.environ[\"XLA_FLAGS\"] = \"--xla_gpu_cuda_data_dir=/usr/local/cuda\"\n",
|
||||
" os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n",
|
||||
"import tensorflow as tf\n",
|
||||
"\n",
|
||||
"# Per-device memory growth (belt + suspenders)\n",
|
||||
"for g in tf.config.list_physical_devices(\"GPU\"):\n",
|
||||
" try:\n",
|
||||
" tf.config.experimental.set_memory_growth(g, True)\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
|
||||
"gc.collect()\n",
|
||||
"\n",
|
||||
"# Optional but recommended: mixed precision halves activation memory\n",
|
||||
"try:\n",
|
||||
" from tensorflow.keras import mixed_precision\n",
|
||||
" mixed_precision.set_global_policy(\"mixed_float16\")\n",
|
||||
" print(\"Mixed precision policy:\", mixed_precision.global_policy())\n",
|
||||
"except Exception as e:\n",
|
||||
" print(\"Mixed precision not enabled:\", e)\n",
|
||||
"\n",
|
||||
"# --- Save a yaml config that controls the training process ---\n",
|
||||
"\n",
|
||||
"import yaml\n",
|
||||
"import os, sys, yaml\n",
|
||||
"\n",
|
||||
"config = {}\n",
|
||||
"\n",
|
||||
@@ -809,7 +782,7 @@
|
||||
"with open(\"training_parameters.yaml\", \"w\") as f:\n",
|
||||
" yaml.dump(config, f)\n",
|
||||
"\n",
|
||||
"print(\"✅ Wrote training_parameters.yaml (batch_size=16) with allow_growth, cuda_malloc_async, XLA JIT OFF, mixed precision ON.\")"
|
||||
"print(\"✅ Wrote training_parameters.yaml (batch_size=16)\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -822,44 +795,59 @@
|
||||
"source": [
|
||||
"# Train + export (GPU-friendly env + stable flags)\n",
|
||||
"\n",
|
||||
"import os, sys\n",
|
||||
"\n",
|
||||
"# --- Runtime env (inherited by the subprocess we're about to launch) ---\n",
|
||||
"os.environ.setdefault(\"LD_LIBRARY_PATH\",\n",
|
||||
" \"/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu:\" +\n",
|
||||
" os.environ.get(\"LD_LIBRARY_PATH\",\"\")\n",
|
||||
")\n",
|
||||
"os.environ.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\") # quieter logs\n",
|
||||
"os.environ.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\") # grow VRAM as needed\n",
|
||||
"os.environ.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")# modern allocator\n",
|
||||
"os.environ.setdefault(\"XLA_FLAGS\", \"--xla_gpu_cuda_data_dir=/usr/local/cuda\")\n",
|
||||
"os.environ.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\") # disable XLA JIT (more stable)\n",
|
||||
"os.environ.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\") # allow TF32 (perf/VRAM win on Ampere+)\n",
|
||||
"import os, sys, gc, runpy\n",
|
||||
"\n",
|
||||
"if \"tensorflow\" not in sys.modules:\n",
|
||||
" os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n",
|
||||
" os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n",
|
||||
" os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n",
|
||||
" os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\" # quieter logs\n",
|
||||
" os.environ[\"NVIDIA_TF32_OVERRIDE\"] = \"1\" # allow TF32 (perf/VRAM win on Ampere+)\n",
|
||||
"# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n",
|
||||
"# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
|
||||
"# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
|
||||
"\n",
|
||||
"# --- Kick off training ---\n",
|
||||
"cmd = f'''\"{sys.executable}\" -m microwakeword.model_train_eval \\\n",
|
||||
" --training_config=\"training_parameters.yaml\" \\\n",
|
||||
" --train 1 \\\n",
|
||||
" --restore_checkpoint 1 \\\n",
|
||||
" --test_tf_nonstreaming 0 \\\n",
|
||||
" --test_tflite_nonstreaming 0 \\\n",
|
||||
" --test_tflite_nonstreaming_quantized 0 \\\n",
|
||||
" --test_tflite_streaming 0 \\\n",
|
||||
" --test_tflite_streaming_quantized 1 \\\n",
|
||||
" --use_weights \"best_weights\" \\\n",
|
||||
" mixednet \\\n",
|
||||
" --pointwise_filters \"64,64,64,64\" \\\n",
|
||||
" --repeat_in_block \"1,1,1,1\" \\\n",
|
||||
" --mixconv_kernel_sizes \"[5], [7,11], [9,15], [23]\" \\\n",
|
||||
" --residual_connection \"0,0,0,0\" \\\n",
|
||||
" --first_conv_filters 32 \\\n",
|
||||
" --first_conv_kernel_size 5 \\\n",
|
||||
" --stride 2'''\n",
|
||||
"print(\"Running:\\n\", cmd)\n",
|
||||
"!$cmd"
|
||||
"import tensorflow as tf\n",
|
||||
"\n",
|
||||
"allow_growth = \"\"\n",
|
||||
"# Per-device memory growth (belt + suspenders)\n",
|
||||
"for g in tf.config.list_physical_devices(\"GPU\"):\n",
|
||||
" try:\n",
|
||||
" tf.config.experimental.set_memory_growth(g, True)\n",
|
||||
" allow_growth = \"gpu_allow_growth, \"\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
|
||||
"gc.collect()\n",
|
||||
"\n",
|
||||
"print(f\"✅ Set environment with {allow_growth}cuda_malloc_async, xla_auto_jit=0, min_log_level=2, nvidia_tf2_override\")\n",
|
||||
"print(\" Starting training...\")\n",
|
||||
"\n",
|
||||
"original_argv = list(sys.argv)\n",
|
||||
"try:\n",
|
||||
" sys.argv = [\n",
|
||||
" 'model_train_eval.py',\n",
|
||||
" '--training_config', 'training_parameters.yaml',\n",
|
||||
" '--train', '1',\n",
|
||||
" '--restore_checkpoint', '1',\n",
|
||||
" '--test_tf_nonstreaming', '0',\n",
|
||||
" '--test_tflite_nonstreaming', '0',\n",
|
||||
" '--test_tflite_nonstreaming_quantized', '0',\n",
|
||||
" '--test_tflite_streaming', '0',\n",
|
||||
" '--test_tflite_streaming_quantized', '1',\n",
|
||||
" '--use_weights', 'best_weights',\n",
|
||||
" 'mixednet',\n",
|
||||
" '--pointwise_filters', '64,64,64,64',\n",
|
||||
" '--repeat_in_block', '1,1,1,1',\n",
|
||||
" '--mixconv_kernel_sizes', '[5], [7,11], [9,15], [23]',\n",
|
||||
" '--residual_connection', '0,0,0,0',\n",
|
||||
" '--first_conv_filters', '32',\n",
|
||||
" '--first_conv_kernel_size', '5',\n",
|
||||
" '--stride', '2'\n",
|
||||
" ]\n",
|
||||
" runpy.run_module(\"microwakeword.model_train_eval\", run_name=\"__main__\", alter_sys=True)\n",
|
||||
"finally:\n",
|
||||
" sys.argv = original_argv\n",
|
||||
"print(\"✅ Training and testing complete.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user