mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
training cpu fallback
This commit is contained in:
@@ -852,61 +852,107 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Train + export (GPU-friendly env + stable flags)\n",
|
||||
"# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n",
|
||||
"import os, sys, subprocess, textwrap\n",
|
||||
"\n",
|
||||
"import os, sys, gc, runpy\n",
|
||||
"# ---- Common TF env (applies to BOTH attempts) ----\n",
|
||||
"base_env = os.environ.copy()\n",
|
||||
"base_env.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\")\n",
|
||||
"base_env.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\") # disable XLA JIT (more stable mem)\n",
|
||||
"base_env.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\") # allow TF32 (perf/VRAM win on Ampere+)\n",
|
||||
"\n",
|
||||
"if \"tensorflow\" not in sys.modules:\n",
|
||||
" os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n",
|
||||
" os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n",
|
||||
" os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n",
|
||||
" os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\" # quieter logs\n",
|
||||
" os.environ[\"NVIDIA_TF32_OVERRIDE\"] = \"1\" # allow TF32 (perf/VRAM win on Ampere+)\n",
|
||||
"# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n",
|
||||
"# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
|
||||
"# These only matter when a GPU is visible:\n",
|
||||
"base_env.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\")\n",
|
||||
"base_env.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
|
||||
"# Optional (uncomment if you want a smaller cuDNN workspace):\n",
|
||||
"# base_env.setdefault(\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\", \"256\")\n",
|
||||
"\n",
|
||||
"import tensorflow as tf\n",
|
||||
"\n",
|
||||
"allow_growth = \"\"\n",
|
||||
"# Per-device memory growth (belt + suspenders)\n",
|
||||
"for g in tf.config.list_physical_devices(\"GPU\"):\n",
|
||||
" try:\n",
|
||||
" tf.config.experimental.set_memory_growth(g, True)\n",
|
||||
" allow_growth = \"gpu_allow_growth, \"\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n",
|
||||
"gc.collect()\n",
|
||||
"\n",
|
||||
"print(f\"✅ Set environment with {allow_growth}cuda_malloc_async, xla_auto_jit=0, min_log_level=2, nvidia_tf2_override\")\n",
|
||||
"print(\" Starting training...\")\n",
|
||||
"\n",
|
||||
"original_argv = list(sys.argv)\n",
|
||||
"try:\n",
|
||||
" sys.argv = [\n",
|
||||
" 'model_train_eval.py',\n",
|
||||
" '--training_config', 'training_parameters.yaml',\n",
|
||||
" '--train', '1',\n",
|
||||
" '--restore_checkpoint', '1',\n",
|
||||
" '--test_tf_nonstreaming', '0',\n",
|
||||
" '--test_tflite_nonstreaming', '0',\n",
|
||||
" '--test_tflite_nonstreaming_quantized', '0',\n",
|
||||
" '--test_tflite_streaming', '0',\n",
|
||||
" '--test_tflite_streaming_quantized', '1',\n",
|
||||
" '--use_weights', 'best_weights',\n",
|
||||
" 'mixednet',\n",
|
||||
" '--pointwise_filters', '64,64,64,64',\n",
|
||||
" '--repeat_in_block', '1,1,1,1',\n",
|
||||
" '--mixconv_kernel_sizes', '[5], [7,11], [9,15], [23]',\n",
|
||||
" '--residual_connection', '0,0,0,0',\n",
|
||||
" '--first_conv_filters', '32',\n",
|
||||
" '--first_conv_kernel_size', '5',\n",
|
||||
" '--stride', '2'\n",
|
||||
"# ---- Training argv (same as your runpy args) ----\n",
|
||||
"train_args = [\n",
|
||||
" \"-m\", \"microwakeword.model_train_eval\",\n",
|
||||
" \"--training_config\", \"training_parameters.yaml\",\n",
|
||||
" \"--train\", \"1\",\n",
|
||||
" \"--restore_checkpoint\", \"1\",\n",
|
||||
" \"--test_tf_nonstreaming\", \"0\",\n",
|
||||
" \"--test_tflite_nonstreaming\", \"0\",\n",
|
||||
" \"--test_tflite_nonstreaming_quantized\", \"0\",\n",
|
||||
" \"--test_tflite_streaming\", \"0\",\n",
|
||||
" \"--test_tflite_streaming_quantized\", \"1\",\n",
|
||||
" \"--use_weights\", \"best_weights\",\n",
|
||||
" \"mixednet\",\n",
|
||||
" \"--pointwise_filters\", \"64,64,64,64\",\n",
|
||||
" \"--repeat_in_block\", \"1,1,1,1\",\n",
|
||||
" \"--mixconv_kernel_sizes\", \"[5], [7,11], [9,15], [23]\",\n",
|
||||
" \"--residual_connection\", \"0,0,0,0\",\n",
|
||||
" \"--first_conv_filters\", \"32\",\n",
|
||||
" \"--first_conv_kernel_size\", \"5\",\n",
|
||||
" \"--stride\", \"2\",\n",
|
||||
"]\n",
|
||||
" runpy.run_module(\"microwakeword.model_train_eval\", run_name=\"__main__\", alter_sys=True)\n",
|
||||
"finally:\n",
|
||||
" sys.argv = original_argv\n",
|
||||
"print(\"✅ Training and testing complete.\")\n"
|
||||
"\n",
|
||||
"OOM_MARKERS = (\n",
|
||||
" \"resourceexhaustederror\",\n",
|
||||
" \"resource exhausted\",\n",
|
||||
" \"oom\",\n",
|
||||
" \"out of memory\",\n",
|
||||
" \"cuda_error_out_of_memory\",\n",
|
||||
" \"cudnn\",\n",
|
||||
" \"failed to allocate\",\n",
|
||||
" \"blas xgemm\",\n",
|
||||
" \"cublas\",\n",
|
||||
" \"internalerror: cuda\",\n",
|
||||
" \"failed call to cuinit\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"def run_training(label: str, extra_env: dict) -> subprocess.CompletedProcess:\n",
|
||||
" env = base_env.copy()\n",
|
||||
" env.update(extra_env or {})\n",
|
||||
" print(f\"\\n🚀 {label}\")\n",
|
||||
" print(\"→\", \" \".join([sys.executable] + train_args))\n",
|
||||
" cp = subprocess.run(\n",
|
||||
" [sys.executable] + train_args,\n",
|
||||
" env=env,\n",
|
||||
" text=True,\n",
|
||||
" stdout=subprocess.PIPE,\n",
|
||||
" stderr=subprocess.STDOUT,\n",
|
||||
" )\n",
|
||||
" print(cp.stdout)\n",
|
||||
" return cp\n",
|
||||
"\n",
|
||||
"# Attempt 1: GPU (normal visibility)\n",
|
||||
"cp = run_training(\n",
|
||||
" \"Attempt 1/2: GPU training (with allow_growth + cuda_malloc_async)\",\n",
|
||||
" extra_env={}, # no override\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if cp.returncode == 0:\n",
|
||||
" print(\"✅ Training and testing complete (GPU path).\")\n",
|
||||
"else:\n",
|
||||
" out_l = (cp.stdout or \"\").lower()\n",
|
||||
" looks_like_gpu_oom = any(m in out_l for m in OOM_MARKERS)\n",
|
||||
"\n",
|
||||
" if looks_like_gpu_oom:\n",
|
||||
" # Attempt 2: CPU fallback (hide GPUs completely)\n",
|
||||
" cp2 = run_training(\n",
|
||||
" \"Attempt 2/2: CPU fallback (GPU hidden via CUDA_VISIBLE_DEVICES='')\",\n",
|
||||
" extra_env={\n",
|
||||
" \"CUDA_VISIBLE_DEVICES\": \"\", # hard-disable GPU\n",
|
||||
" # (Optional) makes TF less chatty about GPU init on some builds:\n",
|
||||
" \"TF_CPP_MIN_LOG_LEVEL\": \"2\",\n",
|
||||
" },\n",
|
||||
" )\n",
|
||||
" if cp2.returncode == 0:\n",
|
||||
" print(\"✅ Training and testing complete (CPU fallback).\")\n",
|
||||
" else:\n",
|
||||
" raise RuntimeError(\n",
|
||||
" \"Training failed on BOTH GPU and CPU.\\n\\n\"\n",
|
||||
" + textwrap.indent(cp2.stdout or \"(no output)\", prefix=\" \")\n",
|
||||
" )\n",
|
||||
" else:\n",
|
||||
" # Not an OOM-style failure: surface the original error\n",
|
||||
" raise RuntimeError(\n",
|
||||
" \"Training failed (does not look like a VRAM/OOM issue).\\n\\n\"\n",
|
||||
" + textwrap.indent(cp.stdout or \"(no output)\", prefix=\" \")\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -962,6 +1008,13 @@
|
||||
"\"\"\"\n",
|
||||
"display(HTML(html))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
Reference in New Issue
Block a user