training cpu fallback

This commit is contained in:
MasterPhooey
2025-12-31 07:51:06 -06:00
parent 4be423b00b
commit e47b6c11c2

View File

@@ -852,61 +852,107 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# Train + export (GPU-friendly env + stable flags)\n", "# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n",
"import os, sys, subprocess, textwrap\n",
"\n", "\n",
"import os, sys, gc, runpy\n", "# ---- Common TF env (applies to BOTH attempts) ----\n",
"base_env = os.environ.copy()\n",
"base_env.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\")\n",
"base_env.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\") # disable XLA JIT (more stable mem)\n",
"base_env.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\") # allow TF32 (perf/VRAM win on Ampere+)\n",
"\n", "\n",
"if \"tensorflow\" not in sys.modules:\n", "# These only matter when a GPU is visible:\n",
" os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\" # grow as needed\n", "base_env.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\")\n",
" os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\" # modern CUDA allocator\n", "base_env.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
" os.environ[\"TF_XLA_FLAGS\"] = \"--tf_xla_auto_jit=0\" # disable XLA JIT (more stable mem)\n", "# Optional (uncomment if you want a smaller cuDNN workspace):\n",
" os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\" # quieter logs\n", "# base_env.setdefault(\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\", \"256\")\n",
" os.environ[\"NVIDIA_TF32_OVERRIDE\"] = \"1\" # allow TF32 (perf/VRAM win on Ampere+)\n",
"# If you still hit GPU memory errors, uncomment to force a smaller workspace:\n",
"# os.environ[\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\"] = \"256\"\n",
"\n", "\n",
"import tensorflow as tf\n", "# ---- Training argv (same as your runpy args) ----\n",
"train_args = [\n",
" \"-m\", \"microwakeword.model_train_eval\",\n",
" \"--training_config\", \"training_parameters.yaml\",\n",
" \"--train\", \"1\",\n",
" \"--restore_checkpoint\", \"1\",\n",
" \"--test_tf_nonstreaming\", \"0\",\n",
" \"--test_tflite_nonstreaming\", \"0\",\n",
" \"--test_tflite_nonstreaming_quantized\", \"0\",\n",
" \"--test_tflite_streaming\", \"0\",\n",
" \"--test_tflite_streaming_quantized\", \"1\",\n",
" \"--use_weights\", \"best_weights\",\n",
" \"mixednet\",\n",
" \"--pointwise_filters\", \"64,64,64,64\",\n",
" \"--repeat_in_block\", \"1,1,1,1\",\n",
" \"--mixconv_kernel_sizes\", \"[5], [7,11], [9,15], [23]\",\n",
" \"--residual_connection\", \"0,0,0,0\",\n",
" \"--first_conv_filters\", \"32\",\n",
" \"--first_conv_kernel_size\", \"5\",\n",
" \"--stride\", \"2\",\n",
"]\n",
"\n", "\n",
"allow_growth = \"\"\n", "OOM_MARKERS = (\n",
"# Per-device memory growth (belt + suspenders)\n", " \"resourceexhaustederror\",\n",
"for g in tf.config.list_physical_devices(\"GPU\"):\n", " \"resource exhausted\",\n",
" try:\n", " \"oom\",\n",
" tf.config.experimental.set_memory_growth(g, True)\n", " \"out of memory\",\n",
" allow_growth = \"gpu_allow_growth, \"\n", " \"cuda_error_out_of_memory\",\n",
" except Exception:\n", " \"cudnn\",\n",
" pass\n", " \"failed to allocate\",\n",
"print(\"GPUs:\", tf.config.list_physical_devices(\"GPU\"))\n", " \"blas xgemm\",\n",
"gc.collect()\n", " \"cublas\",\n",
" \"internalerror: cuda\",\n",
" \"failed call to cuinit\",\n",
")\n",
"\n", "\n",
"print(f\"✅ Set environment with {allow_growth}cuda_malloc_async, xla_auto_jit=0, min_log_level=2, nvidia_tf2_override\")\n", "def run_training(label: str, extra_env: dict) -> subprocess.CompletedProcess:\n",
"print(\" Starting training...\")\n", " env = base_env.copy()\n",
" env.update(extra_env or {})\n",
" print(f\"\\n🚀 {label}\")\n",
" print(\"→\", \" \".join([sys.executable] + train_args))\n",
" cp = subprocess.run(\n",
" [sys.executable] + train_args,\n",
" env=env,\n",
" text=True,\n",
" stdout=subprocess.PIPE,\n",
" stderr=subprocess.STDOUT,\n",
" )\n",
" print(cp.stdout)\n",
" return cp\n",
"\n", "\n",
"original_argv = list(sys.argv)\n", "# Attempt 1: GPU (normal visibility)\n",
"try:\n", "cp = run_training(\n",
" sys.argv = [\n", " \"Attempt 1/2: GPU training (with allow_growth + cuda_malloc_async)\",\n",
" 'model_train_eval.py',\n", " extra_env={}, # no override\n",
" '--training_config', 'training_parameters.yaml',\n", ")\n",
" '--train', '1',\n", "\n",
" '--restore_checkpoint', '1',\n", "if cp.returncode == 0:\n",
" '--test_tf_nonstreaming', '0',\n", " print(\"✅ Training and testing complete (GPU path).\")\n",
" '--test_tflite_nonstreaming', '0',\n", "else:\n",
" '--test_tflite_nonstreaming_quantized', '0',\n", " out_l = (cp.stdout or \"\").lower()\n",
" '--test_tflite_streaming', '0',\n", " looks_like_gpu_oom = any(m in out_l for m in OOM_MARKERS)\n",
" '--test_tflite_streaming_quantized', '1',\n", "\n",
" '--use_weights', 'best_weights',\n", " if looks_like_gpu_oom:\n",
" 'mixednet',\n", " # Attempt 2: CPU fallback (hide GPUs completely)\n",
" '--pointwise_filters', '64,64,64,64',\n", " cp2 = run_training(\n",
" '--repeat_in_block', '1,1,1,1',\n", " \"Attempt 2/2: CPU fallback (GPU hidden via CUDA_VISIBLE_DEVICES='')\",\n",
" '--mixconv_kernel_sizes', '[5], [7,11], [9,15], [23]',\n", " extra_env={\n",
" '--residual_connection', '0,0,0,0',\n", " \"CUDA_VISIBLE_DEVICES\": \"\", # hard-disable GPU\n",
" '--first_conv_filters', '32',\n", " # (Optional) makes TF less chatty about GPU init on some builds:\n",
" '--first_conv_kernel_size', '5',\n", " \"TF_CPP_MIN_LOG_LEVEL\": \"2\",\n",
" '--stride', '2'\n", " },\n",
" ]\n", " )\n",
" runpy.run_module(\"microwakeword.model_train_eval\", run_name=\"__main__\", alter_sys=True)\n", " if cp2.returncode == 0:\n",
"finally:\n", " print(\"✅ Training and testing complete (CPU fallback).\")\n",
" sys.argv = original_argv\n", " else:\n",
"print(\"✅ Training and testing complete.\")\n" " raise RuntimeError(\n",
" \"Training failed on BOTH GPU and CPU.\\n\\n\"\n",
" + textwrap.indent(cp2.stdout or \"(no output)\", prefix=\" \")\n",
" )\n",
" else:\n",
" # Not an OOM-style failure: surface the original error\n",
" raise RuntimeError(\n",
" \"Training failed (does not look like a VRAM/OOM issue).\\n\\n\"\n",
" + textwrap.indent(cp.stdout or \"(no output)\", prefix=\" \")\n",
" )"
] ]
}, },
{ {
@@ -962,6 +1008,13 @@
"\"\"\"\n", "\"\"\"\n",
"display(HTML(html))" "display(HTML(html))"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {