mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
training live streaming
This commit is contained in:
@@ -853,6 +853,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n",
|
"# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n",
|
||||||
|
"# (LIVE streaming output + full log capture for error detection)\n",
|
||||||
"import os, sys, subprocess, textwrap\n",
|
"import os, sys, subprocess, textwrap\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# ---- Common TF env (applies to BOTH attempts) ----\n",
|
"# ---- Common TF env (applies to BOTH attempts) ----\n",
|
||||||
@@ -903,20 +904,39 @@
|
|||||||
" \"failed call to cuinit\",\n",
|
" \"failed call to cuinit\",\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def run_training(label: str, extra_env: dict) -> subprocess.CompletedProcess:\n",
|
"class RunResult:\n",
|
||||||
|
" def __init__(self, returncode: int, stdout: str):\n",
|
||||||
|
" self.returncode = returncode\n",
|
||||||
|
" self.stdout = stdout\n",
|
||||||
|
"\n",
|
||||||
|
"def run_training(label: str, extra_env: dict) -> RunResult:\n",
|
||||||
" env = base_env.copy()\n",
|
" env = base_env.copy()\n",
|
||||||
" env.update(extra_env or {})\n",
|
" env.update(extra_env or {})\n",
|
||||||
|
"\n",
|
||||||
" print(f\"\\n🚀 {label}\")\n",
|
" print(f\"\\n🚀 {label}\")\n",
|
||||||
" print(\"→\", \" \".join([sys.executable] + train_args))\n",
|
" print(\"→\", \" \".join([sys.executable] + train_args))\n",
|
||||||
" cp = subprocess.run(\n",
|
"\n",
|
||||||
|
" proc = subprocess.Popen(\n",
|
||||||
" [sys.executable] + train_args,\n",
|
" [sys.executable] + train_args,\n",
|
||||||
" env=env,\n",
|
" env=env,\n",
|
||||||
" text=True,\n",
|
" text=True,\n",
|
||||||
" stdout=subprocess.PIPE,\n",
|
" stdout=subprocess.PIPE,\n",
|
||||||
" stderr=subprocess.STDOUT,\n",
|
" stderr=subprocess.STDOUT,\n",
|
||||||
|
" bufsize=1, # line-buffered (best effort)\n",
|
||||||
|
" universal_newlines=True,\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" print(cp.stdout)\n",
|
"\n",
|
||||||
" return cp\n",
|
" full_log = []\n",
|
||||||
|
" try:\n",
|
||||||
|
" # Stream lines live AND capture them for OOM detection / error messages\n",
|
||||||
|
" assert proc.stdout is not None\n",
|
||||||
|
" for line in proc.stdout:\n",
|
||||||
|
" print(line, end=\"\")\n",
|
||||||
|
" full_log.append(line)\n",
|
||||||
|
" finally:\n",
|
||||||
|
" returncode = proc.wait()\n",
|
||||||
|
"\n",
|
||||||
|
" return RunResult(returncode, \"\".join(full_log))\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Attempt 1: GPU (normal visibility)\n",
|
"# Attempt 1: GPU (normal visibility)\n",
|
||||||
"cp = run_training(\n",
|
"cp = run_training(\n",
|
||||||
|
|||||||
Reference in New Issue
Block a user