Stream training output live while capturing the full log for error detection

This commit is contained in:
MasterPhooey
2025-12-31 08:24:59 -06:00
parent e47b6c11c2
commit 3dd305e560

View File

@@ -853,6 +853,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n", "# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n",
"# (LIVE streaming output + full log capture for error detection)\n",
"import os, sys, subprocess, textwrap\n", "import os, sys, subprocess, textwrap\n",
"\n", "\n",
"# ---- Common TF env (applies to BOTH attempts) ----\n", "# ---- Common TF env (applies to BOTH attempts) ----\n",
@@ -903,20 +904,39 @@
" \"failed call to cuinit\",\n", " \"failed call to cuinit\",\n",
")\n", ")\n",
"\n", "\n",
"def run_training(label: str, extra_env: dict) -> subprocess.CompletedProcess:\n", "class RunResult:\n",
" def __init__(self, returncode: int, stdout: str):\n",
" self.returncode = returncode\n",
" self.stdout = stdout\n",
"\n",
"def run_training(label: str, extra_env: dict) -> RunResult:\n",
" env = base_env.copy()\n", " env = base_env.copy()\n",
" env.update(extra_env or {})\n", " env.update(extra_env or {})\n",
"\n",
" print(f\"\\n🚀 {label}\")\n", " print(f\"\\n🚀 {label}\")\n",
" print(\"→\", \" \".join([sys.executable] + train_args))\n", " print(\"→\", \" \".join([sys.executable] + train_args))\n",
" cp = subprocess.run(\n", "\n",
" proc = subprocess.Popen(\n",
" [sys.executable] + train_args,\n", " [sys.executable] + train_args,\n",
" env=env,\n", " env=env,\n",
" text=True,\n", " text=True,\n",
" stdout=subprocess.PIPE,\n", " stdout=subprocess.PIPE,\n",
" stderr=subprocess.STDOUT,\n", " stderr=subprocess.STDOUT,\n",
" bufsize=1, # line-buffered (best effort)\n",
" universal_newlines=True,\n",
" )\n", " )\n",
" print(cp.stdout)\n", "\n",
" return cp\n", " full_log = []\n",
" try:\n",
" # Stream lines live AND capture them for OOM detection / error messages\n",
" assert proc.stdout is not None\n",
" for line in proc.stdout:\n",
" print(line, end=\"\")\n",
" full_log.append(line)\n",
" finally:\n",
" returncode = proc.wait()\n",
"\n",
" return RunResult(returncode, \"\".join(full_log))\n",
"\n", "\n",
"# Attempt 1: GPU (normal visibility)\n", "# Attempt 1: GPU (normal visibility)\n",
"cp = run_training(\n", "cp = run_training(\n",