Merge pull request #23 from TaterTotterson/codex/fix-training-errors-with-venv-setup

Improve CUDA/XLA setup and CPU fallback
This commit is contained in:
Tater Totterson
2026-01-20 20:48:50 -06:00
committed by GitHub
4 changed files with 38 additions and 44 deletions

View File

@@ -178,6 +178,26 @@ echo " ===== Installing keras ====="
# keras 3.13 has "issues" so we need to back down to 3.12. # keras 3.13 has "issues" so we need to back down to 3.12.
pip_install "keras==3.12.0" pip_install "keras==3.12.0"
CUDA_DATA_DIR="${DATA_DIR}/cuda"
LIBDEVICE_DIR="${CUDA_DATA_DIR}/nvvm/libdevice"
mkdir -p "${LIBDEVICE_DIR}"
TRITON_LIBDEVICE="$(
python - <<'PY'
import glob
import sys
paths = glob.glob("**/site-packages/triton/backends/nvidia/lib/libdevice.10.bc", recursive=True)
print(paths[0] if paths else "", end="")
PY
)"
if [ -n "${TRITON_LIBDEVICE}" ] ; then
ln -sf "${TRITON_LIBDEVICE}" "${LIBDEVICE_DIR}/libdevice.10.bc"
echo " Linked Triton libdevice.10.bc to ${LIBDEVICE_DIR}"
else
echo " ⚠️ Triton libdevice.10.bc not found; XLA may require --xla_gpu_cuda_data_dir"
fi
"${PROGDIR}/test_python" --data-dir="${DATA_DIR}" "${PROGDIR}/test_python" --data-dir="${DATA_DIR}"
touch .mww-data-dir touch .mww-data-dir
@@ -185,4 +205,4 @@ END_TS=$EPOCHSECONDS
echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell." echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete" print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"

View File

@@ -204,22 +204,6 @@ TRAIN_ARGS=(
--stride 2 --stride 2
) )
GPU_FALLBACK_MARKERS=(
"resourceexhaustederror"
"resource exhausted"
"oom"
"out of memory"
"cuda_error_out_of_memory"
"failed to allocate"
"cudnn"
"cublas"
"internalerror: cuda"
"failed call to cuinit"
"dst tensor is not initialized"
"failed copying input tensor"
"_eagerconst"
)
run_attempt() { run_attempt() {
local label="$1" local label="$1"
shift shift
@@ -239,8 +223,11 @@ run_attempt() {
return ${PIPESTATUS[0]} return ${PIPESTATUS[0]}
} }
DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0 --xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}" export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}" export TF_XLA_FLAGS="${TF_XLA_FLAGS:+${TF_XLA_FLAGS} }${DEFAULT_XLA_FLAGS}"
export XLA_FLAGS="${XLA_FLAGS:+${XLA_FLAGS} }${DEFAULT_XLA_RUNTIME_FLAGS}"
export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}" export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}" export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}" export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
@@ -248,30 +235,14 @@ export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
echo "✅ Training complete (GPU path)." echo "✅ Training complete (GPU path)."
else else
echo "⚠️ GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…" echo "⚠️ GPU attempt failed. Falling back to CPU."
log_lc="$(tr '[:upper:]' '[:lower:]' < "${TRAIN_LOG}" || true)" export CUDA_VISIBLE_DEVICES=""
looks_like_gpu_fail="false" unset TF_GPU_ALLOCATOR
for m in "${GPU_FALLBACK_MARKERS[@]}"; do if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
if echo "${log_lc}" | grep -qF "${m}"; then echo "✅ Training complete (CPU fallback)."
looks_like_gpu_fail="true"
break
fi
done
if [ "${looks_like_gpu_fail}" = "true" ]; then
echo "↪️ Detected GPU/OOM/runtime failure markers. Falling back to CPU."
export CUDA_VISIBLE_DEVICES=""
unset TF_GPU_ALLOCATOR
if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
echo "✅ Training complete (CPU fallback)."
else
echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
exit 1
fi
else else
echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2 echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
exit 1 exit 1
fi fi
fi fi
@@ -320,4 +291,4 @@ echo "Metadata: ${json_path}"
echo echo
END_TS=$EPOCHSECONDS END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Training completed." print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
echo echo

View File

@@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND=noninteractive
# System deps # System deps
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \ python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
git wget curl unzip ca-certificates nano less \ git wget curl unzip ca-certificates nano less nvidia-cuda-toolkit \
&& rm -rf /var/lib/apt/lists/* \ && rm -rf /var/lib/apt/lists/* \
&& mkdir -p /data && mkdir -p /data

View File

@@ -70,7 +70,10 @@ START_TS=$EPOCHSECONDS
export TF_CPP_MIN_LOG_LEVEL=9 export TF_CPP_MIN_LOG_LEVEL=9
export TF_FORCE_GPU_ALLOW_GROWTH=true export TF_FORCE_GPU_ALLOW_GROWTH=true
export TF_GPU_ALLOCATOR=cuda_malloc_async export TF_GPU_ALLOCATOR=cuda_malloc_async
export TF_XLA_FLAGS="--tf_xla_auto_jit=0" DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0 --xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
export TF_XLA_FLAGS="${TF_XLA_FLAGS:+${TF_XLA_FLAGS} }${DEFAULT_XLA_FLAGS}"
export XLA_FLAGS="${XLA_FLAGS:+${XLA_FLAGS} }${DEFAULT_XLA_RUNTIME_FLAGS}"
export NVIDIA_TF32_OVERRIDE=1 export NVIDIA_TF32_OVERRIDE=1
export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512 export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
export GLOG_minloglevel=2 export GLOG_minloglevel=2
@@ -127,4 +130,4 @@ print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augmen
print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps" print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps"
python -c $'msg="="*54 ; print(f"{msg:>80s}")' python -c $'msg="="*54 ; print(f"{msg:>80s}")'
print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total" print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total"
python -c $'print(f"{\'=\' * 80}")' python -c $'print(f"{\'=\' * 80}")'