This commit is contained in:
MasterPhooey
2026-03-10 08:05:36 -05:00
parent 94903783cb
commit 7b92de81f9
3 changed files with 21 additions and 10 deletions

View File

@@ -72,7 +72,14 @@ if ${GPU} ; then
fi
VENV="${DATA_DIR}/.venv"
[ -n "${VIRTUAL_ENV}" ] && deactivate
# Leave any virtualenv inherited from the caller, unless it is already the
# one this script manages (${VENV}).
if [[ -n "${VIRTUAL_ENV:-}" && "${VIRTUAL_ENV}" != "${VENV}" ]] ; then
  if ! command -v deactivate >/dev/null 2>&1 ; then
    # Recorder process can inherit VIRTUAL_ENV without the shell function.
    unset VIRTUAL_ENV
  else
    # Best effort: the exit status of deactivate is deliberately ignored.
    deactivate || true
  fi
fi
if [ -n "${PYTHON}" ] ; then
PYTHONS=( "${PYTHON}" )

View File

@@ -82,14 +82,14 @@ ALLOW_CPU_FALLBACK="$(normalize_bool "${MWW_ALLOW_CPU_FALLBACK:-${ALLOW_CPU_FALL
# Report the GPU compatibility policy when a Blackwell card was detected.
case "${IS_BLACKWELL}" in
  true)
    echo " Blackwell GPU detected (compute capability ${GPU_COMPUTE_CAPABILITY})."
    echo " Using GPU compatibility retries; CPU fallback is ${ALLOW_CPU_FALLBACK} (override with MWW_ALLOW_CPU_FALLBACK=true|false)."
    ;;
esac
# Force driver PTX fallback when XLA needs ptxas.
if [ -z "${XLA_FLAGS:-}" ]; then
export XLA_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found"
echo " Setting XLA_FLAGS=${XLA_FLAGS}"
else
echo " Using user-provided XLA_FLAGS=${XLA_FLAGS}"
fi
# Enable driver-side PTX JIT fallback when ptxas/nvlink are unavailable.
# A non-empty XLA_FLAGS from the caller is kept untouched and only reported;
# the fallback flag is exported solely when nothing was provided.
if [ -n "${XLA_FLAGS:-}" ]; then
  echo " Using user-provided XLA_FLAGS=${XLA_FLAGS}"
else
  XLA_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found"
  export XLA_FLAGS
  echo " Setting XLA_FLAGS=${XLA_FLAGS}"
fi
check_directories() {
@@ -251,6 +251,9 @@ GPU_FALLBACK_MARKERS=(
"cuda_error_out_of_memory"
"cuda_error_invalid_handle"
"culaunchkernel"
"no ptx compilation provider is available"
"couldn't find a suitable version of ptxas"
"couldn't find a suitable version of nvlink"
"failed to allocate"
"cudnn"
"cublas"

View File

@@ -98,13 +98,14 @@ export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
# Pick GPU allocator defaults depending on whether a Blackwell GPU was detected.
# NOTE(review): ${IS_BLACKWELL} is executed as a command here, so it presumably
# holds the literal `true` or `false` -- confirm where it is assigned. An empty
# value would expand to an empty command, which the shell treats as success.
if ${IS_BLACKWELL} ; then
# TF 2.20 + Blackwell is often unstable with cuda_malloc_async.
unset TF_GPU_ALLOCATOR
# Supply the driver PTX fallback flag only when the caller left XLA_FLAGS empty.
[ -z "${XLA_FLAGS:-}" ] && export XLA_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found"
echo " Blackwell detected (compute capability ${GPU_COMPUTE_CAPABILITY}): using compatibility GPU defaults."
else
# Keep a caller-provided allocator; otherwise default to cuda_malloc_async.
export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
# Drop any inherited XLA_FLAGS on the non-Blackwell path.
unset XLA_FLAGS
fi
# Enable driver-side PTX JIT fallback when ptxas/nvlink are unavailable.
# A non-empty XLA_FLAGS supplied by the caller is kept as-is.
if [ -z "${XLA_FLAGS:-}" ] ; then
  XLA_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found"
  export XLA_FLAGS
fi
# Fixed runtime settings exported for every child process.
export NVIDIA_TF32_OVERRIDE=1 \
  TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512 \
  GLOG_minloglevel=2