diff --git a/cli/setup_python_venv b/cli/setup_python_venv
index 4a77557..1b3c27b 100755
--- a/cli/setup_python_venv
+++ b/cli/setup_python_venv
@@ -178,6 +178,40 @@ echo "   ===== Installing keras ====="
 # keras 3.13 has "issues" so we need to back down to 3.12.
 pip_install "keras==3.12.0"
 
+# Expose Triton's bundled libdevice.10.bc under ${DATA_DIR}/cuda, in the
+# nvvm/libdevice layout that --xla_gpu_cuda_data_dir expects.
+CUDA_DATA_DIR="${DATA_DIR}/cuda"
+LIBDEVICE_DIR="${CUDA_DATA_DIR}/nvvm/libdevice"
+mkdir -p "${LIBDEVICE_DIR}"
+TRITON_LIBDEVICE="$(
+    python - <<'PY'
+import glob
+import os
+import sysconfig
+
+rel = os.path.join("triton", "backends", "nvidia", "lib", "libdevice.10.bc")
+
+# Look in the running interpreter's site-packages first; fall back to a
+# recursive search below the current directory.
+dirs = sorted({sysconfig.get_path("purelib"), sysconfig.get_path("platlib")})
+candidates = [os.path.join(d, rel) for d in dirs]
+paths = [p for p in candidates if os.path.isfile(p)]
+if not paths:
+    paths = sorted(glob.glob(os.path.join("**", "site-packages", rel), recursive=True))
+
+# Print an absolute path: a relative symlink target would be resolved
+# against the link's own directory, leaving a dangling link.
+print(os.path.abspath(paths[0]) if paths else "", end="")
+PY
+)"
+
+if [ -n "${TRITON_LIBDEVICE}" ] ; then
+    ln -sf "${TRITON_LIBDEVICE}" "${LIBDEVICE_DIR}/libdevice.10.bc"
+    echo "   Linked Triton libdevice.10.bc to ${LIBDEVICE_DIR}"
+else
+    echo "   ⚠️  Triton libdevice.10.bc not found; XLA may require --xla_gpu_cuda_data_dir"
+fi
+
 "${PROGDIR}/test_python" --data-dir="${DATA_DIR}"
 
 touch .mww-data-dir
@@ -185,4 +219,4 @@ END_TS=$EPOCHSECONDS
 
 echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
 
-print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
\ No newline at end of file
+print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
diff --git a/cli/wake_word_sample_trainer b/cli/wake_word_sample_trainer
index 433d044..7e172b6 100644
--- a/cli/wake_word_sample_trainer
+++ b/cli/wake_word_sample_trainer
@@ -204,22 +204,6 @@ TRAIN_ARGS=(
   --stride 2
 )
 
-GPU_FALLBACK_MARKERS=(
-  "resourceexhaustederror"
-  "resource exhausted"
-  "oom"
-  "out of memory"
-  "cuda_error_out_of_memory"
-  "failed to allocate"
-  "cudnn"
-  "cublas"
-  "internalerror: cuda"
-  "failed call to cuinit"
-  "dst tensor is not initialized"
-  "failed copying input tensor"
-  "_eagerconst"
-)
-
 run_attempt() {
   local label="$1"
   shift
@@ -239,8 +223,14 @@ run_attempt() {
   return ${PIPESTATUS[0]}
 }
 
+# TF_XLA_FLAGS only understands TensorFlow's --tf_xla_* options and rejects
+# anything else, so the --xla_gpu_* options are passed via XLA_FLAGS alone.
+# Defaults are listed first so flags already set by the caller come later.
+DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0"
+DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
 export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
-export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
+export TF_XLA_FLAGS="${DEFAULT_XLA_FLAGS}${TF_XLA_FLAGS:+ ${TF_XLA_FLAGS}}"
+export XLA_FLAGS="${DEFAULT_XLA_RUNTIME_FLAGS}${XLA_FLAGS:+ ${XLA_FLAGS}}"
 export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
 export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
 export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
@@ -248,30 +238,14 @@ export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
 if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
   echo "✅ Training complete (GPU path)."
 else
-  echo "⚠️  GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…"
+  echo "⚠️  GPU attempt failed. Falling back to CPU."
 
-  log_lc="$(tr '[:upper:]' '[:lower:]' < "${TRAIN_LOG}" || true)"
-  looks_like_gpu_fail="false"
-  for m in "${GPU_FALLBACK_MARKERS[@]}"; do
-    if echo "${log_lc}" | grep -qF "${m}"; then
-      looks_like_gpu_fail="true"
-      break
-    fi
-  done
-
-  if [ "${looks_like_gpu_fail}" = "true" ]; then
-    echo "↪️  Detected GPU/OOM/runtime failure markers. Falling back to CPU."
-
-    export CUDA_VISIBLE_DEVICES=""
-    unset TF_GPU_ALLOCATOR
-    if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
-      echo "✅ Training complete (CPU fallback)."
-    else
-      echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
-      exit 1
-    fi
+  export CUDA_VISIBLE_DEVICES=""
+  unset TF_GPU_ALLOCATOR
+  if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
+    echo "✅ Training complete (CPU fallback)."
   else
-    echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2
+    echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
     exit 1
   fi
 fi
@@ -320,4 +294,4 @@ echo "Metadata: ${json_path}"
 echo
 END_TS=$EPOCHSECONDS
 print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
-echo
\ No newline at end of file
+echo
diff --git a/dockerfile b/dockerfile
index 5778ead..19590fb 100644
--- a/dockerfile
+++ b/dockerfile
@@ -6,7 +6,8 @@ ENV DEBIAN_FRONTEND=noninteractive
 # System deps
+# NOTE(review): nvidia-cuda-toolkit is presumably added for the CUDA tools XLA looks for (e.g. ptxas) - confirm.
 RUN apt-get update && apt-get install -y --no-install-recommends \
     python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
-    git wget curl unzip ca-certificates nano less \
+    git wget curl unzip ca-certificates nano less nvidia-cuda-toolkit \
  && rm -rf /var/lib/apt/lists/* \
  && mkdir -p /data
 
diff --git a/train_wake_word b/train_wake_word
index 73a9110..517580c 100644
--- a/train_wake_word
+++ b/train_wake_word
@@ -70,7 +70,13 @@ START_TS=$EPOCHSECONDS
 export TF_CPP_MIN_LOG_LEVEL=9
 export TF_FORCE_GPU_ALLOW_GROWTH=true
 export TF_GPU_ALLOCATOR=cuda_malloc_async
-export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
+# TF_XLA_FLAGS only understands TensorFlow's --tf_xla_* options and rejects
+# anything else, so the --xla_gpu_* options are passed via XLA_FLAGS alone.
+# Defaults are listed first so flags already set by the caller come later.
+DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0"
+DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
+export TF_XLA_FLAGS="${DEFAULT_XLA_FLAGS}${TF_XLA_FLAGS:+ ${TF_XLA_FLAGS}}"
+export XLA_FLAGS="${DEFAULT_XLA_RUNTIME_FLAGS}${XLA_FLAGS:+ ${XLA_FLAGS}}"
 export NVIDIA_TF32_OVERRIDE=1
 export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
 export GLOG_minloglevel=2
@@ -127,4 +133,4 @@ print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augmen
 print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps"
 python -c $'msg="="*54 ; print(f"{msg:>80s}")'
 print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total"
-python -c $'print(f"{\'=\' * 80}")'
\ No newline at end of file
+python -c $'print(f"{\'=\' * 80}")'