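tweaks: only fall back to CPU when the training log shows GPU/OOM/runtime markers; default TF_XLA_FLAGS to --tf_xla_auto_jit=0 and unset XLA_FLAGS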

2026-06-12 20:10:19 -06:00 · 2026-01-21 06:03:57 -06:00
parent cc05f94392
commit 423bbd15f5
1 changed files with 50 additions and 14 deletions
--- a/cli/wake_word_sample_trainer
+++ b/cli/wake_word_sample_trainer
@@ -139,7 +139,7 @@ training_steps:
 window_step_ms: 10
 EOF
-# Replace placeholders (portable)
+# Replace placeholders
 sed -i \
  -e "s|__WAKEWORD_FEATURES__|${WORK_DIR}/wake_word_samples_augmented|g" \
  -e "s|__NEG_SPEECH__|${TRAINING_DS}/negative_datasets/speech|g" \
@@ -152,7 +152,7 @@ sed -i \
 # Insert/remove personal block
 if [ "${HAS_PERSONAL}" = "true" ]; then
-  # Insert directly after the wakeword feature block (matches notebook: insert(1, ...))
+  # Insert directly after the wakeword feature block
  personal_block="$(cat <<EOF
 - features_dir: ${PERSONAL_FEATURES_DIR}
  penalty_weight: 1.0
@@ -165,7 +165,6 @@ EOF
  perl -0777 -i -pe "s#__PERSONAL_FEATURE_MARKER__#${personal_block}#g" "${YAML_PATH}"
 else
  # Remove marker line entirely
  sed -i -e "/__PERSONAL_FEATURE_MARKER__/d" "${YAML_PATH}"
 fi
@@ -204,6 +203,22 @@ TRAIN_ARGS=(
  --stride 2
 )
 # Substrings that classify a failed training run as a GPU/OOM/runtime problem.
 # The fallback check lowercases the training log and tests each entry with
 # `grep -F`, so every entry must be lowercase and matches anywhere in a line
 # (a short entry such as "oom" also matches inside longer words).
 GPU_FALLBACK_MARKERS=(
   # memory exhaustion
   "resourceexhaustederror" "resource exhausted" "oom" "out of memory"
   "cuda_error_out_of_memory" "failed to allocate"
   # CUDA libraries / driver initialisation
   "cudnn" "cublas" "internalerror: cuda" "failed call to cuinit"
   # tensor setup / copy messages
   "dst tensor is not initialized" "failed copying input tensor" "_eagerconst"
 )
 run_attempt() {
  local label="$1"
  shift
@@ -223,11 +238,11 @@ run_attempt() {
  return ${PIPESTATUS[0]}
 }
-DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0 --xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
+# --------- ENV (keep compatible; DO NOT add unsupported XLA flags) ----------
 # NOTE(review): DEFAULT_XLA_RUNTIME_FLAGS is not referenced by any line kept
 # in this hunk now that XLA_FLAGS is unset instead of extended — confirm it
 # is unused elsewhere in the script and drop it if so.
 DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
 # Each export below keeps a value already present in the environment and
 # only supplies a default when the variable is unset or empty.
 export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
-export TF_XLA_FLAGS="${TF_XLA_FLAGS:+${TF_XLA_FLAGS} }${DEFAULT_XLA_FLAGS}"
+export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
-export XLA_FLAGS="${XLA_FLAGS:+${XLA_FLAGS} }${DEFAULT_XLA_RUNTIME_FLAGS}"
+unset XLA_FLAGS
 # NOTE(review): unsetting XLA_FLAGS also discards any value the caller
 # exported; the removed line used to append to it — confirm this is intended.
 export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
 export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
 export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
@@ -235,14 +250,35 @@ export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
 # Run training on the GPU first. If that attempt fails, scan ${TRAIN_LOG}
 # for any GPU_FALLBACK_MARKERS entry: retry once on CPU only when a marker
 # is found, otherwise exit 1 without retrying.
 if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
  echo "✅ Training complete (GPU path)."
 else
-  echo "⚠️  GPU attempt failed. Falling back to CPU."
+  echo "⚠️  GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…"
-  export CUDA_VISIBLE_DEVICES=""
+  log_lc="$(tr '[:upper:]' '[:lower:]' < "${TRAIN_LOG}" || true)"
-  unset TF_GPU_ALLOCATOR
+  looks_like_gpu_fail="false"
-  if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
+  for m in "${GPU_FALLBACK_MARKERS[@]}"; do
-    echo "✅ Training complete (CPU fallback)."
+    if echo "${log_lc}" | grep -qF "${m}"; then
      # NOTE(review): if this script enables `set -o pipefail`, the
      # `echo … | grep -q` test can be reported as failed even on a match
      # (grep exits early, echo may get SIGPIPE on a large log) — confirm.
      # `grep -qiF -- "${m}" "${TRAIN_LOG}"` would avoid both the pipe and
      # the in-memory lowercase copy of the log.
      looks_like_gpu_fail="true"
      break
    fi
  done
  if [ "${looks_like_gpu_fail}" = "true" ]; then
    echo "↪️  Detected GPU/OOM/runtime failure markers. Falling back to CPU."
    # Retry with CUDA_VISIBLE_DEVICES emptied and the GPU allocator override removed.
    export CUDA_VISIBLE_DEVICES=""
    unset TF_GPU_ALLOCATOR
    # CPU attempt should not inherit GPU/XLA runtime knobs
    unset TF_XLA_FLAGS
    unset XLA_FLAGS
    if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
      echo "✅ Training complete (CPU fallback)."
    else
      echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
      exit 1
    fi
  else
    # No marker matched (this is also the outcome when the log could not be
    # read, because log_lc is then empty): report the failure, no CPU retry.
-    echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
+    echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2
    exit 1
  fi
 fi