diff --git a/cli/setup_python_venv b/cli/setup_python_venv index 1b3c27b..8d82211 100755 --- a/cli/setup_python_venv +++ b/cli/setup_python_venv @@ -178,24 +178,31 @@ echo " ===== Installing keras =====" # keras 3.13 has "issues" so we need to back down to 3.12. pip_install "keras==3.12.0" -CUDA_DATA_DIR="${DATA_DIR}/cuda" -LIBDEVICE_DIR="${CUDA_DATA_DIR}/nvvm/libdevice" -mkdir -p "${LIBDEVICE_DIR}" -TRITON_LIBDEVICE="$( - python - <<'PY' -import glob -import sys +# ----------------------------------------------------------------------------- +# Optional CUDA data dir (GPU-only) +# Some stacks expect a CUDA "nvvm/libdevice" tree. We create one in /data/cuda +# and link Triton's libdevice if it exists. This is safe and does NOT enable +# any extra XLA flags by itself. +# ----------------------------------------------------------------------------- +if ${GPU} ; then + CUDA_DATA_DIR="${DATA_DIR}/cuda" + LIBDEVICE_DIR="${CUDA_DATA_DIR}/nvvm/libdevice" + mkdir -p "${LIBDEVICE_DIR}" + TRITON_LIBDEVICE="$( + python - <<'PY' +import glob paths = glob.glob("**/site-packages/triton/backends/nvidia/lib/libdevice.10.bc", recursive=True) print(paths[0] if paths else "", end="") PY -)" + )" -if [ -n "${TRITON_LIBDEVICE}" ] ; then - ln -sf "${TRITON_LIBDEVICE}" "${LIBDEVICE_DIR}/libdevice.10.bc" - echo " Linked Triton libdevice.10.bc to ${LIBDEVICE_DIR}" -else - echo " ⚠️ Triton libdevice.10.bc not found; XLA may require --xla_gpu_cuda_data_dir" + if [ -n "${TRITON_LIBDEVICE}" ] ; then + ln -sf "${TRITON_LIBDEVICE}" "${LIBDEVICE_DIR}/libdevice.10.bc" + echo " Linked Triton libdevice.10.bc to ${LIBDEVICE_DIR}" + else + echo " ℹ️ Triton libdevice.10.bc not found (ok)" + fi fi "${PROGDIR}/test_python" --data-dir="${DATA_DIR}" @@ -205,4 +212,4 @@ END_TS=$EPOCHSECONDS echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell." -print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete" +print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete" \ No newline at end of file diff --git a/train_wake_word b/train_wake_word index 517580c..5546603 100644 --- a/train_wake_word +++ b/train_wake_word @@ -67,17 +67,23 @@ echo "===== Running '${WAKE_WORD}(${WAKE_WORD_TITLE})' generation, augmentation echo START_TS=$EPOCHSECONDS +# ----------------------------------------------------------------------------- +# TensorFlow / XLA environment (known-good, portable) +# ----------------------------------------------------------------------------- export TF_CPP_MIN_LOG_LEVEL=9 export TF_FORCE_GPU_ALLOW_GROWTH=true export TF_GPU_ALLOCATOR=cuda_malloc_async -DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0 --xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda" -DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda" -export TF_XLA_FLAGS="${TF_XLA_FLAGS:+${TF_XLA_FLAGS} }${DEFAULT_XLA_FLAGS}" -export XLA_FLAGS="${XLA_FLAGS:+${XLA_FLAGS} }${DEFAULT_XLA_RUNTIME_FLAGS}" + +# Hard-set TF_XLA_FLAGS to ONLY what we know this build supports. +# Do NOT append user environment flags (can cause hard failures). +export TF_XLA_FLAGS="--tf_xla_auto_jit=0" +unset XLA_FLAGS + export NVIDIA_TF32_OVERRIDE=1 export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512 export GLOG_minloglevel=2 export GRPC_VERBOSITY=ERROR +# ----------------------------------------------------------------------------- "${CLIDIR}/wake_word_sample_generator" \ --samples=${SAMPLES} \ @@ -130,4 +136,4 @@ print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augmen print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps" python -c $'msg="="*54 ; print(f"{msg:>80s}")' print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total" -python -c $'print(f"{\'=\' * 80}")' +python -c $'print(f"{\'=\' * 80}")' \ No newline at end of file