diff --git a/cli/setup_python_venv b/cli/setup_python_venv
index 4a77557..1b3c27b 100755
--- a/cli/setup_python_venv
+++ b/cli/setup_python_venv
@@ -178,6 +178,38 @@ echo " ===== Installing keras ====="
 # keras 3.13 has "issues" so we need to back down to 3.12.
 pip_install "keras==3.12.0"
 
+# Link Triton's bundled libdevice.10.bc into ${DATA_DIR}/cuda/nvvm/libdevice so
+# the training scripts can point XLA at it via --xla_gpu_cuda_data_dir.
+CUDA_DATA_DIR="${DATA_DIR}/cuda"
+LIBDEVICE_DIR="${CUDA_DATA_DIR}/nvvm/libdevice"
+mkdir -p "${LIBDEVICE_DIR}"
+# Emit an absolute path: a relative symlink target would be resolved against
+# ${LIBDEVICE_DIR} (the link's directory), not the current directory.
+TRITON_LIBDEVICE="$(
+  python - <<'PY'
+import glob
+import importlib.util
+import os
+
+rel = os.path.join("backends", "nvidia", "lib", "libdevice.10.bc")
+# Prefer the triton package visible to this interpreter (located, not imported).
+spec = importlib.util.find_spec("triton")
+roots = list(spec.submodule_search_locations or []) if spec else []
+paths = [os.path.join(root, rel) for root in roots]
+# Otherwise search below the current directory.
+paths += sorted(glob.glob("**/site-packages/triton/backends/nvidia/lib/libdevice.10.bc", recursive=True))
+found = next((p for p in paths if os.path.isfile(p)), "")
+print(os.path.abspath(found) if found else "", end="")
+PY
+)"
+
+if [ -n "${TRITON_LIBDEVICE}" ] ; then
+  ln -sf "${TRITON_LIBDEVICE}" "${LIBDEVICE_DIR}/libdevice.10.bc"
+  echo " Linked Triton libdevice.10.bc to ${LIBDEVICE_DIR}"
+else
+  echo " ⚠️ Triton libdevice.10.bc not found; XLA may require --xla_gpu_cuda_data_dir"
+fi
+
 "${PROGDIR}/test_python" --data-dir="${DATA_DIR}"
 touch .mww-data-dir
 
@@ -185,4 +217,4 @@ END_TS=$EPOCHSECONDS
 echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
 
 
-print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
\ No newline at end of file
+print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
diff --git a/cli/wake_word_sample_trainer b/cli/wake_word_sample_trainer
index 433d044..7e172b6 100644
--- a/cli/wake_word_sample_trainer
+++ b/cli/wake_word_sample_trainer
@@ -204,22 +204,6 @@ TRAIN_ARGS=(
   --stride 2
 )
 
-GPU_FALLBACK_MARKERS=(
-  "resourceexhaustederror"
-  "resource exhausted"
-  "oom"
-  "out of memory"
-  "cuda_error_out_of_memory"
-  "failed to allocate"
-  "cudnn"
-  "cublas"
-  "internalerror: cuda"
-  "failed call to cuinit"
-  "dst tensor is not initialized"
-  "failed copying input tensor"
-  "_eagerconst"
-)
-
 run_attempt() {
   local label="$1"
   shift
@@ -239,8 +223,15 @@ run_attempt() {
   return ${PIPESTATUS[0]}
 }
 
+# TF_XLA_FLAGS carries the --tf_xla_* options; the --xla_gpu_* options belong
+# in XLA_FLAGS only. NOTE(review): TensorFlow is expected to abort on options
+# it does not recognize in TF_XLA_FLAGS - confirm against the pinned version.
+# Caller-supplied flags are appended last so they can override these defaults.
+DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0"
+DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
 export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
-export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
+export TF_XLA_FLAGS="${DEFAULT_XLA_FLAGS}${TF_XLA_FLAGS:+ ${TF_XLA_FLAGS}}"
+export XLA_FLAGS="${DEFAULT_XLA_RUNTIME_FLAGS}${XLA_FLAGS:+ ${XLA_FLAGS}}"
 export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
 export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
 export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
@@ -248,30 +239,14 @@ export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
 if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
   echo "✅ Training complete (GPU path)."
 else
-  echo "⚠️ GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…"
+  echo "⚠️ GPU attempt failed. Falling back to CPU."
 
-  log_lc="$(tr '[:upper:]' '[:lower:]' < "${TRAIN_LOG}" || true)"
-  looks_like_gpu_fail="false"
-  for m in "${GPU_FALLBACK_MARKERS[@]}"; do
-    if echo "${log_lc}" | grep -qF "${m}"; then
-      looks_like_gpu_fail="true"
-      break
-    fi
-  done
-
-  if [ "${looks_like_gpu_fail}" = "true" ]; then
-    echo "↪️ Detected GPU/OOM/runtime failure markers. Falling back to CPU."
-
-    export CUDA_VISIBLE_DEVICES=""
-    unset TF_GPU_ALLOCATOR
-    if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
-      echo "✅ Training complete (CPU fallback)."
-    else
-      echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
-      exit 1
-    fi
+  export CUDA_VISIBLE_DEVICES=""
+  unset TF_GPU_ALLOCATOR
+  if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
+    echo "✅ Training complete (CPU fallback)."
   else
-    echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2
+    echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
     exit 1
   fi
 fi
@@ -320,4 +295,4 @@ echo "Metadata: ${json_path}"
 echo
 END_TS=$EPOCHSECONDS
 print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
-echo
\ No newline at end of file
+echo
diff --git a/dockerfile b/dockerfile
index 5778ead..19590fb 100644
--- a/dockerfile
+++ b/dockerfile
@@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 # System deps
 RUN apt-get update && apt-get install -y --no-install-recommends \
     python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
-    git wget curl unzip ca-certificates nano less \
+    git wget curl unzip ca-certificates nano less nvidia-cuda-toolkit \
     && rm -rf /var/lib/apt/lists/* \
     && mkdir -p /data
 
diff --git a/train_wake_word b/train_wake_word
index 73a9110..517580c 100644
--- a/train_wake_word
+++ b/train_wake_word
@@ -70,7 +70,14 @@ START_TS=$EPOCHSECONDS
 export TF_CPP_MIN_LOG_LEVEL=9
 export TF_FORCE_GPU_ALLOW_GROWTH=true
 export TF_GPU_ALLOCATOR=cuda_malloc_async
-export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
+# TF_XLA_FLAGS carries the --tf_xla_* options; the --xla_gpu_* options belong
+# in XLA_FLAGS only. NOTE(review): TensorFlow is expected to abort on options
+# it does not recognize in TF_XLA_FLAGS - confirm against the pinned version.
+# Caller-supplied flags are appended last so they can override these defaults.
+DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0"
+DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
+export TF_XLA_FLAGS="${DEFAULT_XLA_FLAGS}${TF_XLA_FLAGS:+ ${TF_XLA_FLAGS}}"
+export XLA_FLAGS="${DEFAULT_XLA_RUNTIME_FLAGS}${XLA_FLAGS:+ ${XLA_FLAGS}}"
 export NVIDIA_TF32_OVERRIDE=1
 export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
 export GLOG_minloglevel=2
@@ -127,4 +134,4 @@ print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augmen
 print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps"
 python -c $'msg="="*54 ; print(f"{msg:>80s}")'
 print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total"
-python -c $'print(f"{\'=\' * 80}")'
\ No newline at end of file
+python -c $'print(f"{\'=\' * 80}")'