mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
tweaks
This commit is contained in:
@@ -139,7 +139,7 @@ training_steps:
|
|||||||
window_step_ms: 10
|
window_step_ms: 10
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
# Replace placeholders (portable)
|
# Replace placeholders
|
||||||
sed -i \
|
sed -i \
|
||||||
-e "s|__WAKEWORD_FEATURES__|${WORK_DIR}/wake_word_samples_augmented|g" \
|
-e "s|__WAKEWORD_FEATURES__|${WORK_DIR}/wake_word_samples_augmented|g" \
|
||||||
-e "s|__NEG_SPEECH__|${TRAINING_DS}/negative_datasets/speech|g" \
|
-e "s|__NEG_SPEECH__|${TRAINING_DS}/negative_datasets/speech|g" \
|
||||||
@@ -152,7 +152,7 @@ sed -i \
|
|||||||
|
|
||||||
# Insert/remove personal block
|
# Insert/remove personal block
|
||||||
if [ "${HAS_PERSONAL}" = "true" ]; then
|
if [ "${HAS_PERSONAL}" = "true" ]; then
|
||||||
# Insert directly after the wakeword feature block (matches notebook: insert(1, ...))
|
# Insert directly after the wakeword feature block
|
||||||
personal_block="$(cat <<EOF
|
personal_block="$(cat <<EOF
|
||||||
- features_dir: ${PERSONAL_FEATURES_DIR}
|
- features_dir: ${PERSONAL_FEATURES_DIR}
|
||||||
penalty_weight: 1.0
|
penalty_weight: 1.0
|
||||||
@@ -165,7 +165,6 @@ EOF
|
|||||||
|
|
||||||
perl -0777 -i -pe "s#__PERSONAL_FEATURE_MARKER__#${personal_block}#g" "${YAML_PATH}"
|
perl -0777 -i -pe "s#__PERSONAL_FEATURE_MARKER__#${personal_block}#g" "${YAML_PATH}"
|
||||||
else
|
else
|
||||||
# Remove marker line entirely
|
|
||||||
sed -i -e "/__PERSONAL_FEATURE_MARKER__/d" "${YAML_PATH}"
|
sed -i -e "/__PERSONAL_FEATURE_MARKER__/d" "${YAML_PATH}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -204,6 +203,22 @@ TRAIN_ARGS=(
|
|||||||
--stride 2
|
--stride 2
|
||||||
)
|
)
|
||||||
|
|
||||||
|
GPU_FALLBACK_MARKERS=(
|
||||||
|
"resourceexhaustederror"
|
||||||
|
"resource exhausted"
|
||||||
|
"oom"
|
||||||
|
"out of memory"
|
||||||
|
"cuda_error_out_of_memory"
|
||||||
|
"failed to allocate"
|
||||||
|
"cudnn"
|
||||||
|
"cublas"
|
||||||
|
"internalerror: cuda"
|
||||||
|
"failed call to cuinit"
|
||||||
|
"dst tensor is not initialized"
|
||||||
|
"failed copying input tensor"
|
||||||
|
"_eagerconst"
|
||||||
|
)
|
||||||
|
|
||||||
run_attempt() {
|
run_attempt() {
|
||||||
local label="$1"
|
local label="$1"
|
||||||
shift
|
shift
|
||||||
@@ -223,11 +238,11 @@ run_attempt() {
|
|||||||
return ${PIPESTATUS[0]}
|
return ${PIPESTATUS[0]}
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFAULT_XLA_FLAGS="--tf_xla_auto_jit=0 --xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
|
# --------- ENV (keep compatible; DO NOT add unsupported XLA flags) ----------
|
||||||
DEFAULT_XLA_RUNTIME_FLAGS="--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found --xla_gpu_cuda_data_dir=${DATA_DIR}/cuda"
|
|
||||||
export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
|
export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
|
||||||
export TF_XLA_FLAGS="${TF_XLA_FLAGS:+${TF_XLA_FLAGS} }${DEFAULT_XLA_FLAGS}"
|
export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
|
||||||
export XLA_FLAGS="${XLA_FLAGS:+${XLA_FLAGS} }${DEFAULT_XLA_RUNTIME_FLAGS}"
|
unset XLA_FLAGS
|
||||||
|
|
||||||
export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
|
export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
|
||||||
export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
|
export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
|
||||||
export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
|
export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
|
||||||
@@ -235,14 +250,35 @@ export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"
|
|||||||
if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
|
if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
|
||||||
echo "✅ Training complete (GPU path)."
|
echo "✅ Training complete (GPU path)."
|
||||||
else
|
else
|
||||||
echo "⚠️ GPU attempt failed. Falling back to CPU."
|
echo "⚠️ GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…"
|
||||||
|
|
||||||
export CUDA_VISIBLE_DEVICES=""
|
log_lc="$(tr '[:upper:]' '[:lower:]' < "${TRAIN_LOG}" || true)"
|
||||||
unset TF_GPU_ALLOCATOR
|
looks_like_gpu_fail="false"
|
||||||
if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
|
for m in "${GPU_FALLBACK_MARKERS[@]}"; do
|
||||||
echo "✅ Training complete (CPU fallback)."
|
if echo "${log_lc}" | grep -qF "${m}"; then
|
||||||
|
looks_like_gpu_fail="true"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${looks_like_gpu_fail}" = "true" ]; then
|
||||||
|
echo "↪️ Detected GPU/OOM/runtime failure markers. Falling back to CPU."
|
||||||
|
|
||||||
|
export CUDA_VISIBLE_DEVICES=""
|
||||||
|
unset TF_GPU_ALLOCATOR
|
||||||
|
|
||||||
|
# CPU attempt should not inherit GPU/XLA runtime knobs
|
||||||
|
unset TF_XLA_FLAGS
|
||||||
|
unset XLA_FLAGS
|
||||||
|
|
||||||
|
if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
|
||||||
|
echo "✅ Training complete (CPU fallback)."
|
||||||
|
else
|
||||||
|
echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
|
echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user