#!/bin/bash
#
# Train a microWakeWord model for one wake word and package the result.
#
# Flow:
#   1. Read arguments (parsing is delegated to the sourced shell.functions).
#   2. Write trained_models/training_parameters.yaml under the work directory.
#   3. Run microwakeword.model_train_eval; if the run fails and the log
#      contains a known GPU/OOM marker, retry once with the GPU hidden.
#   4. Copy the streaming quantized .tflite model, the training logs and a
#      JSON metadata file into a timestamped directory under
#      ${DATA_DIR}/output.
#
# NOTE(review): POSITIONAL_ARGS, UNKNOWN_ARGS, HELP, DATA_DIR, SAMPLES,
# TRAINING_STEPS, DEFAULT_TRAINING_STEPS and print_elapsed_time are not
# defined in this file; they are presumably provided by shell.functions
# based on KNOWN_ARGS - confirm against that file.

set -e

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# Long options this script accepts; read by shell.functions (presumably).
# shellcheck disable=SC2034
KNOWN_ARGS=( training-steps samples data-dir )

# shellcheck source=/dev/null
source "${PROGDIR}/shell.functions"

WAKE_WORD="${POSITIONAL_ARGS[0]}"

if [[ ${#UNKNOWN_ARGS[@]} -gt 0 ]] ; then
  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
  HELP=true
fi

if [[ "${HELP}" == "true" || -z "${WAKE_WORD}" ]] ; then
  cat <<EOF >&2
Usage: $0 [ --samples=<samples> ] [ --training-steps=<training_steps> ] <wake_word> [ <wake_word_title> ]
       $0 -h/--help

--samples: The number of samples to generate for the wake word.
           Used only to generate output file names.
--training-steps: Number of training steps. Default: ${DEFAULT_TRAINING_STEPS}
<wake_word>: The word to train spelled phonetically. Required.
<wake_word_title>: A pretty name to save to the json metadata file.
                   Default: The wake word with individual words capitalized.
EOF
  exit 1
fi

WORK_DIR="${DATA_DIR}/work"
TRAINING_DS="${DATA_DIR}/training_datasets"

# Count the positional ELEMENTS (${#arr[@]}); ${#arr} would be the string
# length of element 0, which made the title argument silently ignored.
if [[ ${#POSITIONAL_ARGS[@]} -ge 2 ]] ; then
  WAKE_WORD_TITLE="${POSITIONAL_ARGS[1]}"
fi

if [[ ! -v WAKE_WORD_TITLE ]] ; then
  # No title given: split the wake word on non-alphanumerics and capitalize
  # the first letter of every resulting word.
  # Word splitting is intentional; only [a-zA-Z0-9 ] can remain, so no glob.
  # shellcheck disable=SC2206
  declare -a WWNA=( ${WAKE_WORD//[^a-zA-Z0-9]/ } )
  WAKE_WORD_TITLE="${WWNA[*]^}"
elif [[ -z "${WAKE_WORD_TITLE}" ]] ; then
  # An explicitly empty title falls back to the raw wake word.
  WAKE_WORD_TITLE="${WAKE_WORD}"
fi

# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"

#######################################
# Abort the script unless every given path is an existing directory.
# Arguments: one or more directory paths
# Outputs:   an error message on stderr for the first missing directory
# Returns:   does not return on failure (exit 1)
#######################################
check_directories() {
  local d
  for d in "$@" ; do
    [[ -d "$d" ]] || { echo "ERROR: Directory $d not found" >&2 ; exit 1 ; }
  done
}

check_directories "${WORK_DIR}/wake_word_samples_augmented" \
  "${TRAINING_DS}/negative_datasets/"{speech,dinner_party,no_speech,dinner_party_eval}

cd "${WORK_DIR}"

echo "===== Starting ${TRAINING_STEPS} training steps ====="
START_TS=$EPOCHSECONDS

mkdir -p "${WORK_DIR}/trained_models" || :

cat <<EOF >"${WORK_DIR}/trained_models/training_parameters.yaml"
batch_size: 16
clip_duration_ms: 1500
eval_step_interval: 500
features:
- features_dir: ${WORK_DIR}/wake_word_samples_augmented
  penalty_weight: 1.0
  sampling_weight: 2.0
  truncation_strategy: truncate_start
  truth: true
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/speech
  penalty_weight: 1.0
  sampling_weight: 12.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party
  penalty_weight: 1.0
  sampling_weight: 12.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/no_speech
  penalty_weight: 1.0
  sampling_weight: 5.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party_eval
  penalty_weight: 1.0
  sampling_weight: 0.0
  truncation_strategy: split
  truth: false
  type: mmap
freq_mask_count:
- 0
freq_mask_max_size:
- 0
learning_rates:
- 0.001
maximization_metric: average_viable_recall
minimization_metric: null
negative_class_weight:
- 20
positive_class_weight:
- 1
target_minimization: 0.9
time_mask_count:
- 0
time_mask_max_size:
- 0
train_dir: ${WORK_DIR}/trained_models/wakeword
training_steps:
- ${TRAINING_STEPS}
window_step_ms: 10
EOF
echo " Wrote training_parameters.yaml"

# Start from a clean train_dir. ':?' aborts instead of deleting
# "/trained_models/wakeword" should WORK_DIR ever be empty.
rm -rf "${WORK_DIR:?}/trained_models/wakeword"

# Replace spaces and shell/filesystem-hostile punctuation with '_' so the
# wake word can be used in file names.
wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
OUTPUT_DIR="${DATA_DIR}/output/$(date +'%Y-%m-%d-%H-%M-%S')-${wake_word_filename}-${SAMPLES}-${TRAINING_STEPS}"
mkdir -p "${OUTPUT_DIR}/logs" || :
TRAIN_LOG="${OUTPUT_DIR}/logs/training.log"

# ------------------------------------------------------------------
# Training args (same as before)
# ------------------------------------------------------------------
TRAIN_ARGS=(
  -m microwakeword.model_train_eval
  --training_config "${WORK_DIR}/trained_models/training_parameters.yaml"
  --train 1
  --restore_checkpoint 1
  --test_tf_nonstreaming 0
  --test_tflite_nonstreaming 0
  --test_tflite_nonstreaming_quantized 0
  --test_tflite_streaming 0
  --test_tflite_streaming_quantized 1
  --use_weights best_weights
  mixednet
  --pointwise_filters "64,64,64,64"
  --repeat_in_block "1,1,1,1"
  --mixconv_kernel_sizes "[5], [7,11], [9,15], [23]"
  --residual_connection "0,0,0,0"
  --first_conv_filters 32
  --first_conv_kernel_size 5
  --stride 2
)

# ------------------------------------------------------------------
# GPU failure markers that should trigger CPU fallback
# (OOM + known GPU runtime/copy/init failures)
# Matched case-insensitively as fixed substrings of the training log.
# ------------------------------------------------------------------
GPU_FALLBACK_MARKERS=(
  "resourceexhaustederror"
  "resource exhausted"
  "oom"
  "out of memory"
  "cuda_error_out_of_memory"
  "failed to allocate"
  "cudnn"
  "cublas"
  "internalerror: cuda"
  "failed call to cuinit"
  "dst tensor is not initialized"
  "failed copying input tensor"
  "_eagerconst"
)

#######################################
# Run one training attempt, streaming its output to the terminal and
# writing it to TRAIN_LOG (the file is overwritten on every attempt).
# Globals:   PYTHON_BIN (read, defaults to "python"), TRAIN_ARGS (read),
#            TRAIN_LOG (read; file is written)
# Arguments: $1 - banner label describing the attempt
# Outputs:   banner plus filtered training output on stdout
# Returns:   exit status of the Python process (first pipeline stage)
#######################################
run_attempt() {
  local label="$1"
  shift

  echo
  echo "================================================================================"
  echo "===== ${label} ====="
  echo "================================================================================"
  echo "→ ${PYTHON_BIN:-python} ${TRAIN_ARGS[*]}"
  echo

  # stream everything except validation minibatch spam
  # (carriage returns become newlines so progress updates are
  # filtered and logged line by line)
  "${PYTHON_BIN:-python}" "${TRAIN_ARGS[@]}" 2>&1 \
    | tr '\r' '\n' \
    | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" \
    | tee "${TRAIN_LOG}" \
    | sed -r -e "/^Validation Batch/d" -e "s/^INFO:absl:/ /g"

  # Report the trainer's status, not that of the trailing sed.
  return "${PIPESTATUS[0]}"
}

# ---- Common TF env (mirrors your notebook) ----
export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"

# Attempt 1: GPU
if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
  echo "✅ Training complete (GPU path)."
else
  echo "⚠️ GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…"

  # Check log for GPU/OOM/runtime markers. A missing or unreadable log
  # simply counts as "no marker found".
  looks_like_gpu_fail="false"
  for m in "${GPU_FALLBACK_MARKERS[@]}" ; do
    if grep -qiF -- "${m}" "${TRAIN_LOG}" 2>/dev/null ; then
      looks_like_gpu_fail="true"
      break
    fi
  done

  if [[ "${looks_like_gpu_fail}" == "true" ]] ; then
    echo "↪️ Detected GPU/OOM/runtime failure markers. Falling back to CPU."

    # Attempt 2: CPU (hide GPU completely)
    export CUDA_VISIBLE_DEVICES=""
    unset TF_GPU_ALLOCATOR

    if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
      echo "✅ Training complete (CPU fallback)."
    else
      echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
      exit 1
    fi
  else
    echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2
    exit 1
  fi
fi

source_path="${WORK_DIR}/trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite"
if [[ ! -f "${source_path}" ]] ; then
  echo "Output model not found! Training didn't complete successfully. See ${TRAIN_LOG}" >&2
  exit 1
fi

# Best-effort copies: a missing summary or log directory must not fail the run.
cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/" || :
cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/" || :
cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/" || :

echo -e "\n Training complete!"
echo " Full log: ${TRAIN_LOG}"

tflite_filename="${wake_word_filename}.tflite"
tflite_path="${OUTPUT_DIR}/${tflite_filename}"
cp "${source_path}" "${tflite_path}"

# Escape backslashes and double quotes so the title cannot break the JSON.
json_title="${WAKE_WORD_TITLE//\\/\\\\}"
json_title="${json_title//\"/\\\"}"

json_path="${OUTPUT_DIR}/${wake_word_filename}.json"
cat <<EOF > "${json_path}"
{
  "type": "micro",
  "wake_word": "${json_title}",
  "author": "Tater Totterson",
  "website": "https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git",
  "model": "${tflite_filename}",
  "trained_languages": ["en"],
  "version": 2,
  "micro": {
    "probability_cutoff": 0.97,
    "sliding_window_size": 5,
    "feature_step_size": 10,
    "tensor_arena_size": 30000,
    "minimum_esphome_version": "2024.7.0"
  }
}
EOF

echo "Name: ${WAKE_WORD_TITLE}"
echo "Model: ${tflite_path}"
echo "Metadata: ${json_path}"
echo
END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
echo