mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
288 lines
8.3 KiB
Bash
288 lines
8.3 KiB
Bash
#!/bin/bash
#
# Train a microWakeWord model from previously generated/augmented samples
# and package the resulting .tflite model plus its JSON metadata.
#
set -e

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# Long option names this script accepts.
# NOTE(review): presumably consumed by the argument parser in shell.functions,
# which is also expected to provide POSITIONAL_ARGS, UNKNOWN_ARGS, HELP,
# DATA_DIR, SAMPLES, TRAINING_STEPS, DEFAULT_TRAINING_STEPS and
# print_elapsed_time -- that file is not visible here; confirm.
KNOWN_ARGS=( training-steps samples data-dir )
source "${PROGDIR}/shell.functions"
WAKE_WORD="${POSITIONAL_ARGS[0]}"

# Any unrecognised option is reported and then treated like a help request.
if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
  HELP=true
fi

# Show usage (on stderr) and stop when help was requested or no wake word
# was supplied.
if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then
  cat <<EOF >&2
Usage: $0 [ --samples=<samples> ] [ --training-steps=<steps> ]
         <wake_word> [ <wake_word_title> ]

       $0 -h/--help

--samples:         The number of samples to generate for the wake word.
                   Used only to generate output file names.

--training-steps:  Number of training steps.
                   Default: ${DEFAULT_TRAINING_STEPS}

<wake_word>:       The word to train spelled phonetically.
                   Required.

<wake_word_title>: A pretty name to save to the json metadata file.
                   Default: The wake word with individual words capitalized.

EOF
  exit 1
fi

# Scratch area for this run and the shared training datasets.
WORK_DIR="${DATA_DIR}/work"
TRAINING_DS="${DATA_DIR}/training_datasets"

#######################################
# Decide the human-readable wake word title stored in the JSON metadata.
# Globals:
#   POSITIONAL_ARGS (read)  - [0] wake word, [1] optional title
#   WAKE_WORD       (read)  - the wake word as given on the command line
#   WAKE_WORD_TITLE (read/written)
# Behaviour:
#   - a second positional argument, when present, becomes the title;
#   - if no title is set at all, the wake word is split on every
#     non-alphanumeric character and each word is capitalised
#     ("hey_jarvis" -> "Hey Jarvis");
#   - a title that is set but empty falls back to the raw wake word.
#######################################
resolve_wake_word_title() {
  # Count array ELEMENTS: ${#POSITIONAL_ARGS} (without [@]) is the string
  # length of element 0, which made the title depend on the wake word's
  # length instead of on the number of arguments.
  if [ "${#POSITIONAL_ARGS[@]}" -ge 2 ] ; then
    WAKE_WORD_TITLE="${POSITIONAL_ARGS[1]}"
  fi

  if [ ! -v WAKE_WORD_TITLE ] ; then
    local -a words
    # Word splitting is intentional here; after the substitution only
    # alphanumerics and spaces remain, so no glob characters can survive.
    # shellcheck disable=SC2206
    words=( ${WAKE_WORD//[^a-zA-Z0-9]/ } )
    WAKE_WORD_TITLE="${words[*]^}"
  elif [ -z "${WAKE_WORD_TITLE}" ] ; then
    WAKE_WORD_TITLE="${WAKE_WORD}"
  fi
}

resolve_wake_word_title

# Activate the Python virtual environment that lives under DATA_DIR.
# The path is only known at run time, so shellcheck cannot follow it.
# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"

#######################################
# Verify that every argument names an existing directory.
# Arguments:
#   $@ - directory paths to check
# Outputs:
#   An error message on stderr for the first path that is not a directory.
# Exits:
#   Terminates the calling shell with status 1 on the first missing
#   directory; returns normally when all exist (or none were given).
#######################################
check_directories() {
  local dir   # keep the loop variable out of the global namespace
  for dir in "$@" ; do
    if [ ! -d "${dir}" ] ; then
      echo "ERROR: Directory ${dir} not found" >&2
      exit 1
    fi
  done
}

# Fail early if the augmented wake word samples or any of the negative
# datasets are missing. The prefix is quoted so paths containing spaces
# survive; the brace list stays unquoted so it still expands to four paths.
check_directories "${WORK_DIR}/wake_word_samples_augmented" \
  "${TRAINING_DS}/negative_datasets/"{speech,dinner_party,no_speech,dinner_party_eval}

cd "${WORK_DIR}"

echo "===== Starting ${TRAINING_STEPS} training steps ====="

# Start timestamp, reported as elapsed time at the end of the script.
START_TS=$EPOCHSECONDS

# Write the training configuration read by microwakeword.model_train_eval.
# The list items under "features:" MUST keep their two-space continuation
# indent: without it every penalty_weight/sampling_weight/... key becomes a
# top-level key instead of belonging to its feature entry.
mkdir -p "${WORK_DIR}/trained_models" || :
cat <<EOF >"${WORK_DIR}/trained_models/training_parameters.yaml"
batch_size: 16
clip_duration_ms: 1500
eval_step_interval: 500
features:
- features_dir: ${WORK_DIR}/wake_word_samples_augmented
  penalty_weight: 1.0
  sampling_weight: 2.0
  truncation_strategy: truncate_start
  truth: true
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/speech
  penalty_weight: 1.0
  sampling_weight: 12.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party
  penalty_weight: 1.0
  sampling_weight: 12.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/no_speech
  penalty_weight: 1.0
  sampling_weight: 5.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party_eval
  penalty_weight: 1.0
  sampling_weight: 0.0
  truncation_strategy: split
  truth: false
  type: mmap
freq_mask_count:
- 0
freq_mask_max_size:
- 0
learning_rates:
- 0.001
maximization_metric: average_viable_recall
minimization_metric: null
negative_class_weight:
- 20
positive_class_weight:
- 1
target_minimization: 0.9
time_mask_count:
- 0
time_mask_max_size:
- 0
train_dir: ${WORK_DIR}/trained_models/wakeword
training_steps:
- ${TRAINING_STEPS}
window_step_ms: 10

EOF

echo " Wrote training_parameters.yaml"

# Remove the previous run's train_dir (the path named in the config above)
# before training starts.
rm -rf "${WORK_DIR}/trained_models/wakeword"

#######################################
# Turn a wake word into a string that is safe to use in file names.
# Every space and every one of  ` ~ ! $ & * ( ) { } [ ] | ; ' " < > . ? /
# is replaced by an underscore; all other characters are kept.
# Arguments:
#   $1 - the wake word
# Outputs:
#   The sanitised name on stdout (no trailing newline).
#######################################
sanitize_wake_word_filename() {
  # The set is kept in a single-quoted variable so nothing in it can be
  # expanded by the shell. ']' comes first so the bracket expression
  # treats it as a literal member.
  local unsafe='] `~!$&*(){}[|;'\''"<>.?/'
  printf '%s' "${1//[$unsafe]/_}"
}

wake_word_filename="$(sanitize_wake_word_filename "${WAKE_WORD}")"
# Per-run output directory: <timestamp>-<wake word>-<samples>-<steps>.
OUTPUT_DIR="${DATA_DIR}/output/$(date +'%Y-%m-%d-%H-%M-%S')-${wake_word_filename}-${SAMPLES}-${TRAINING_STEPS}"
mkdir -p "${OUTPUT_DIR}/logs" || :

# Console output of each training attempt is also captured in this file.
TRAIN_LOG="${OUTPUT_DIR}/logs/training.log"

# ------------------------------------------------------------------
# Training args
# Passed verbatim to the Python interpreter by run_attempt: general
# train/test switches first, then the "mixednet" model selector
# followed by its architecture options.
# ------------------------------------------------------------------
TRAIN_ARGS=(
  -m microwakeword.model_train_eval
  --training_config "${WORK_DIR}/trained_models/training_parameters.yaml"
  --train 1
  --restore_checkpoint 1
  --test_tf_nonstreaming 0
  --test_tflite_nonstreaming 0
  --test_tflite_nonstreaming_quantized 0
  --test_tflite_streaming 0
  --test_tflite_streaming_quantized 1
  --use_weights best_weights
  mixednet
  --pointwise_filters "64,64,64,64"
  --repeat_in_block "1,1,1,1"
  --mixconv_kernel_sizes "[5], [7,11], [9,15], [23]"
  --residual_connection "0,0,0,0"
  --first_conv_filters 32
  --first_conv_kernel_size 5
  --stride 2
)

# ------------------------------------------------------------------
# GPU failure markers that should trigger CPU fallback
# (OOM + known GPU runtime/copy/init failures)
# Entries are lower-case because the training log is lower-cased before
# it is searched; each entry is matched as a plain substring.
# ------------------------------------------------------------------
GPU_FALLBACK_MARKERS=(
  "resourceexhaustederror"
  "resource exhausted"
  "oom"
  "out of memory"
  "cuda_error_out_of_memory"
  "failed to allocate"
  "cudnn"
  "cublas"
  "internalerror: cuda"
  "failed call to cuinit"
  "dst tensor is not initialized"
  "failed copying input tensor"
  "_eagerconst"
)

#######################################
# Run one training attempt and stream its output.
# Globals:
#   PYTHON_BIN (read) - interpreter to run; defaults to "python"
#   TRAIN_ARGS (read) - argument vector passed to the interpreter
#   TRAIN_LOG  (read) - log file; overwritten on every call
# Arguments:
#   $1 - label printed in the banner
# Outputs:
#   Banner and filtered training output on stdout. The log file receives
#   the same stream minus "Validation Batch" lines; the console copy
#   additionally has a leading "INFO:absl:" replaced by a space.
# Returns:
#   Exit status of the interpreter (first pipeline stage), not of the
#   filters.
#######################################
run_attempt() {
  local label="$1"
  shift

  echo
  echo "================================================================================"
  echo "===== ${label} ====="
  echo "================================================================================"
  echo "→ ${PYTHON_BIN:-python} ${TRAIN_ARGS[*]}"
  echo

  # stream everything except validation minibatch spam;
  # carriage returns become newlines so progress updates are line-oriented
  "${PYTHON_BIN:-python}" "${TRAIN_ARGS[@]}" 2>&1 \
    | tr '\r' '\n' \
    | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" \
    | tee "${TRAIN_LOG}" \
    | sed -r -e "/^Validation Batch/d" -e "s/^INFO:absl:/ /g"

  # Must directly follow the pipeline: any command in between would reset
  # PIPESTATUS.
  return "${PIPESTATUS[0]}"
}

# ---- Common TF env (mirrors your notebook) ----
# Each variable keeps a value already present in the environment and only
# falls back to the default shown here when unset or empty.
export TF_CPP_MIN_LOG_LEVEL="${TF_CPP_MIN_LOG_LEVEL:-2}"
export TF_XLA_FLAGS="${TF_XLA_FLAGS:---tf_xla_auto_jit=0}"
export NVIDIA_TF32_OVERRIDE="${NVIDIA_TF32_OVERRIDE:-1}"
export TF_FORCE_GPU_ALLOW_GROWTH="${TF_FORCE_GPU_ALLOW_GROWTH:-true}"
export TF_GPU_ALLOCATOR="${TF_GPU_ALLOCATOR:-cuda_malloc_async}"

# Attempt 1: GPU
# If it fails, the log is inspected for known GPU/OOM markers; only when
# one is found is training retried once on the CPU. Any other failure
# aborts the script.
if run_attempt "Attempt 1/2: GPU training (allow_growth + cuda_malloc_async)" ; then
  echo "✅ Training complete (GPU path)."
else
  echo "⚠️ GPU attempt failed. Checking whether this looks like a GPU/OOM/runtime failure…"

  # Check log for GPU/OOM/runtime markers (case-insensitively: the log is
  # lower-cased once and the markers are lower-case).
  log_lc="$(tr '[:upper:]' '[:lower:]' < "${TRAIN_LOG}" || true)"
  looks_like_gpu_fail="false"
  for m in "${GPU_FALLBACK_MARKERS[@]}"; do
    # Plain substring match done in the shell; avoids piping the whole log
    # through a new grep process for every marker.
    if [[ "${log_lc}" == *"${m}"* ]]; then
      looks_like_gpu_fail="true"
      break
    fi
  done

  if [ "${looks_like_gpu_fail}" = "true" ]; then
    echo "↪️ Detected GPU/OOM/runtime failure markers. Falling back to CPU."

    # Attempt 2: CPU (hide GPU completely)
    export CUDA_VISIBLE_DEVICES=""
    unset TF_GPU_ALLOCATOR
    if run_attempt "Attempt 2/2: CPU fallback (CUDA_VISIBLE_DEVICES='')" ; then
      echo "✅ Training complete (CPU fallback)."
    else
      echo "❌ Training failed on BOTH GPU and CPU. See: ${TRAIN_LOG}" >&2
      exit 1
    fi
  else
    echo "❌ Training failed (does not look GPU/OOM/runtime). See: ${TRAIN_LOG}" >&2
    exit 1
  fi
fi

# The streaming, internally-quantized TFLite model written under train_dir.
source_path="${WORK_DIR}/trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite"

# A zero exit status alone is not trusted: the model file must exist.
if [ ! -f "${source_path}" ] ; then
  # Sent to stderr, like the other failure messages in this script.
  echo "Output model not found! Training didn't complete successfully. See ${TRAIN_LOG}" >&2
  exit 1
fi

# Best effort: keep the model summary and the train/validation log
# directories next to the training log; a missing item is not an error.
cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/" || :
cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/" || :
cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/" || :

echo -e "\n Training complete!"
echo " Full log: ${TRAIN_LOG}"

# Copy the model into the output directory under the wake word's name.
tflite_filename="${wake_word_filename}.tflite"
tflite_path="${OUTPUT_DIR}/${tflite_filename}"

cp "${source_path}" "${tflite_path}"

#######################################
# Escape a string for use inside a double-quoted JSON string.
# Only backslash and double quote are handled; control characters
# (e.g. newlines) are passed through unchanged.
# Arguments:
#   $1 - raw text
# Outputs:
#   Escaped text on stdout (no trailing newline).
#######################################
json_escape() {
  local text=$1
  text=${text//\\/\\\\}   # backslashes first, so added ones are not doubled
  text=${text//\"/\\\"}
  printf '%s' "${text}"
}

#######################################
# Print the JSON metadata document that accompanies the model.
# Arguments:
#   $1 - wake word title
#   $2 - model file name
# Outputs:
#   The JSON document on stdout.
#######################################
render_model_metadata() {
  local title model
  title="$(json_escape "$1")"
  model="$(json_escape "$2")"
  # Plain <<EOF with space indentation: <<- would only strip leading tabs.
  cat <<EOF
{
  "type": "micro",
  "wake_word": "${title}",
  "author": "Tater Totterson",
  "website": "https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git",
  "model": "${model}",
  "trained_languages": ["en"],
  "version": 2,
  "micro": {
    "probability_cutoff": 0.97,
    "sliding_window_size": 5,
    "feature_step_size": 10,
    "tensor_arena_size": 30000,
    "minimum_esphome_version": "2024.7.0"
  }
}
EOF
}

json_path="${OUTPUT_DIR}/${wake_word_filename}.json"
render_model_metadata "${WAKE_WORD_TITLE}" "${tflite_filename}" > "${json_path}"

# Final summary: where the artifacts are and how long the run took.
echo "Name: ${WAKE_WORD_TITLE}"
echo "Model: ${tflite_path}"
echo "Metadata: ${json_path}"
echo
END_TS=$EPOCHSECONDS
# print_elapsed_time is not defined in this file.
# NOTE(review): presumably provided by shell.functions -- confirm.
print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
echo