microWakeWord-Trainer-Nvidi…/cli/setup_python_venv

#!/bin/bash
# Directory that holds this script (realpath resolves symlinks first) and the
# directory one level above it.
PROGDIR="$(realpath "$0")"
PROGDIR="$(dirname "${PROGDIR}")"
ROOTDIR="$(dirname "${PROGDIR}")"

# Option names this script accepts. They are declared before shell.functions
# is loaded; presumably that file parses the command line against this list
# and provides the variables read later on (UNKNOWN_ARGS, HELP, DATA_DIR,
# PYTHON, GPU, VERBOSE) -- TODO confirm against shell.functions.
KNOWN_ARGS=( data-dir python gpu no-gpu )
. "${PROGDIR}/shell.functions"

# Anything left in UNKNOWN_ARGS is reported on stderr and turns this run into
# a usage request.
if (( ${#UNKNOWN_ARGS[@]} > 0 )) ; then
    echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
    HELP=true
fi

# Print the usage text on stderr and stop with a non-zero status.
# NOTE(review): KNOWN_ARGS also lists "data-dir" and "python", which this text
# does not describe -- confirm whether they should be documented here.
if [[ "${HELP}" == "true" ]] ; then
    cat >&2 <<EOF
Usage: setup_python_venv [ --gpu | --no-gpu ] [ --verbose ]

Options:
--gpu:     Install the GPU-capable versions of packages if available.  This
           is the default if the script detects that a GPU is available.

--no-gpu:  Install the non-GPU-capable versions of packages even if
           GPU-capable packages are available.  This is the default if the script
           detects that a GPU is NOT available.

--verbose: Print the detailed "pip install" output.

Environment overrides:
MWW_TF_SPEC:           Full TensorFlow package spec (e.g. "tf-nightly[and-cuda]"
                       or "tensorflow[and-cuda]==2.20.0").
MWW_TENSORBOARD_SPEC:  Comma-separated TensorBoard package specs.
                       Example: "tensorboard==2.20.0,tensorboard-data-server==0.7.2"
MWW_KERAS_SPEC:        Keras package spec to install explicitly.

EOF
    exit 1
fi

# Canonicalise the requested data directory and make it the working directory.
# Later steps use paths relative to it (".venv/bin", ".mww-data-dir"), and
# strict mode is not enabled yet at this point, so a failed cd has to stop the
# script explicitly instead of letting those steps run in the caller's
# directory.
if [ -n "${DATA_DIR}" ] ; then
    DATA_DIR="$(realpath "${DATA_DIR}")"
fi

if [ ! -d "${DATA_DIR}" ] ; then
    echo "Data directory '${DATA_DIR}' doesn't exist." >&2
    exit 1
fi

cd "${DATA_DIR}" || {
    echo "Unable to change to data directory '${DATA_DIR}'." >&2
    exit 1
}

# When GPU was not set beforehand, decide it here: the presence of the
# /dev/nvidiactl character device counts as "an Nvidia GPU is available".
if [ -z "${GPU}" ] ; then
    if [ -c /dev/nvidiactl ] ; then
        GPU=true
        echo "   Nvidia GPU detected"
    else
        GPU=false
    fi
fi

# GPU holds the name of a command ("true"/"false") and is executed as one.
# Without a GPU, export CUDA_VISIBLE_DEVICES=-1 for every child process
# (conventionally this hides all CUDA devices).
if ! "${GPU}" ; then
    export CUDA_VISIBLE_DEVICES=-1
fi

#
# Print the compute capability reported by nvidia-smi.
# Outputs:  the first line of "nvidia-smi --query-gpu=compute_cap" with all
#           whitespace removed (no trailing newline); nothing at all when
#           nvidia-smi is not available.
# Returns:  0 when nvidia-smi is missing, otherwise the pipeline's status.
#
detect_gpu_compute_capability() {
    command -v nvidia-smi >/dev/null 2>&1 || return 0

    local -a query=( --query-gpu=compute_cap --format=csv,noheader )
    nvidia-smi "${query[@]}" 2>/dev/null | head -n 1 | tr -d '[:space:]'
}

# With a GPU in use, look up its compute capability; a value of the form
# "12.x" is treated as a Blackwell GPU. Both variables keep their defaults
# (empty / false) otherwise.
GPU_COMPUTE_CAPABILITY=""
IS_BLACKWELL=false
if ${GPU} ; then
    # "|| true": a failing lookup simply leaves the capability empty.
    GPU_COMPUTE_CAPABILITY="$(detect_gpu_compute_capability || true)"
    if [[ "${GPU_COMPUTE_CAPABILITY}" == 12.* ]] ; then
        IS_BLACKWELL=true
        echo "   Blackwell GPU detected (compute capability ${GPU_COMPUTE_CAPABILITY})"
    fi
fi

# The virtualenv always lives inside the data directory.
VENV="${DATA_DIR}/.venv"

# Leave any other virtualenv that is active in the calling environment.
if [[ -n "${VIRTUAL_ENV:-}" && "${VIRTUAL_ENV}" != "${VENV}" ]] ; then
    if ! command -v deactivate >/dev/null 2>&1 ; then
        # Recorder process can inherit VIRTUAL_ENV without the shell function.
        unset VIRTUAL_ENV
    else
        # Best effort: a failing deactivate is ignored.
        deactivate || :
    fi
fi

# Build the list of interpreters to try: either the single one already given
# in PYTHON, or the default candidates in order of preference.
if [ -z "${PYTHON}" ] ; then
    # Add 3.11 as a common middle-ground (especially outside Ubuntu 24.04)
    PYTHONS=( python3.12 python3.11 python3.10 )
else
    PYTHONS=( "${PYTHON}" )
    # Cleared so that it is only set again if the interpreter actually runs.
    unset PYTHON
fi

# Keep the first candidate that can print its version.
for candidate in "${PYTHONS[@]}" ; do
    if "${candidate}" --version &>/dev/null ; then
        PYTHON="${candidate}"
        break
    fi
done

if [ -z "${PYTHON}" ] ; then
    echo "A python 3.12/3.11/3.10 interpreter wasn't found. You'll need to install one before proceeding." >&2
    exit 1
fi

# An existing virtualenv is reused only when the data directory carries the
# ".mww-data-dir" marker, which this script touches at the very end of a
# completed run. Without the marker the virtualenv is discarded and rebuilt.
if [ -d "${VENV}" ] ; then
    if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then
        source "${VENV}/bin/activate" || {
            echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
            exit 1
        }
    else
        rm -rf "${VENV}"
    fi
fi

echo "===== Setting up Python environment ${VENV} ====="

# VIRTUAL_ENV is non-empty here when a virtualenv is currently active.
if [ -z "${VIRTUAL_ENV:-}" ] ; then
    echo "   ===== Creating new virtualenv at '${VENV}' ====="
else
    echo "   ===== Updating virtualenv at '${VENV}' ====="
fi

# Strict mode is only enabled further down, so both steps are checked
# explicitly: if either one failed unnoticed, the "pip install" calls that
# follow would run outside the intended virtualenv.
"${PYTHON}" -m venv --upgrade-deps "${VENV}" || {
    echo "Unable to create or update virtualenv '${VENV}' with '${PYTHON}'." >&2
    exit 1
}
source "${VENV}/bin/activate" || {
    echo "Unable to activate virtualenv '${VENV}'." >&2
    exit 1
}

# From here on: stop on errors, on unset variables and on failed pipeline
# stages.
set -euo pipefail

# Symlink CLI scripts into .venv/bin
# find's output is read NUL-delimited (mapfile -d '', bash 4.4+) so that paths
# containing whitespace or glob characters arrive as single, unmodified
# array elements.
declare -a progfiles=()
mapfile -d '' -t progfiles < <(
    find "${PROGDIR}" -mindepth 1 -maxdepth 1 -executable -type f -print0
)
# shell.functions is linked as well, whether or not it is executable.
progfiles+=( "${PROGDIR}/shell.functions" )

# Also symlink the top-level entrypoint if present
if [ -x "${ROOTDIR}/train_wake_word" ] ; then
    progfiles+=( "${ROOTDIR}/train_wake_word" )
fi

# -r stores each link relative to its own location; -f replaces links left by
# an earlier run. ${VENV} is used so the result does not depend on the
# current working directory.
for f in "${progfiles[@]}" ; do
    ln -sfr "${f}" "${VENV}/bin/$(basename "${f}")"
done

#
# The package groups below are installed one "pip install" call at a time
# instead of from a single requirements file: pip does not honour the order
# of entries in requirements.txt, yet tensorflow, torch, onnxruntime and
# micro-wake-word each depend on CUDA packages at various versions. Installed
# in any other order they may not be able to use the GPU.
#
# pip reads PIP_<OPTION> environment variables as defaults for its command
# line options; these apply to every pip call made from this script.
#
export PIP_PROGRESS_BAR=off PIP_NO_COLOR=1 PIP_QUIET=0

#
# Run "pip install" with the given arguments.
# Globals:   VERBOSE (read) - name of a command ("true"/"false") that is
#            executed to pick the output mode; treated as "false" when unset.
# Arguments: passed to "pip install" unchanged.
# Outputs:   verbose: pip's own output. Otherwise every printable character
#            of pip's stdout is dropped and each newline becomes a ".", i.e.
#            one dot per output line. A final newline is printed either way.
# Returns:   0 on success, otherwise the non-zero status of the failed step.
#
pip_install() {
    local status=0

    if ${VERBOSE:-false} ; then
        pip install "$@" || status=$?
    else
        # The pipeline runs in a subshell with pipefail enabled so that pip's
        # exit status decides the result, not that of the final tr, and so
        # that this holds whatever shell options the caller has set.
        (
            set -o pipefail
            pip install "$@" \
                | stdbuf -i0 -o0 tr -d '[:print:]' \
                | stdbuf -i0 -o0 tr '\n' '.'
        ) || status=$?
    fi
    echo
    return "${status}"
}

# Start of the timed installation phase (seconds since the epoch).
START_TS="${EPOCHSECONDS}"

printf '%s\n' "   ===== Installing common requirements ====="
# requirements.txt lives in repo root now
pip_install -r "${ROOTDIR}/requirements.txt"

# The "[and-cuda]" extra is requested only for GPU installs.
if ${GPU} ; then
    tfgpu='[and-cuda]'
else
    tfgpu=""
fi

# Default TensorFlow spec and the TensorBoard pins that go with it.
declare -a default_tensorboard_specs=()
if ${GPU} && ${IS_BLACKWELL} ; then
    # Blackwell path: prefer nightly TF while upstream stable wheels catch up.
    # No TensorBoard pins here: tf-nightly resolves a compatible TensorBoard
    # dependency by default.
    DEFAULT_TF_SPEC="tf-nightly${tfgpu}"
else
    DEFAULT_TF_SPEC="tensorflow${tfgpu}==2.20.0"
    default_tensorboard_specs=( "tensorboard==2.20.0" "tensorboard-data-server==0.7.2" )
fi

# MWW_TF_SPEC, when set and non-empty, replaces the default spec.
TF_SPEC="${MWW_TF_SPEC:-${DEFAULT_TF_SPEC}}"
declare -a tf_install_specs=( ai_edge_litert "${TF_SPEC}" )

if [ -z "${MWW_TENSORBOARD_SPEC:-}" ] ; then
    tf_install_specs+=( "${default_tensorboard_specs[@]}" )
else
    # Split the comma-separated override, trim surrounding whitespace from
    # each entry and drop entries that end up empty.
    IFS=',' read -r -a user_tb_specs <<< "${MWW_TENSORBOARD_SPEC}"
    for tb_spec in "${user_tb_specs[@]}" ; do
        tb_leading="${tb_spec%%[![:space:]]*}"
        tb_spec="${tb_spec#"${tb_leading}"}"
        tb_trailing="${tb_spec##*[![:space:]]}"
        tb_spec="${tb_spec%"${tb_trailing}"}"
        if [ -n "${tb_spec}" ] ; then
            tf_install_specs+=( "${tb_spec}" )
        fi
    done
fi

echo "   ===== Installing TensorFlow stack (${TF_SPEC}) ====="
pip_install "${tf_install_specs[@]}"

# GPU installs take torch from the PyTorch cu129 wheel index; otherwise pip's
# default index is used.
if ${GPU} ; then
    torchgpu='--index-url https://download.pytorch.org/whl/cu129'
else
    torchgpu=""
fi
echo "   ===== Installing torch and torchaudio ${torchgpu:+[cuda]} ====="
# torchgpu is intentionally unquoted: it must split into the option and its
# value, and disappear completely when empty.
# shellcheck disable=SC2086
pip_install "torch==2.9.1" "torchaudio==2.9.1" ${torchgpu}

echo "   ===== Checking microwakeword ====="
MWW="${DATA_DIR}/tools/microWakeWord"
# Clone afresh when the checkout is missing or when "git status --porcelain"
# reports anything for it; a checkout with local changes is thrown away.
if [ ! -d "${MWW}" ] || [ -n "$(git -C "${MWW}" status --porcelain)" ] ; then
    rm -rf "${MWW}" || :
    echo "   Cloning micro-wake-word to ${DATA_DIR}/tools"
    # git's output is discarded, so a failure is reported explicitly instead
    # of letting "set -e" end the script without any message.
    git clone https://github.com/TaterTotterson/micro-wake-word "${MWW}" &>/dev/null || {
        echo "Unable to clone https://github.com/TaterTotterson/micro-wake-word into '${MWW}'." >&2
        exit 1
    }
fi
echo "   Installing microwakeword"
# Editable install straight from the checkout.
pip_install -e "${MWW}"

echo "   ===== Checking piper-sample-generator ====="
PSG="${DATA_DIR}/tools/piper-sample-generator"
# Clone afresh when the checkout is missing or when "git status --porcelain"
# reports anything for it; a checkout with local changes is thrown away.
if [ ! -d "${PSG}" ] || [ -n "$(git -C "${PSG}" status --porcelain)" ] ; then
    rm -rf "${PSG}" || :
    echo "   Cloning piper-sample-generator to ${DATA_DIR}/tools"
    # git's output is discarded, so a failure is reported explicitly instead
    # of letting "set -e" end the script without any message.
    git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null || {
        echo "Unable to clone https://github.com/rhasspy/piper-sample-generator into '${PSG}'." >&2
        exit 1
    }
fi
echo "   Installing piper-sample-generator"
# Editable install straight from the checkout.
pip_install -e "${PSG}"
# Remove untracked files and directories from the checkout (presumably what
# the editable install leaves behind, so the status check above stays clean
# on the next run -- TODO confirm). ${PSG} is used instead of a path relative
# to the current directory so this does not depend on where the script is.
git -C "${PSG}" clean -fd &>/dev/null

MODELS_DIR="${PSG}/models"
VOICES_DIR="${PSG}/voices"
# Both target directories must exist before curl can write into them.
mkdir -p "${MODELS_DIR}" "${VOICES_DIR}"
# NOTE(review): files written below ${PSG} could make the earlier
# "git status --porcelain" check report changes on the next run (and so force
# a re-clone) unless the repository ignores these paths -- confirm against
# the repository's .gitignore.

#
# Download a file unless it is already present.
# Arguments: $1 - source URL
#            $2 - destination file
#            $3 - text shown after "Downloading" in the progress message
# Outputs:   progress message on stdout, failure message on stderr
# Returns:   0 if the file exists afterwards, non-zero if the download failed
#
# The data is written to "<destination>.part" and renamed only after curl
# succeeded, so an interrupted transfer can never be mistaken for a complete
# file by the existence check of a later run.
#
download_if_missing() {
    local url="$1" dest="$2" label="$3"
    local partial="${dest}.part"

    if [ -f "${dest}" ] ; then
        return 0
    fi

    echo "      Downloading ${label}"
    if ! curl -sfL "${url}" -o "${partial}" ; then
        rm -f -- "${partial}"
        echo "Failed to download ${url}" >&2
        return 1
    fi
    mv -f -- "${partial}" "${dest}"
}

# --- English generator model (multi-speaker, used with --language=en) ---
MODEL_NAME="en_US-libritts_r-medium.pt"
MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}"
MODEL_URL="https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/${MODEL_NAME}"
download_if_missing "${MODEL_URL}" "${MODEL_FILE}" \
    "${MODEL_NAME} for piper-sample-generator"
download_if_missing "${MODEL_URL}.json" "${MODEL_FILE}.json" \
    "${MODEL_NAME}.json for piper-sample-generator"

# --- Dutch ONNX voices (single-speaker, used with --language=nl) ---
# Working Dutch voices: pim, ronnie (nl_NL) and nathalie (nl_BE).
# nl_NL-mls-medium is intentionally excluded (known Piper issue: outputs gibberish).
HF_VOICES="https://huggingface.co/rhasspy/piper-voices/resolve/main"
declare -a NL_VOICES=(
    "nl/nl_NL/pim/medium/nl_NL-pim-medium"
    "nl/nl_NL/ronnie/medium/nl_NL-ronnie-medium"
    "nl/nl_BE/nathalie/medium/nl_BE-nathalie-medium"
)
echo "   ===== Checking Dutch Piper voices ====="
# Each voice consists of an .onnx model and its .onnx.json description; the
# local file name is the last component of the remote path.
for voice_path in "${NL_VOICES[@]}" ; do
    voice_name="$(basename "${voice_path}")"
    onnx_file="${VOICES_DIR}/${voice_name}.onnx"
    json_file="${VOICES_DIR}/${voice_name}.onnx.json"
    download_if_missing "${HF_VOICES}/${voice_path}.onnx?download=true" \
        "${onnx_file}" "${voice_name}.onnx"
    download_if_missing "${HF_VOICES}/${voice_path}.onnx.json?download=true" \
        "${json_file}" "${voice_name}.onnx.json"
done

# GPU installs use the "onnxruntime-gpu" package with its "cuda" extra;
# otherwise the plain "onnxruntime" package is installed.
if ${GPU} ; then
    onnxgpu='-gpu[cuda]'
else
    onnxgpu=""
fi
echo "   ===== Installing onnxruntime${onnxgpu} ====="
pip_install "onnxruntime${onnxgpu}>=1.16.0"

echo "   ===== Installing keras ====="
# An explicit MWW_KERAS_SPEC always wins. Without one, the known-good pin is
# installed only alongside the default stable TF 2.20; for tf-nightly or a
# custom MWW_TF_SPEC no keras pin is applied.
if [ -z "${MWW_KERAS_SPEC:-}" ] ; then
    if [ -z "${MWW_TF_SPEC:-}" ] && [[ "${TF_SPEC}" != tf-nightly* ]] ; then
        pip_install "keras==3.12.0"
    else
        echo "   Skipping explicit keras pin for ${TF_SPEC} (set MWW_KERAS_SPEC to force one)."
    fi
else
    pip_install "${MWW_KERAS_SPEC}"
fi

# -----------------------------------------------------------------------------
# Optional CUDA data dir (GPU-only)
# Some stacks expect a CUDA "nvvm/libdevice" tree. We create one in
# ${DATA_DIR}/cuda and link Triton's libdevice if it exists. This is safe and
# does NOT enable any extra XLA flags by itself.
#
# The file is looked up in the site-packages directories reported by the
# active interpreter. A recursive "**" glob from the current directory is not
# enough on its own: Python's glob does not match names starting with a dot,
# so it never descends into ".venv". The result is made absolute because
# "ln -s" stores its target verbatim and a relative target would be resolved
# against LIBDEVICE_DIR, leaving a dangling link.
# -----------------------------------------------------------------------------
if ${GPU} ; then
    CUDA_DATA_DIR="${DATA_DIR}/cuda"
    LIBDEVICE_DIR="${CUDA_DATA_DIR}/nvvm/libdevice"
    mkdir -p "${LIBDEVICE_DIR}"

    # Prints the absolute path of libdevice.10.bc, or nothing if not found.
    TRITON_LIBDEVICE="$(
        python - <<'PY'
import glob
import os
import sysconfig

relative = os.path.join("triton", "backends", "nvidia", "lib", "libdevice.10.bc")

candidates = []
install_paths = sysconfig.get_paths()
for key in ("purelib", "platlib"):
    site_dir = install_paths.get(key)
    if site_dir:
        candidates.append(os.path.join(site_dir, relative))

# Fallback: the previous search below the current directory (non-hidden
# directories only).
candidates.extend(
    glob.glob("**/site-packages/triton/backends/nvidia/lib/libdevice.10.bc", recursive=True)
)

found = next((os.path.abspath(p) for p in candidates if os.path.isfile(p)), "")
print(found, end="")
PY
    )"

    if [ -n "${TRITON_LIBDEVICE}" ] ; then
        ln -sf "${TRITON_LIBDEVICE}" "${LIBDEVICE_DIR}/libdevice.10.bc"
        echo "   Linked Triton libdevice.10.bc to ${LIBDEVICE_DIR}"
    else
        echo "   ℹ️  Triton libdevice.10.bc not found (ok)"
    fi
fi

# Run the sibling test_python script against the data directory; with
# "set -e" in effect a failure stops the script before the marker is written.
"${PROGDIR}/test_python" --data-dir="${DATA_DIR}"

# Marker file, created in the current directory. Its presence in the data
# directory is what lets a later run reuse the existing virtualenv.
touch .mww-data-dir
END_TS="${EPOCHSECONDS}"

printf '%s\n' "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."

# print_elapsed_time is not defined in this file; presumably it comes from
# shell.functions and reports the time between the two timestamps -- TODO
# confirm.
print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"