mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
Train from the command line
The files in the `cli` directory allow you to train wake words from the command line without needing to use the Jupyter notebook or a web browser. Basically, the logic from the notebook has been placed in separate shell scripts and python files wrapped by 3 high-level scripts that do the following: * setup_python_venv: Creates a Python virtual environment with all the packages needed to train. The venv is created in the container's /data directory and is therefore stored on the host, not in the container's root docker volume. * setup_training_datasets: Downloads, extracts and converts the MIT RIR, FMA, Audioset and Negative training reference datasets. Also stored in /data. * train_wake_word: Generates the wake word samples, augments them with the audio from the training datasets, and finally runs the microwakeword training. The resulting model tflite and json files are placed in the /data/output directory. See the README.md file for much more information.
This commit is contained in:
135
cli/.bashrc
Normal file
135
cli/.bashrc
Normal file
@@ -0,0 +1,135 @@
|
||||
# ~/.bashrc: read by bash(1) whenever a non-login shell starts.
# Sample startup files ship in /usr/share/doc/bash/examples/startup-files
# (package bash-doc).

# Nothing below matters to a non-interactive shell, so stop right away.
if [ -z "$PS1" ]; then
  return
fi

# History handling (see bash(1), HISTCONTROL / HISTSIZE / HISTFILESIZE):
#   - skip duplicate lines and lines that start with a space,
#   - keep 1000 entries in memory and 2000 in the history file,
#   - append to the history file rather than overwriting it.
HISTCONTROL=ignoredups:ignorespace
HISTSIZE=1000
HISTFILESIZE=2000
shopt -s histappend

# Re-read the terminal size after each command so LINES and COLUMNS
# stay accurate.
shopt -s checkwinsize

# Let less(1) display non-text input files; see lesspipe(1).
if [ -x /usr/bin/lesspipe ]; then
  eval "$(SHELL=/bin/sh lesspipe)"
fi

# Record the name of the chroot we are in; the prompt below displays it.
if [ -z "$debian_chroot" ] && [ -r /etc/debian_chroot ]; then
  debian_chroot=$(cat /etc/debian_chroot)
fi

# Prompt colour is opt-in: only terminals known to want it get it.
case "$TERM" in
  xterm-color)
    color_prompt=yes
    ;;
esac

# Uncomment the next line to request a coloured prompt whenever the terminal
# supports it. It is off by default so that attention stays on command
# output rather than on the prompt.
#force_color_prompt=yes

if [ -n "$force_color_prompt" ]; then
  # A working "tput setaf" is taken to mean Ecma-48 (ISO/IEC-6429) colour
  # support. Terminals lacking it are extremely rare and would more likely
  # support setf than setaf.
  if [ -x /usr/bin/tput ] && tput setaf 1 >/dev/null 2>&1; then
    color_prompt=yes
  else
    color_prompt=
  fi
fi

case "$color_prompt" in
  yes)
    PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
    ;;
  *)
    PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
    ;;
esac
unset color_prompt force_color_prompt

# On xterm/rxvt style terminals also put user@host:dir into the window title.
case "$TERM" in
  xterm* | rxvt*)
    PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
    ;;
esac

# Coloured ls/grep output, available only when dircolors is installed.
if [ -x /usr/bin/dircolors ]; then
  # Prefer the user's own colour database, fall back to the built-in one.
  test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"

  alias ls='ls --color=auto'
  #alias dir='dir --color=auto'
  #alias vdir='vdir --color=auto'

  alias grep='grep --color=auto'
  alias fgrep='fgrep --color=auto'
  alias egrep='egrep --color=auto'
fi

# Handy ls shortcuts.
alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CF'

# Personal aliases belong in ~/.bash_aliases rather than in this file.
# See /usr/share/doc/bash-doc/examples in the bash-doc package.
if [ -f ~/.bash_aliases ]; then
  . ~/.bash_aliases
fi

# Programmable completion. Leave this disabled when it is already enabled in
# /etc/bash.bashrc and /etc/profile sources /etc/bash.bashrc.
#if [ -f /etc/bash_completion ] && ! shopt -oq posix; then
#    . /etc/bash_completion
#fi
|
||||
|
||||
# ---------------------------------------------------------------------------
# Container-specific startup.
# ---------------------------------------------------------------------------

# Pull in extra shell customisations from /data/.bashrc when that file exists.
if [ -f /data/.bashrc ]; then
  . /data/.bashrc
fi

# Warn on stderr when /data is not a mount point: everything the training
# scripts create would then land in the container's own storage.
if ! mountpoint -q /data; then
  cat <<EOF >&2
=======================================================
WARNING: The /data directory is NOT mounted.
Running the training process without /data mounted
could add over 140Gb of python packages and training
files to this container's storage which is probably
NOT what you want.

You should remove this container and re-create it with
a 'docker run' option like '-v <host_work_dir>:/data'
making sure the host directory is on a device that has
enough free space.
=======================================================
EOF
fi

# Activate the Python virtual environment when it is usable. The check is on
# the activate script itself rather than on the directory, so a partially
# created /data/.venv produces the warning below instead of a raw
# "No such file or directory" error on every shell start.
if [ -f /data/.venv/bin/activate ]; then
  . /data/.venv/bin/activate
else
  cat <<EOF >&2
=======================================================
WARNING: A python virtual environment wasn't found
at /data/.venv. You'll need to run 'setup_python_venv'
before you'll be able to use this container for
training.
=======================================================
EOF

fi

# "venv" (re)activates the virtual environment on demand. A real if/else is
# used instead of "A && B || C" so the "does not exist yet" message is only
# printed when the activate script is missing, never when sourcing it fails.
alias venv='if [ -f /data/.venv/bin/activate ]; then source /data/.venv/bin/activate; else echo "/data/.venv does not exist yet"; fi'
|
||||
27
cli/Dockerfile
Normal file
27
cli/Dockerfile
Normal file
@@ -0,0 +1,27 @@
|
||||
# Since this is a pure python environment, we don't need to start
|
||||
# with a huge CUDA image. A standard Ubuntu image will do.
|
||||
FROM ubuntu:24.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
PIP_ROOT_USER_ACTION=ignore \
|
||||
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
|
||||
PATH="/root/mww-scripts:${PATH}"
|
||||
|
||||
# System deps
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
|
||||
git wget curl unzip ca-certificates nano less \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& mkdir -p /data
|
||||
|
||||
COPY --chown=root:root --chmod=0755 .bashrc /root/
|
||||
COPY --chown=root:root --chmod=0755 setup_* wake_word_sample* train_wake_word \
|
||||
test_python cudainfo system_summary shell.functions requirements.txt /root/mww-scripts/
|
||||
|
||||
# Docker and Podman send the CMD a SIGTERM when you "stop" the container. Unfortunately, bash
|
||||
# normally doesn't exit when it receives a SIGTERM so docker/podman has to wait for the "stop"
|
||||
# to timeout then SIGKILL the container.
|
||||
# This little scriptlet causes bash to exit immediately when it receives the SIGTERM.
|
||||
CMD ["/usr/bin/bash", "-c", "exec /usr/bin/bash --rcfile <(echo '[ -f ~/.bashrc ] && source ~/.bashrc ; trap exit SIGTERM ;')" ]
|
||||
507
cli/README.md
Normal file
507
cli/README.md
Normal file
@@ -0,0 +1,507 @@
|
||||
# Run training from the command line
|
||||
|
||||
## Overview
|
||||
|
||||
With these scripts and Dockerfile, you can train new wake words from the
|
||||
command line without using a Jupyter notebook.
|
||||
|
||||
Differences between this Docker image and the Jupyter notebook image:
|
||||
|
||||
* The Python training environment isn't included in the image. Instead, a
|
||||
"virtual environment" (venv) is created in the `/data` directory which you
|
||||
will have mounted to a host directory. This cuts about 7gb from the image
|
||||
and allows the virtualenv to persist across container instances.
|
||||
|
||||
* The logic from the Jupyter notebook is contained in individual Python
|
||||
and shell scripts
|
||||
|
||||
* No ports need to be exposed since the Jupyter notebook server isn't being
|
||||
run.
|
||||
|
||||
## TL;DR
|
||||
|
||||
For the impatient among you...
|
||||
|
||||
```shell
|
||||
$ mkdir /some/work/directory # On a device with more than 150GB free space
|
||||
$ docker build -t microwakeword-cli:latest .
|
||||
$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# setup_python_venv
|
||||
##### You have about 4 minutes to drink coffee
|
||||
|
||||
root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
|
||||
##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
|
||||
|
||||
root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
|
||||
##### You have about 30-45 minutes for a nap depending on available system resources.
|
||||
##### You'll be informed of where to find your trained model.
|
||||
```
|
||||
|
||||
Load the trained model on your device and give it a try but don't be surprised
|
||||
if you get a lot of missed or false activations. Read on to find out why.
|
||||
|
||||
## Get Started
|
||||
|
||||
Good, you stuck around! Now read the rest of the document before doing
|
||||
anything.
|
||||
|
||||
### Using a GPU
|
||||
|
||||
Having an Nvidia GPU available can cut the training time by up to half. The
|
||||
open-source nouveau driver shipped with Linux kernels doesn't support CUDA
|
||||
however so if you have an Nvidia GPU and want to use it for training, you'll
|
||||
need to install the official Nvidia driver from
|
||||
https://www.nvidia.com/en-in/drivers/unix/
|
||||
|
||||
### Build the image
|
||||
|
||||
You can use either Docker or Podman as your container management tool.
|
||||
`docker` is used in the examples but if you have podman, just substitute
|
||||
the command.
|
||||
|
||||
Start by navigating to the directory that contains this README file and
|
||||
the accompanying Dockerfile. Then...
|
||||
|
||||
|
||||
```shell
|
||||
docker build -t microwakeword-cli:latest .
|
||||
```
|
||||
|
||||
This should be fairly quick and result in an image that's about 320mb in size
|
||||
as it's basically a standard Ubuntu 24.04 image with a few added tools.
|
||||
|
||||
So why isn't a pre-built image available for download? Because it'll probably
|
||||
take longer to download a pre-built image than for you to create it locally.
|
||||
GitHub's container registry is notoriously erratic when it comes to download
|
||||
throughput.
|
||||
|
||||
### Create a host work directory
|
||||
|
||||
This directory will contain the Python virtual environment plus all of the
|
||||
downloaded and generated data needed for training and the final trained
|
||||
models. A full environment will need about 150gb of free space but read
|
||||
further to see how to reduce this.
|
||||
|
||||
Your `<host_data_dir>` will be mounted inside the container as `/data`.
|
||||
|
||||
The training container will start a Bash shell so if you have Bash
|
||||
aliases or Bashy things you like, create a `.bashrc` file in your
|
||||
`<host_data_dir>` and put them in there. It'll automatically be included
|
||||
any time you enter the container.
|
||||
|
||||
### Create and start the container
|
||||
|
||||
There are lots of options that control container creation. The simplest example
|
||||
will create the container and give you an interactive shell. When you exit the
|
||||
shell, the container will be stopped and removed leaving your `<host_data_dir>`
|
||||
intact.
|
||||
|
||||
```shell
|
||||
$ docker run -it --rm --gpus=all -v <host_work_directory>:/data microwakeword-cli:latest
|
||||
```
|
||||
|
||||
Options:
|
||||
|
||||
* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
|
||||
* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
|
||||
around and give it a name for training more than one wake word. You
|
||||
can stop and remove it when you're ready.
|
||||
* Add a `-d` option to start the container in the background and use `docker
|
||||
attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
|
||||
|
||||
When the container starts, you'll see:
|
||||
|
||||
```text
|
||||
=======================================================
|
||||
WARNING: A python virtual environment wasn't found
|
||||
at /data/.venv. You'll need to run setup_python_venv
|
||||
before you'll be able to use this container for
|
||||
training.
|
||||
=======================================================
|
||||
root@mww-cli:/#
|
||||
```
|
||||
|
||||
Don't worry about the python WARNING right now. You'll be creating the
|
||||
virtualenv in the next step.
|
||||
|
||||
If you've forgotten to create and/or mount your host data directory, you'll
|
||||
see an additional warning:
|
||||
|
||||
```text
|
||||
=======================================================
|
||||
WARNING: The /data directory is NOT mounted.
|
||||
Running the training process without /data mounted
|
||||
could add over 140Gb of python packages and training
|
||||
files to this container's storage which is probably
|
||||
NOT what you want.
|
||||
|
||||
You should remove this container and re-create it with
|
||||
a 'docker run' option like '-v <host_work_dir>:/data'
|
||||
making sure the host directory is on a device that has
|
||||
enough free space.
|
||||
=======================================================
|
||||
```
|
||||
|
||||
You can certainly continue but it's a "really bad idea"™ because your
|
||||
container storage could grow from a few hundred mb to over 140gb.
|
||||
|
||||
At this point, you're in a Bash shell.
|
||||
|
||||
### Create the Python virtual environment
|
||||
|
||||
The Python virtual environment will contain all the software needed to train.
|
||||
It gets created as `/data/.venv` and will take up about 11gb of disk space.
|
||||
|
||||
The scripts that do all the work will be in the container's PATH so to setup
|
||||
the virtual environment and install all of the packages, just run:
|
||||
|
||||
```text
|
||||
setup_python_venv [ --verbose ]
|
||||
|
||||
Options:
|
||||
|
||||
--verbose: Print the detailed "pip install" output.
|
||||
|
||||
```
|
||||
|
||||
When the installation is finished, a test of the major components will be
|
||||
run.
|
||||
|
||||
Once the process is done, you should change to the `/data` directory and
|
||||
activate the virtual environment with:
|
||||
|
||||
```shell
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# source .venv/bin/activate
|
||||
(.venv) root@mww-cli:/data#
|
||||
```
|
||||
|
||||
Technically, you don't need to do either of these since the scripts
|
||||
are in the PATH and they know to use the `/data` directory for everything.
|
||||
It's more of an "if you're interested" thing.
|
||||
|
||||
At this point, you have a container with all software installed.
|
||||
|
||||
## Get the reference data
|
||||
|
||||
The training process itself relies on a significant amount of audio reference
|
||||
data that creates a simulated "audio environment" that your wake word will be
|
||||
trained in. These "training datasets" include things like varying amounts of
|
||||
reverberation, background music, background conversations, background noise,
|
||||
etc. All said and done, it amounts to about 30gb of audio but with the
|
||||
downloaded archives and extracted intermediate files, you'll need about 85gb
|
||||
of free space. Thankfully, you only need to download the files once no
|
||||
matter how many wake words you want to train and since it's stored in
|
||||
`/data`, you can even remove the docker container and recreate it without
|
||||
losing any of it. There are 4 datasets that are required.
|
||||
|
||||
This is a three stage process...
|
||||
|
||||
1. Download zipfiles or tarballs. (about 30gb)
|
||||
2. Extract them. (about 50gb)
|
||||
3. Convert them into the final form. (about 31gb)
|
||||
|
||||
NOTE: The sizes add up to more than the 85gb stated earlier because one
|
||||
of the datasets doesn't need to be converted and is counted in both
|
||||
steps 2 and 3. You really do only need 85gb.
|
||||
|
||||
To download the archives, unpack them, and convert the audio to what's needed
|
||||
by the training process, run:
|
||||
|
||||
```text
|
||||
setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
|
||||
|
||||
Options:
|
||||
--cleanup-archives: Automatically delete the tarballs or zipfiles after
|
||||
they've been extracted.
|
||||
|
||||
--cleanup-intermediate-files: Automatically delete the intermediate files
|
||||
after they've been converted.
|
||||
|
||||
```
|
||||
|
||||
On a 1gb/sec Internet connection, this will take about 25 minutes.
|
||||
|
||||
The script detects if the datasets have already been downloaded, extracted
|
||||
and/or converted and skips those steps as appropriate so if you've run the
|
||||
script without the cleanup options, you can just run it again with those
|
||||
options to clean them up.
|
||||
|
||||
Now you're ready to train a wake word. Almost.
|
||||
|
||||
## Train a Wake Word
|
||||
|
||||
Training is done in 3 stages.
|
||||
|
||||
1. Generate thousands of samples of the wake word with various voices,
|
||||
pitches, speeds, inflections, etc.
|
||||
2. Augment the samples with the training datasets to add background noise, etc.
|
||||
3. Run the Tensorflow training.
|
||||
|
||||
### Generate a sample for verification
|
||||
|
||||
Before you start the full process, you're going to want to generate a single
|
||||
wake word sample and play it back to ensure it sounds right. The wake word
|
||||
should be spelled phonetically to give the sample generator the best chance
|
||||
of success.
|
||||
|
||||
```text
|
||||
root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
|
||||
===== Generating 1 sample of 'hey buster' =====
|
||||
Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
|
||||
Successfully loaded the model
|
||||
Batch 1/0 complete
|
||||
Done
|
||||
Sample available at /data/work/test_sample/hey_buster.wav
|
||||
Play it from your host.
|
||||
```
|
||||
|
||||
You should then play that file from your host. The reason I used "hey buster"
|
||||
as the wake word is to demonstrate why it's important to generate and listen
|
||||
to a sample. If you try that exact input and play it back, you'll notice
|
||||
that the generator didn't capture the "er" at the end very well. To get it to
|
||||
do so, I had to add a period on the end as a "spacer".
|
||||
"hey buster." worked much better.
|
||||
|
||||
When you're happy with the sample, you can run the full process.
|
||||
|
||||
### Run the full training process
|
||||
|
||||
```text
|
||||
train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
|
||||
[ --training-steps=<steps> ] [ --cleanup-work-dir ]
|
||||
<wake_word> [ <wake_word_title> ]
|
||||
|
||||
Options:
|
||||
--samples: The number of samples to generate for the wake word.
|
||||
Default: 20000
|
||||
|
||||
--batch-size: How many samples should be generated at a time. The more
|
||||
samples, the more memory is needed.
|
||||
Default: 100
|
||||
|
||||
--training-steps: Number of training steps. More training steps means better
|
||||
detection and false positive rates but also more time to train.
|
||||
Default: 25000
|
||||
|
||||
--cleanup-work-dir: Delete the /data/work directory after successful training.
|
||||
Default: false
|
||||
|
||||
<wake_word> The word to train spelled phonetically.
|
||||
Required.
|
||||
|
||||
<wake_word_title> An optional pretty name to save to the json metadata file.
|
||||
Default: The wake word with individual words capitalized
|
||||
and punctuation removed.
|
||||
|
||||
```
|
||||
|
||||
By default, the training process creates 20,000 samples of your wake word and
|
||||
runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results)
|
||||
in the [Extra Credit](#extra-credit) section below for
|
||||
why these are the defaults. Depending on resources available, this could take
|
||||
between 30 and 60 minutes.
|
||||
|
||||
The resulting tflite model files and logs will be placed in the
|
||||
`/data/output/<timestamp>-<wake_word>-<samples>-<training-steps>` directory
|
||||
and will therefore be available from your host in the directory you mapped
|
||||
`/data` to. File names will have non-filename-friendly characters in your
|
||||
wake word changed to underscores to make things easier. You'll need both the
|
||||
tflite and json files to load on your device. Exactly how you load them
|
||||
depends on the device and is beyond the scope of this project.
|
||||
|
||||
The only real measure of success is how well the resulting model works
|
||||
on a real device. If you encounter too many missed or false activations,
|
||||
increasing the number of samples would probably improve the results more
|
||||
than increasing the number of training steps. See
|
||||
[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
|
||||
|
||||
The output from the last step is filtered some by the script but still quite
|
||||
verbose. The full log will be available in the output directory as
|
||||
`training.log` if you're interested. Interpreting the log is beyond the scope
|
||||
of this project however.
|
||||
|
||||
You can train additional wake words or change the number of samples and
|
||||
training steps by simply running `train_wake_word` again. No need to repeat
|
||||
any of the earlier setup steps. If you change the wake word or the number of
|
||||
wake word samples, the work directory will be deleted and all 3 steps re-run.
|
||||
If you only change the number of training steps, the data from the first two
|
||||
steps is still valid and only the 3rd step is run.
|
||||
|
||||
All of the intermediate data is stored in the `/data/work` directory which will
|
||||
grow to about 17gb with 20,000 wake word samples. Once the tflite model is
|
||||
successfully generated and you're happy with the results, you can delete the
|
||||
`/data/work` directory.
|
||||
|
||||
### Training more than one wake word
|
||||
|
||||
Once you have a container running, you
|
||||
can easily train multiple wake words from your host:
|
||||
|
||||
```shell
|
||||
for wp in "hey_alexa" "hey_jenkins" ; do
|
||||
docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
|
||||
done
|
||||
```
|
||||
|
||||
### Training time examples
|
||||
|
||||
Training times depend on lots of things. These are examples only.
|
||||
Your Mileage May Vary!!!
|
||||
|
||||
```text
|
||||
===============================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 10000 samples, 100/batch Elapsed time: 0:06:17
|
||||
Augment 10000 samples Elapsed time: 0:04:05
|
||||
10000 training steps Elapsed time: 0:15:04
|
||||
==================================================
|
||||
Total Elapsed time: 0:25:26
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 10000 samples, 100/batch Elapsed time: 0:00:29
|
||||
Augment 10000 samples Elapsed time: 0:03:40
|
||||
10000 training steps Elapsed time: 0:08:00
|
||||
======================================================
|
||||
Total Elapsed time: 0:12:09
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 20000 samples, 100/batch Elapsed time: 0:10:38
|
||||
Augment 20000 samples Elapsed time: 0:07:04
|
||||
25000 training steps Elapsed time: 0:25:21
|
||||
======================================================
|
||||
Total Elapsed time: 0:43:03
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 20000 samples, 100/batch Elapsed time: 0:00:53
|
||||
Augment 20000 samples Elapsed time: 0:07:05
|
||||
25000 training steps Elapsed time: 0:19:13
|
||||
======================================================
|
||||
Total Elapsed time: 0:27:11
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 50000 samples, 100/batch Elapsed time: 0:30:47
|
||||
Augment 50000 samples Elapsed time: 0:20:22
|
||||
40000 training steps Elapsed time: 1:01:51
|
||||
==================================================
|
||||
Total Elapsed time: 1:53:00
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 50000 samples, 100/batch Elapsed time: 0:02:08
|
||||
Augment 50000 samples Elapsed time: 0:19:13
|
||||
40000 training steps Elapsed time: 0:42:23
|
||||
======================================================
|
||||
Total Elapsed time: 1:03:44
|
||||
================================================================================
|
||||
|
||||
|
||||
```
|
||||
|
||||
The sample generation process is really the only one that uses multiple CPUs so
|
||||
having fewer CPU threads available will probably make little difference.
|
||||
|
||||
## Extra Credit
|
||||
|
||||
### Training defaults
|
||||
|
||||
If you plan on training multiple wake words, you can set your own default
|
||||
training parameters by creating a `/data/.defaults.env` file with the
|
||||
following contents:
|
||||
|
||||
```shell
|
||||
# Variable names follow the command line parameters converted to upper case
|
||||
# and with the dashes ('-') converted to underscores ('_').
|
||||
export SAMPLES=10000
|
||||
export TRAINING_STEPS=10000
|
||||
|
||||
# Don't use the GPU for any operations. Stick with the CPU only.
|
||||
##export CUDA_VISIBLE_DEVICES=-1
|
||||
|
||||
```
|
||||
|
||||
### Examine your model with Tensorboard
|
||||
|
||||
Tensorboard is a web-based graphical model viewer. You can use it to get an
|
||||
idea of how many training steps are needed before accuracy results stop
|
||||
improving. To use it, you'll have to expose port 6006 by adding `-p
|
||||
6006:6006` to your `docker run` command line. If you didn't, don't worry.
|
||||
Remember, the /data directory is mapped to a directory on your host so you
|
||||
can simply stop and delete the current container and recreate it with the new
|
||||
`docker run` command. No need to re-run any of the setup or training steps.
|
||||
|
||||
To start Tensorboard, run:
|
||||
|
||||
```shell
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# source .venv/bin/activate
|
||||
(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
|
||||
```
|
||||
|
||||
Now on your host, point your browser at `http://localhost:6006/`,
|
||||
click "SCALARS" at the top and take a look at the various charts. You'll see
|
||||
a "train" and "validation" item for each training run you've performed. It's
|
||||
the "train" items you're interested in.
|
||||
|
||||
<a id="tensorboard-results"></a>
|
||||
|
||||
You have to be a Tensorflow expert to decipher most of the charts but
|
||||
the "Accuracy" chart for this particular wake word and 50,000 samples would
|
||||
seem to indicate that there's very little improvement after about 20,000
|
||||
training steps.
|
||||
|
||||

|
||||
|
||||
In contrast, with only 5,000 wake word samples, there's still improvement to be had after
|
||||
20,000 training steps.
|
||||
|
||||

|
||||
|
||||
Given that it's faster to generate wake word samples than it is to train,
|
||||
20,000 samples and 25,000 training steps seems like a good compromise. This
|
||||
chart has a bit less smoothing to show a bit more detail and includes the
|
||||
50,000 sample run as well. This run took only 27 minutes as opposed to the
|
||||
63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
|
||||
25,000 are the defaults for these scripts.
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
53
cli/cudainfo
Executable file
53
cli/cudainfo
Executable file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python
"""Print a short summary of the current CUDA device.

Output goes to stdout. When no /dev/nvidia<digit> device node exists, or when
anything goes wrong while querying the device through numba, a single
"not available" line is printed instead. The script never raises.
"""

import glob
import sys

NOT_AVAILABLE_MSG = "CUDA not available or no CUDA-capable GPU found."

# Without a /dev/nvidia<digit> node there is nothing to query, so report that
# and stop before numba is imported at all.
devices = glob.glob("/dev/nvidia[0-9]")
if len(devices) == 0:
    print(NOT_AVAILABLE_MSG)
    sys.exit(0)


# CUDA cores per streaming multiprocessor, keyed by (major, minor) compute
# capability. The entries for 3.2, 5.3, 6.2, 7.2 and 8.7 were added so that
# those devices no longer report "unknown"; the values follow the table NVIDIA
# publishes in its CUDA samples (_ConvertSMVer2Cores).
cc_cores_per_SM_dict = {
    (2, 0): 32,
    (2, 1): 48,
    (3, 0): 192,
    (3, 2): 192,
    (3, 5): 192,
    (3, 7): 192,
    (5, 0): 128,
    (5, 2): 128,
    (5, 3): 128,
    (6, 0): 64,
    (6, 1): 128,
    (6, 2): 128,
    (7, 0): 64,
    (7, 2): 64,
    (7, 5): 64,
    (8, 0): 64,
    (8, 6): 128,
    (8, 7): 128,
    (8, 9): 128,
    (9, 0): 128,
    (10, 0): 128,
    (12, 0): 128,
}

try:
    # Imported here so that a missing or broken numba install is reported the
    # same way as a missing GPU.
    from numba import cuda

    device = cuda.get_current_device()
    ctx = cuda.current_context()
    meminfo = ctx.get_memory_info()
    compute_capability = device.compute_capability
    sms = device.MULTIPROCESSOR_COUNT

    cores_per_sm = cc_cores_per_SM_dict.get(compute_capability)
    if cores_per_sm is None:
        # Compute capability not in the table: print placeholders rather than
        # guessing a core count.
        cores_per_sm = "unknown"
        total_cores = "unknown"
    else:
        total_cores = cores_per_sm * sms

    # device.name may be str or bytes; decode only in the latter case.
    gpu_name = device.name if isinstance(device.name, str) else device.name.decode()
    cc_text = ".".join(map(str, compute_capability))
    mib = 1024 * 1024

    print(f" GPU Name: {gpu_name}")
    print(f" Compute Capability: {cc_text:>7}")
    print(f"Streaming Multiprocessors: {sms:>7}")
    print(f" CUDA Cores per SM: {cores_per_sm:>7}")
    print(f" Total CUDA Cores: {total_cores:>7}")
    print(f" Total Memory: {meminfo.total / mib:>7.0f} mb")
    print(f" Free Memory: {meminfo.free / mib:>7.0f} mb")
except Exception:
    # Deliberately best-effort: any failure is reported as "no CUDA" so the
    # script always produces output and never raises.
    print(NOT_AVAILABLE_MSG)
|
||||
10
cli/requirements.txt
Normal file
10
cli/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
# --- Packages needed by our scripts ---
|
||||
|
||||
numpy==1.26.4
|
||||
scipy==1.12.0
|
||||
librosa==0.10.2.post1
|
||||
soundfile==0.12.1
|
||||
tqdm==4.67.1
|
||||
scikit-learn==1.6.0
|
||||
numba==0.63.1
|
||||
PyYAML==6.0.3
|
||||
175
cli/setup_audioset
Executable file
175
cli/setup_audioset
Executable file
@@ -0,0 +1,175 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
# Print usage and exit when help was requested. HELP and DATA_DIR are set by
# the sourced shell.functions.
if [ "${HELP}" == "true" ] ; then
  cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]

  --cleanup-archives : Automatically clean up any downloaded archives after
                       extraction.
  --cleanup-intermediate-files
                     : Automatically clean up the intermediate files after they've
                     : been converted to 16k.
  <data_dir>         : Path to the data directory.
                     : Default: ${DATA_DIR}

EOF
  exit 1
fi
|
||||
|
||||
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
|
||||
cd "${DATA_DIR}/training_datasets"
|
||||
|
||||
echo "***** Checking audioset *****"
|
||||
|
||||
AUDIO_URL="https://huggingface.co/datasets/agkphysics/AudioSet/resolve"
|
||||
AUDIO_DIR="./audioset"
|
||||
mkdir -p "${AUDIO_DIR}"
|
||||
AUDIO16K_DIR="./audioset_16k"
|
||||
mkdir -p "${AUDIO16K_DIR}"
|
||||
AUDIO_FILECOUNT="./downloads/audioset_filecount"
|
||||
AUDIO_IN_GLOB="*.flac"
|
||||
|
||||
declare -A filecounts
|
||||
for i in {0..9} ; do
|
||||
fname="bal_train0${i}.tar"
|
||||
filecounts[${fname}]=0
|
||||
done
|
||||
|
||||
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
|
||||
REV_CANDIDATES=(
|
||||
"6762f044d1c88619c7f2006486036192128fb07e"
|
||||
"0049167e89f259a010c3f070fe3666d9e5242836"
|
||||
"ceb9eaaa7844c9ad7351e659c84a572e376ad06d"
|
||||
"main"
|
||||
)
|
||||
|
||||
TAR_PATTERNS=(
|
||||
"data/bal_train0"
|
||||
"data/bal_train/bal_train0"
|
||||
)
|
||||
|
||||
# Probe Hugging Face for the first (revision, tarball path pattern) pair whose
# "<pattern>0.tar" can be fetched.
# Globals:   REV_CANDIDATES, TAR_PATTERNS (read); AUDIO_URL (read, optional)
# Outputs:   "<rev>,<pattern>" for the first hit, or an empty line if nothing
#            is reachable.
# Returns:   0 in both cases, so `dl=$(find_rev)` never trips `set -e`; the
#            caller checks for an empty result.
find_rev() {
  local base_url="${AUDIO_URL:-https://huggingface.co/datasets/agkphysics/AudioSet/resolve}"
  local rev pattern url
  for rev in "${REV_CANDIDATES[@]}" ; do
    for pattern in "${TAR_PATTERNS[@]}" ; do
      url="${base_url}/${rev}/${pattern}0.tar"
      if curl -I -L --fail -s "${url}" > /dev/null ; then
        # Stop at the first hit. Printing every hit would make the caller's
        # ${dl%%,*} / ${dl##*,} split pair the first revision with the last
        # pattern.
        echo "${rev},${pattern}"
        return 0
      fi
    done
  done
  echo ""
}
|
||||
|
||||
# Convert every FLAC under ${AUDIO_DIR} to a 16 kHz mono 16-bit WAV in
# ${AUDIO16K_DIR}, using the Python venv in ${DATA_DIR}/.venv.
# Files whose output already exists are skipped. Files that fail to convert
# are listed, one per line, in ${AUDIO16K_DIR}/audioset_corrupted_files.log.
# Globals: DATA_DIR, AUDIO_DIR, AUDIO16K_DIR (read)
converter() {
  source "${DATA_DIR}/.venv/bin/activate"
  # Quoted delimiter: the Python source is passed through verbatim.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'PYEOF'
import os, sys, subprocess, scipy.io.wavfile, numpy as np
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm

def write_wav(dst: Path, data: np.ndarray, sr: int):
    # Clip to [-1, 1] before scaling to int16 so out-of-range samples
    # saturate instead of wrapping.
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))

audioset_dir = Path(sys.argv[1])
audioset_out = Path(sys.argv[2])

# convert FLAC → 16k mono WAV
flacs = list(audioset_dir.rglob("*.flac"))
print(f" FLAC files: {len(flacs)}")
audioset_bad = []
ok = 0
for p in tqdm(flacs, desc=" AudioSet→WAV (resample 16k mono)"):
    try:
        outfile = Path(audioset_out / (p.stem + ".wav"))
        if outfile.exists():
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # Collapse whitespace so each failure occupies exactly one log line.
        audioset_bad.append(f"{p}:{' '.join(str(e).split())}")

if audioset_bad:
    # One newline-terminated line per failure: the calling script counts
    # failures with `wc -l`, which counts newline characters.
    (audioset_out / "audioset_corrupted_files.log").write_text(
        "".join(f"{line}\n" for line in audioset_bad))
print(f" AudioSet complete ({ok} ok, {len(audioset_bad)} failed)")
PYEOF
}
|
||||
|
||||
expected_filecount=$(get_total_filecount filecounts)
|
||||
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
|
||||
write_filecount=false
|
||||
|
||||
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
|
||||
echo " Existing Audioset valid"
|
||||
else
|
||||
dl=$(find_rev)
|
||||
[ -n "$dl" ] || { echo " Could not locate an AudioSet revision with FLAC tarballs still present on HF." ; exit 1 ; }
|
||||
rev=${dl%%,*}
|
||||
pattern=${dl##*,}
|
||||
echo " Checking 10 tarballs"
|
||||
for i in {0..9} ; do
|
||||
fname="downloads/bal_train0${i}.tar"
|
||||
if [ ! -f "${fname}" ] ; then
|
||||
echo " Downloading bal_train0${i}.tar"
|
||||
url="${AUDIO_URL}/${rev}/${pattern}${i}.tar"
|
||||
curl -L -s --fail "${url}" -o "${fname}" || { echo "Could not fetch ${fname} at rev ${rev}; continuing." ; continue ; }
|
||||
fi
|
||||
|
||||
tarball_filecount=$(tar -tvf "${fname}" | wc -l )
|
||||
filecounts["bal_train0${i}.tar"]=${tarball_filecount}
|
||||
write_filecount=true
|
||||
|
||||
echo " Untarring bal_train0${i}.tar"
|
||||
tar -xf "${fname}" -C "${AUDIO_DIR}"
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${fname}" ] ; then
|
||||
echo " Cleaning up bal_train0${i}.tar"
|
||||
rm -rf "${fname}"
|
||||
fi
|
||||
done
|
||||
rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
|
||||
converter
|
||||
if [ -f "${AUDIO16K_DIR}/audioset_corrupted_files.log" ] ; then
|
||||
failed=$(cat "${AUDIO16K_DIR}/audioset_corrupted_files.log" | wc -l)
|
||||
filecounts[failed]=-${failed}
|
||||
fi
|
||||
expected_filecount=$(get_total_filecount filecounts)
|
||||
actual_filecount=$(find ${AUDIO16K_DIR} -name "*.wav" 2>/dev/null | wc -l) || :
|
||||
if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
|
||||
echo " Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if ${write_filecount} ; then
|
||||
write_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" ; then
|
||||
for i in {0..9} ; do
|
||||
fname="downloads/bal_train0${i}.tar"
|
||||
if [ -f "${fname}" ] ; then
|
||||
echo " Cleaning up bal_train0${i}.tar"
|
||||
rm -rf "${fname}"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_DIR}"
|
||||
rm -rf "${AUDIO_DIR}"
|
||||
fi
|
||||
|
||||
echo " Audioset complete"
|
||||
exit 0
|
||||
|
||||
131
cli/setup_fma
Executable file
131
cli/setup_fma
Executable file
@@ -0,0 +1,131 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
# Print usage and exit when help was requested. HELP and DATA_DIR are set by
# the sourced shell.functions.
if [ "${HELP}" == "true" ] ; then
  cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]

  --cleanup-archives : Automatically clean up any downloaded archives after
                       extraction.
  --cleanup-intermediate-files
                     : Automatically clean up the intermediate files after they've
                     : been converted to 16k.
  <data_dir>         : Path to the data directory.
                     : Default: ${DATA_DIR}

EOF
  exit 1
fi
|
||||
|
||||
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
|
||||
cd "${DATA_DIR}/training_datasets"
|
||||
|
||||
echo "***** Checking FMA *****"
|
||||
|
||||
AUDIO_URL="https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/fma_xs.zip"
|
||||
AUDIO_ZIPFILE="fma_xs.zip"
|
||||
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
|
||||
AUDIO_DIR="fma"
|
||||
mkdir -p "${AUDIO_DIR}" || :
|
||||
AUDIO16K_DIR="fma_16k"
|
||||
mkdir -p "${AUDIO16K_DIR}" || :
|
||||
AUDIO_FILECOUNT="./downloads/fma_filecount"
|
||||
AUDIO_IN_GLOB="*.mp3"
|
||||
|
||||
declare -A filecounts=( [${AUDIO_ZIPFILE}]=0 )
|
||||
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
# Convert every MP3 under ${AUDIO_DIR} to a 16 kHz mono 16-bit WAV in
# ${AUDIO16K_DIR}, using the Python venv in ${DATA_DIR}/.venv.
# Files whose output already exists are skipped. Files that fail to convert
# are listed, one per line, in ${AUDIO16K_DIR}/fma_corrupted_files.log.
# Globals: DATA_DIR, AUDIO_DIR, AUDIO16K_DIR (read)
converter() {
  source "${DATA_DIR}/.venv/bin/activate"
  # Quoted delimiter: the Python source is passed through verbatim.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'PYEOF'
import os, sys, subprocess, scipy.io.wavfile, numpy as np
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm

def write_wav(dst: Path, data: np.ndarray, sr: int):
    # Clip to [-1, 1] before scaling to int16 so out-of-range samples
    # saturate instead of wrapping.
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))

fma_dir = Path(sys.argv[1])
fma_out = Path(sys.argv[2])

# convert MP3 → 16k mono WAV
mp3s = list(fma_dir.rglob("*.mp3"))
print(f" MP3 files: {len(mp3s)}")
fma_bad = []
ok = 0
for p in tqdm(mp3s, desc=" FMA→WAV (resample 16k mono)"):
    try:
        outfile = Path(fma_out / (p.stem + ".wav"))
        if outfile.exists():
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # Collapse whitespace so each failure occupies exactly one log line.
        fma_bad.append(f"{p}:{' '.join(str(e).split())}")

if fma_bad:
    # One newline-terminated line per failure, so line-counting tools report
    # the true number of failed files.
    (fma_out / "fma_corrupted_files.log").write_text(
        "".join(f"{line}\n" for line in fma_bad))
print(f" FMA complete ({ok} ok, {len(fma_bad)} failed)")
PYEOF
}
|
||||
|
||||
expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
|
||||
actual_filecount=$(find ${AUDIO16K_DIR} -name '*.wav' 2>/dev/null | wc -l) || :
|
||||
write_filecount=false
|
||||
|
||||
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
|
||||
echo " Existing FMA valid"
|
||||
else
|
||||
actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
|
||||
if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
|
||||
if [ ! -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Downloading ${AUDIO_ZIPFILE}"
|
||||
curl -sfL "${AUDIO_URL}" -o "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
rm -rf "${AUDIO_DIR}" || :
|
||||
mkdir "${AUDIO_DIR}"
|
||||
echo " Unzipping ${AUDIO_ZIPFILE}"
|
||||
unzip -q -d "${AUDIO_DIR}" "${AUDIO_ZIP}"
|
||||
fi
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
converter
|
||||
|
||||
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
|
||||
filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
|
||||
write_filecount=true
|
||||
fi
|
||||
|
||||
if ${write_filecount} ; then
|
||||
write_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ]; then
|
||||
echo " Cleaning up ${AUDIO_DIR}"
|
||||
rm -rf "${AUDIO_DIR}"
|
||||
fi
|
||||
|
||||
echo " FMA complete"
|
||||
exit 0
|
||||
|
||||
124
cli/setup_mit_audio
Executable file
124
cli/setup_mit_audio
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
# Print usage and exit when help was requested. HELP and DATA_DIR are set by
# the sourced shell.functions.
if [ "${HELP}" == "true" ] ; then
  cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]

  --cleanup-archives : Automatically clean up any downloaded archives after
                       extraction.
  --cleanup-intermediate-files
                     : Automatically clean up the intermediate files after they've
                     : been converted to 16k.
  <data_dir>         : Path to the data directory.
                     : Default: ${DATA_DIR}

EOF
  exit 1
fi
|
||||
|
||||
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
|
||||
cd "${DATA_DIR}/training_datasets"
|
||||
|
||||
AUDIO_URL="https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip"
|
||||
AUDIO_ZIPFILE="MIT_RIR_Audio.zip"
|
||||
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
|
||||
AUDIO_DIR="./mit_rirs"
|
||||
mkdir -p "${AUDIO_DIR}" || :
|
||||
AUDIO16K_DIR="./mit_rirs_16k"
|
||||
mkdir -p "${AUDIO16K_DIR}" || :
|
||||
AUDIO_FILECOUNT="./downloads/mit_rir_filecount"
|
||||
AUDIO_IN_GLOB="*.wav"
|
||||
|
||||
declare -A filecounts=( [${AUDIO_ZIPFILE}]=0 )
|
||||
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
echo "===== Checking MIT_RIR ====="
|
||||
|
||||
# Normalize every WAV under ${AUDIO_DIR} to a 16 kHz mono 16-bit WAV of the
# same file name in ${AUDIO16K_DIR}, using the Python venv in
# ${DATA_DIR}/.venv. Files whose output already exists are skipped. Any
# failure is printed and re-raised, so the Python process exits non-zero.
# Globals: DATA_DIR, AUDIO_DIR, AUDIO16K_DIR (read)
converter() {
  source "${DATA_DIR}/.venv/bin/activate"
  # Quoted delimiter: the Python source is passed through verbatim.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'PYEOF'
import os, sys, subprocess, scipy.io.wavfile, numpy as np
from pathlib import Path
import soundfile as sf
import librosa
from tqdm import tqdm

def write_wav(dst: Path, data: np.ndarray, sr: int):
    # Clip to [-1, 1] before scaling to int16 so out-of-range samples
    # saturate instead of wrapping.
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))

rir_in = Path(sys.argv[1])
rir_out = Path(sys.argv[2])

waves = list(rir_in.rglob("*.wav"))
try:
    print(" MIT RIR normalizing to 16k…")
    # Normalize to 16k mono
    for p in tqdm(waves, desc=" MIT_RIR (resample 16k mono)"):
        outfile = Path(rir_out / p.name)
        if outfile.exists():
            continue
        a, sr = sf.read(p, always_2d=False)
        if a.ndim > 1:
            # Multi-channel input: keep the first channel only.
            a = a[:, 0]
        if sr != 16000:
            # Wrong sample rate: reload through librosa, which resamples.
            a, _ = librosa.load(p, sr=16000, mono=True)
        write_wav(outfile, a, 16000)
    print(" MIT RIR normalization complete")
except Exception as e2:
    print(f" MIT RIR fallback failed: {e2}")
    raise
PYEOF
}
|
||||
|
||||
expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
|
||||
actual_filecount=$(find "${AUDIO16K_DIR}" -name '*.wav' 2>/dev/null | wc -l) || :
|
||||
write_filecount=false
|
||||
|
||||
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
|
||||
echo " Existing ${AUDIO16K_DIR} valid"
|
||||
else
|
||||
actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
|
||||
if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
|
||||
if [ ! -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Downloading ${AUDIO_ZIPFILE}"
|
||||
curl -sfL "${AUDIO_URL}" -o "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
rm -rf "${AUDIO_DIR}" || :
|
||||
echo " Unzipping ${AUDIO_ZIPFILE}"
|
||||
unzip -u -q -d "${AUDIO_DIR}" "${AUDIO_ZIP}"
|
||||
fi
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
converter
|
||||
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
|
||||
filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
|
||||
write_filecount=true
|
||||
fi
|
||||
|
||||
if ${write_filecount} ; then
|
||||
write_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ]; then
|
||||
echo " Cleaning up ${AUDIO_DIR}"
|
||||
rm -rf "${AUDIO_DIR}"
|
||||
fi
|
||||
|
||||
echo " MIT_RIR complete"
|
||||
exit 0
|
||||
85
cli/setup_negative_datasets
Executable file
85
cli/setup_negative_datasets
Executable file
@@ -0,0 +1,85 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
# Print usage and exit when help was requested. HELP and DATA_DIR are set by
# the sourced shell.functions.
if [ "${HELP}" == "true" ] ; then
  cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --data-dir=<data_dir> ]

  --cleanup-archives : Automatically clean up any downloaded archives after
                       extraction.
  <data_dir>         : Path to the data directory.
                     : Default: ${DATA_DIR}

EOF
  exit 1
fi
|
||||
|
||||
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
|
||||
cd "${DATA_DIR}/training_datasets"
|
||||
|
||||
mkdir -p ./negative_datasets || :
|
||||
|
||||
NEGATIVE_DATASET_URL="https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main"
|
||||
declare -a NEGATIVE_DATASETS=( dinner_party dinner_party_eval no_speech speech )
|
||||
AUDIO_FILECOUNT="./downloads/negative_filecount"
|
||||
|
||||
declare -A filecounts=( [dinner_party.zip]=0 [dinner_party_eval.zip]=0 [no_speech.zip]=0 [speech.zip]=0 )
|
||||
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
echo "===== Checking negative datasets: ${NEGATIVE_DATASETS[*]} ====="
|
||||
write_filecount=false
|
||||
|
||||
for ds in "${NEGATIVE_DATASETS[@]}" ; do
|
||||
AUDIO_ZIPFILE="${ds}.zip"
|
||||
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
|
||||
AUDIO_DIR="./negative_datasets/${ds}"
|
||||
mkdir -p "${AUDIO_DIR}" || :
|
||||
|
||||
expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
|
||||
actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || :
|
||||
|
||||
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
|
||||
echo " Existing ${ds} valid"
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ ! -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Downloading ${AUDIO_ZIPFILE}"
|
||||
curl -sfL "${NEGATIVE_DATASET_URL}/${ds}.zip" -o "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
rm -rf "${AUDIO_DIR}" || :
|
||||
echo " Unzipping ${AUDIO_ZIPFILE}"
|
||||
unzip -q -d "./negative_datasets" "${AUDIO_ZIP}"
|
||||
actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || :
|
||||
filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
|
||||
write_filecount=true
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
done
|
||||
|
||||
if ${write_filecount} ; then
|
||||
write_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" ; then
|
||||
for ds in "${NEGATIVE_DATASETS[@]}" ; do
|
||||
AUDIO_ZIPFILE="${ds}.zip"
|
||||
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
|
||||
if [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
echo " Negative datasets complete"
|
||||
|
||||
183
cli/setup_python_venv
Executable file
183
cli/setup_python_venv
Executable file
@@ -0,0 +1,183 @@
|
||||
#!/bin/bash
|
||||
PROGDIR="$(dirname $(realpath $0))"
|
||||
|
||||
KNOWN_ARGS=( data-dir python gpu no-gpu )
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
|
||||
echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
|
||||
HELP=true
|
||||
fi
|
||||
|
||||
if [ "${HELP}" == "true" ] ; then
|
||||
cat <<EOF >&2
|
||||
Usage: setup_python_venv [ --gpu | --no-gpu ] [ --verbose ]
|
||||
|
||||
Options:
|
||||
--gpu: Install the GPU-capable versions of packages if available. This
|
||||
is the default if the script detects that a GPU is available.
|
||||
|
||||
--no-gpu: Install the non-GPU-capable versions of packages even if
|
||||
GPU-capable packages are available. This is the default if the script
|
||||
detects that a GPU is NOT available.
|
||||
|
||||
--verbose: Print the detailed "pip install" output.
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath ${DATA_DIR})"
|
||||
[ -d "${DATA_DIR}" ] || {
|
||||
echo "Data directory '${DATA_DIR}' doesn't exist." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
cd "${DATA_DIR}"
|
||||
|
||||
[ -z "${GPU}" ] && {
|
||||
GPU=false
|
||||
[ -c /dev/nvidiactl ] && {
|
||||
GPU=true
|
||||
echo " Nvidia GPU detected"
|
||||
}
|
||||
}
|
||||
|
||||
"${GPU}" || export CUDA_VISIBLE_DEVICES=-1
|
||||
|
||||
VENV="${DATA_DIR}/.venv"
|
||||
[ -n "${VIRTUAL_ENV}" ] && deactivate
|
||||
|
||||
if [ -n "${PYTHON}" ] ; then
|
||||
PYTHONS=( "${PYTHON}" )
|
||||
unset PYTHON
|
||||
else
|
||||
PYTHONS=( python3.12 python3.10 )
|
||||
fi
|
||||
|
||||
for p in "${PYTHONS[@]}" ; do
|
||||
"${p}" --version &>/dev/null && { PYTHON="${p}" ; break ; }
|
||||
done
|
||||
|
||||
[ -n "${PYTHON}" ] || {
|
||||
echo "A python 3.12 or 3.10 interpreter wasn't found. You 'll need to install one before proceeding." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
if [ -d "${VENV}" ] ; then
|
||||
if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then
|
||||
source "${VENV}/bin/activate" || {
|
||||
echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
|
||||
exit 1
|
||||
}
|
||||
else
|
||||
rm -rf "${VENV}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "===== Setting up Python environment ${VENV} ====="
|
||||
|
||||
if [ -z "$VIRTUAL_ENV" ] ; then
|
||||
echo " ===== Creating new virtualenv at '${VENV}' ====="
|
||||
else
|
||||
echo " ===== Updating virtualenv at '${VENV}' ====="
|
||||
fi
|
||||
${PYTHON} -m venv --upgrade-deps "${VENV}"
|
||||
source "${VENV}/bin/activate"
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
declare -a progfiles=( $(find ${PROGDIR} -mindepth 1 -maxdepth 1 -executable -type f) )
|
||||
progfiles+=( "${PROGDIR}/shell.functions" )
|
||||
|
||||
for f in "${progfiles[@]}" ; do
|
||||
ln -sfr "${f}" ".venv/bin/$(basename ${f})"
|
||||
done
|
||||
|
||||
#
|
||||
# Pip doesn't process packages from requirements.txt in
|
||||
# order but order is important because tensorflow, torch,
|
||||
# onnxruntime and micro-wake-word all depend on CUDA packages
|
||||
# at various versions. They need to be installed in this specific
|
||||
# order or they may not be able to use the GPU.
|
||||
#
|
||||
export PIP_PROGRESS_BAR=off
|
||||
export PIP_NO_COLOR=1
|
||||
export PIP_QUIET=0
|
||||
|
||||
# Run `pip install` with the given arguments.
# With VERBOSE=true pip's output is shown unchanged. Otherwise every line of
# pip's stdout is reduced to a single "." as a progress indicator.
# Globals:   VERBOSE (read)
# Arguments: passed straight through to `pip install`
# Returns:   0 on success, 1 if pip (or a filter stage) failed. The result
#            does not depend on the caller having `pipefail` or `errexit` set.
pip_install() {
  local -a stages=()
  local status
  if "${VERBOSE}" ; then
    pip install "$@" || return 1
  else
    # Capture PIPESTATUS on both branches: the pipeline's own exit status
    # only reflects pip when `pipefail` is active.
    pip install "$@" \
      | stdbuf -i0 -o0 tr -d '[:print:]' \
      | stdbuf -i0 -o0 tr '\n' '.' \
      && stages=( "${PIPESTATUS[@]}" ) || stages=( "${PIPESTATUS[@]}" )
    for status in "${stages[@]}" ; do
      if [ "${status}" -ne 0 ] ; then
        echo
        return 1
      fi
    done
  fi
  echo
}
|
||||
|
||||
START_TS=$EPOCHSECONDS
|
||||
|
||||
echo " ===== Installing common requirements ====="
|
||||
pip_install -r "${PROGDIR}/requirements.txt"
|
||||
|
||||
${GPU} && tfgpu='[and-cuda]' || tfgpu=""
|
||||
echo " ===== Installing Tensorflow${tfgpu} ====="
|
||||
pip_install ai_edge_litert "tensorflow${tfgpu}==2.20.0" "tensorboard==2.20.0" \
|
||||
"tensorboard-data-server==0.7.2"
|
||||
|
||||
${GPU} && torchgpu='--index-url https://download.pytorch.org/whl/cu129' || torchgpu=""
|
||||
echo " ===== Installing torch and torchaudio ${torchgpu:+[cuda]} ====="
|
||||
pip_install "torch==2.9.1" "torchaudio==2.9.1" ${torchgpu}
|
||||
|
||||
echo " ===== Checking microwakeword ====="
|
||||
MWW="${DATA_DIR}/tools/microWakeWord"
|
||||
if [ ! -d "${MWW}" ] || [ -n "$(git -C "${MWW}" status --porcelain)" ] ; then
|
||||
rm -rf "${MWW}" || :
|
||||
echo " Cloning micro-wake-word to ${DATA_DIR}/tools"
|
||||
git clone https://github.com/TaterTotterson/micro-wake-word "${MWW}" &>/dev/null
|
||||
fi
|
||||
echo " Installing microwakeword"
|
||||
pip_install -e "${MWW}"
|
||||
|
||||
echo " ===== Checking piper-sample-generator ====="
|
||||
PSG="${DATA_DIR}/tools/piper-sample-generator"
|
||||
if [ ! -d "${PSG}" ] || [ -n "$(git -C ${PSG} status --porcelain)" ] ; then
|
||||
rm -rf "${PSG}" || :
|
||||
echo " Cloning piper-sample-generator to ${DATA_DIR}/tools"
|
||||
git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null
|
||||
fi
|
||||
echo " Installing piper-sample-generator"
|
||||
pip_install -e "${PSG}"
|
||||
git -C tools/piper-sample-generator clean -fd &>/dev/null
|
||||
|
||||
MODELS_DIR="${PSG}/models"
|
||||
MODEL_NAME="en_US-libritts_r-medium.pt"
|
||||
MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}"
|
||||
MODEL_URL="https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/${MODEL_NAME}"
|
||||
if [ ! -f "${MODEL_FILE}" ] ; then
|
||||
echo " Downloading ${MODEL_NAME} for piper-sample-generator"
|
||||
curl -sfL "${MODEL_URL}" -o "${MODEL_FILE}"
|
||||
fi
|
||||
|
||||
if [ ! -f "${MODEL_FILE}.json" ] ; then
|
||||
echo " Downloading ${MODEL_NAME}.json for piper-sample-generator"
|
||||
curl -sfL "${MODEL_URL}.json" -o "${MODEL_FILE}.json"
|
||||
fi
|
||||
|
||||
${GPU} && onnxgpu='-gpu[cuda]' || onnxgpu=""
|
||||
echo " ===== Installing onnxruntime${onnxgpu} ====="
|
||||
pip_install "onnxruntime${onnxgpu}>=1.16.0"
|
||||
|
||||
echo " ===== Installing keras ====="
|
||||
# keras 3.13 has "issues" so we need to back down to 3.12.
|
||||
pip_install "keras==3.12.0"
|
||||
|
||||
${PROGDIR}/test_python --data-dir="${DATA_DIR}"
|
||||
|
||||
touch .mww-data-dir
|
||||
END_TS=$EPOCHSECONDS
|
||||
|
||||
echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
|
||||
|
||||
print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
|
||||
|
||||
|
||||
48
cli/setup_training_datasets
Executable file
48
cli/setup_training_datasets
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files )
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
|
||||
echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
|
||||
HELP=true
|
||||
fi
|
||||
|
||||
if [ "${HELP}" == "true" ] ; then
|
||||
cat <<EOF >&2
|
||||
Usage: setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
|
||||
|
||||
Options:
|
||||
--cleanup-archives: Automatically delete the tarballs or zipfiles after
|
||||
they've been extracted.
|
||||
|
||||
--cleanup-intermediate-files: Automatically delete the intermediate files
|
||||
after they've been converted.
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd "${DATA_DIR}"
|
||||
|
||||
START_TS=$EPOCHSECONDS
|
||||
echo -e "\n===== Setting up Training Datasets =====\n"
|
||||
|
||||
${PROGDIR}/setup_negative_datasets --cleanup-archives=${CLEANUP_ARCHIVES} \
|
||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
||||
|
||||
${PROGDIR}/setup_mit_audio --cleanup-archives=${CLEANUP_ARCHIVES} \
|
||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
||||
|
||||
${PROGDIR}/setup_audioset --cleanup-archives=${CLEANUP_ARCHIVES} \
|
||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
||||
|
||||
${PROGDIR}/setup_fma --cleanup-archives=${CLEANUP_ARCHIVES} \
|
||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
||||
|
||||
END_TS=$(date +%s.%N)
|
||||
print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup"
|
||||
150
cli/shell.functions
Normal file
150
cli/shell.functions
Normal file
@@ -0,0 +1,150 @@
|
||||
|
||||
if [ "$0" == "${BASH_SOURCE[0]}" ] ; then
|
||||
echo "${BASH_SOURCE[0]} is meant to be 'sourced' not run directly" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -v DATA_DIR ] ; then
|
||||
[ -f .mww-data-dir ] && DATA_DIR="${PWD}" || DATA_DIR="/data"
|
||||
fi
|
||||
|
||||
DEFAULT_SAMPLES=20000
|
||||
DEFAULT_BATCH_SIZE=100
|
||||
DEFAULT_TRAINING_STEPS=25000
|
||||
|
||||
[ -f "${DATA_DIR}/.defaults.env" ] && source "${DATA_DIR}/.defaults.env" || :
|
||||
|
||||
: "${SAMPLES:=${DEFAULT_SAMPLES}}"
|
||||
: "${BATCH_SIZE:=${DEFAULT_BATCH_SIZE}}"
|
||||
: "${TRAINING_STEPS:=${DEFAULT_TRAINING_STEPS}}"
|
||||
: "${CLEANUP_WORK_DIR:=false}"
|
||||
: "${CLEANUP_ARCHIVES:=false}"
|
||||
: "${CLEANUP_INTERMEDIATE_FILES:=false}"
|
||||
: "${QUIET:=false}"
|
||||
: "${VERBOSE:=false}"
|
||||
|
||||
HELP=false

# Scripts that want unknown-option detection define KNOWN_ARGS before sourcing
# this file; the options handled here are always accepted.
if [ -v KNOWN_ARGS ] ; then
  KNOWN_ARGS+=( help verbose quiet h v q )
fi

declare -gi OPTION_COUNT=0
declare -ga POSITIONAL_ARGS=()
declare -ga EXTRA_ARGS=()
declare -ga UNKNOWN_ARGS=()

# Parse command-line arguments into global variables.
#   -h | --help      HELP=true (parsing stops; the caller prints usage)
#   -q | --quiet     QUIET=true
#   -v | --verbose   VERBOSE=true
#   --foo-bar=value  FOO_BAR=value
#   --no-foo-bar     FOO_BAR=false
#   --foo-bar        FOO_BAR=true
#   --               everything after it is appended to EXTRA_ARGS
#   anything else    appended to POSITIONAL_ARGS
# When KNOWN_ARGS is set, options whose name is not listed in it are also
# appended to UNKNOWN_ARGS.
# Globals: KNOWN_ARGS (read); HELP, QUIET, VERBOSE, OPTION_COUNT,
#          POSITIONAL_ARGS, EXTRA_ARGS, UNKNOWN_ARGS and one variable per
#          long option (written)
__parse_cli_args() {
  local arg opt_name known_name is_known var_name
  local -i stop_parsing=0

  for arg in "$@" ; do
    if [ "${arg}" == "--" ] ; then
      stop_parsing=1
      continue
    fi
    if (( stop_parsing )) ; then
      EXTRA_ARGS+=( "${arg}" )
      continue
    fi

    if [ -v KNOWN_ARGS ] && [[ "${arg}" =~ ^--?([^=]+)=?.* ]] ; then
      opt_name=${BASH_REMATCH[1]}
      is_known=false
      for known_name in "${KNOWN_ARGS[@]}" ; do
        if [ "${opt_name}" == "${known_name}" ] ; then
          is_known=true
          break
        fi
      done
      "${is_known}" || UNKNOWN_ARGS+=( "${arg}" )
    fi

    OPTION_COUNT+=1
    case "${arg}" in
      -h | --help)
        HELP=true
        break
        ;;
      # No `break` for the next two: options given after --quiet or --verbose
      # must still be parsed.
      -q | --quiet)
        QUIET=true
        ;;
      -v | --verbose)
        VERBOSE=true
        ;;
      --*=*)
        [[ ${arg} =~ --([^=]+)=(.*) ]]
        var_name=${BASH_REMATCH[1]//-/_}
        printf -v "${var_name^^}" '%s' "${BASH_REMATCH[2]}"
        ;;
      --no-*)
        [[ ${arg} =~ --no-(.+) ]]
        var_name=${BASH_REMATCH[1]//-/_}
        printf -v "${var_name^^}" '%s' false
        ;;
      --*)
        [[ ${arg} =~ --(.+) ]]
        var_name=${BASH_REMATCH[1]//-/_}
        printf -v "${var_name^^}" '%s' true
        ;;
      *)
        POSITIONAL_ARGS+=( "${arg}" )
        ;;
    esac
  done
}

# "$@" here is the sourcing script's argument list; it is only read, never
# shifted.
__parse_cli_args "$@"
|
||||
|
||||
|
||||
# Print "<message> Elapsed time: <duration>" right-justified in 80 columns,
# optionally framed by two lines of 80 "=" characters.
# The duration is H:MM:SS, prefixed with "N day(s), " when it is not under a
# day. Fractional parts of the timestamps are ignored.
# Arguments: [--no-separators]  omit the "=" lines
#            $1 - start timestamp (seconds since the epoch)
#            $2 - end timestamp (seconds since the epoch)
#            $3 - message (optional)
# Outputs:   the formatted line(s) on stdout
print_elapsed_time() {
  local print_seps=true
  if [ "${1:-}" == "--no-separators" ] ; then
    shift
    print_seps=false
  fi
  local START_TS=${1:?"Usage: print_elapsed_time [--no-separators] <start_timestamp> <end_timestamp> [message]"}
  local END_TS=${2:?"Usage: print_elapsed_time [--no-separators] <start_timestamp> <end_timestamp> [message]"}
  local message="${3:-}"

  # Whole seconds only: callers pass either $EPOCHSECONDS or `date +%s.%N`.
  local -i start_s=${START_TS%%.*}
  local -i end_s=${END_TS%%.*}
  local -i elapsed=$(( end_s - start_s ))

  # Floor division, so a negative interval is shown as "-1 day, 23:59:59".
  local -i days=$(( elapsed / 86400 ))
  local -i rem=$(( elapsed % 86400 ))
  if (( rem < 0 )) ; then
    days=$(( days - 1 ))
    rem=$(( rem + 86400 ))
  fi

  local duration
  printf -v duration '%d:%02d:%02d' \
    $(( rem / 3600 )) $(( rem % 3600 / 60 )) $(( rem % 60 ))
  if (( days != 0 )) ; then
    local plural="s"
    if (( days == 1 || days == -1 )) ; then
      plural=""
    fi
    duration="${days} day${plural}, ${duration}"
  fi

  local separator
  printf -v separator '%80s' ''
  separator=${separator// /=}

  if "${print_seps}" ; then
    printf '%s\n' "${separator}"
  fi
  printf '%80s\n' "${message} Elapsed time: ${duration}"
  if "${print_seps}" ; then
    printf '%s\n' "${separator}"
  fi
}
|
||||
|
||||
# Print a string centred within a field of the given width, by
# right-justifying it to (string length + width) / 2 columns.
# Arguments: $1 - string to print
#            $2 - total field width
# Outputs:   the padded string and a newline on stdout
justify_text() {
  local msg="${1:?Need a string}"
  local len="${2:?Need a length}"
  printf "%*s\n" $(( (${#msg} + len) / 2 )) "${msg}"
}
|
||||
|
||||
# Load "<name>:<count>" lines from a file into the caller's associative array.
# Lines that do not match that shape are ignored; entries already in the
# array but absent from the file are left untouched. A missing file is not an
# error.
# Arguments: $1 - name of the caller's associative array
#            $2 - path of the file to read
get_filecounts() {
  local -n _counts_ref=${1}
  local af=${2}
  local line
  [ -f "${af}" ] || return 0
  # `|| [ -n ... ]` also picks up a final line with no trailing newline.
  while IFS= read -r line || [ -n "${line}" ] ; do
    if [[ "${line}" =~ ^([^:]+):([0-9-]+)$ ]] ; then
      _counts_ref["${BASH_REMATCH[1]}"]=${BASH_REMATCH[2]}
    fi
  done < "${af}"
}
|
||||
|
||||
# Sum all values of the caller's associative array.
# Arguments: $1 - name of the caller's associative array (integer values)
# Outputs:   the total on stdout (0 for an empty array)
get_total_filecount() {
  local -n _counts_ref=${1}
  local -i total=0
  local count
  for count in "${_counts_ref[@]}" ; do
    # `total` has the integer attribute, so += adds arithmetically.
    total+=${count}
  done
  echo "${total}"
}
|
||||
|
||||
# Replace a file with one "<name>:<count>" line per entry of the caller's
# associative array (entry order is unspecified).
# Arguments: $1 - name of the caller's associative array
#            $2 - path of the file to write
write_filecounts() {
  local -n _counts_ref=${1}
  local af=${2}
  local key
  rm -rf "${af}" || :
  # A single redirection for the whole loop opens the file once.
  for key in "${!_counts_ref[@]}" ; do
    printf '%s:%s\n' "${key}" "${_counts_ref[${key}]}"
  done > "${af}"
}
|
||||
18
cli/system_summary
Executable file
18
cli/system_summary
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
# Print a two-line hardware summary: CPU model, core count and system memory,
# then GPU name, CUDA core count and GPU memory as reported by the sibling
# `cudainfo` script (or "GPU: N/A" when it reports no GPU name).
PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

CUDA_INFO=$("${PROGDIR}/cudainfo")
CUDA_CORES=$(sed -n -r -e "s/\s*Total\s+CUDA\s+Cores:\s+([0-9]+)$/\1/gp" <<<"${CUDA_INFO}")
GPU_NAME="$(sed -n -r -e 's/\s*GPU\s+Name:\s+(.+)$/\1/gp' <<<"${CUDA_INFO}")"
GPU_MEMORY="$(sed -n -r -e 's/\s*Total\s+Memory:\s*([0-9.]+).*/\1/gp' <<<"${CUDA_INFO}")"
CPU_NAME="$(sed -n -r -e 's/model\s+name\s*:\s*(.+)$/\1/gp' /proc/cpuinfo | head -1)"
CPU_CORES="$(nproc)"
SYS_MEMORY="$(free -m | sed -n -r -e 's/Mem:\s+([0-9.]+)\s+.*/\1/gp')"

printf "CPU: %s (%d cores) Memory: %s mb\n" "${CPU_NAME}" "${CPU_CORES}" "${SYS_MEMORY}"
if [ -z "${GPU_NAME}" ] ; then
  printf "GPU: N/A\n"
else
  # cudainfo prints "unknown" for an unlisted compute capability, which the
  # numeric pattern above does not capture. Show that instead of letting %d
  # turn the empty string into a misleading "0 cores".
  printf "GPU: %s (%s cores) Memory: %s mb\n" "${GPU_NAME}" "${CUDA_CORES:-unknown}" "${GPU_MEMORY}"
fi
|
||||
BIN
cli/tensorboard1.png
Normal file
BIN
cli/tensorboard1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 20 KiB |
BIN
cli/tensorboard2.png
Normal file
BIN
cli/tensorboard2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
BIN
cli/tensorboard3.png
Normal file
BIN
cli/tensorboard3.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 43 KiB |
129
cli/test_python
Executable file
129
cli/test_python
Executable file
@@ -0,0 +1,129 @@
|
||||
#!/bin/bash
#
# Smoke test for the training Python environment.
# Activates the virtual environment under ${DATA_DIR} and reports whether
# CUDA, Tensorflow, Torch, onnxruntime, micro-wake-word and
# piper-sample-generator are usable. Each Python probe runs in its own
# interpreter with stderr discarded so that only the summary lines show.

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")
# Both are assigned before shell.functions is sourced. TRAINING_STEPS is not
# referenced again in this script; presumably shell.functions reads it.
# TODO confirm.
TRAINING_STEPS=40000
DATA_DIR=/data
source "${PROGDIR}/shell.functions"

# Keep going when activation fails so every probe still reports its status,
# but say so: the probes then run against whatever "python" is on PATH.
# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate" \
  || echo "WARNING: could not activate ${DATA_DIR}/.venv" >&2

export TF_CPP_MIN_LOG_LEVEL=9
export GLOG_minloglevel=2
export GRPC_VERBOSITY="ERROR"

echo -e "\n===== Testing Python Environment =====\n"

echo -e "\n===== Testing Cuda =====\n"
"${PROGDIR}/cudainfo"

# The heredoc delimiters are quoted: the Python sources are literal text and
# must never be subject to shell expansion.
python - 2>/dev/null <<'EOF'
print("\n===== Testing Tensorflow =====\n")
try:
    # Imported only to prove the package is installed.
    from ai_edge_litert.interpreter import Interpreter
    import tensorflow as tf

    try:
        with tf.device("/GPU:0"):
            a = tf.random.normal([10000, 10000])
            b = tf.random.normal([10000, 10000])
            c = tf.matmul(a, b)
        # The result's device string tells where the matmul actually ran.
        if c.device.find("GPU") >= 0:
            result = "Available - " + c.device
        else:
            result = "Not available"
    except Exception:
        result = "Not available"

    print("GPU:", result)

    try:
        with tf.device("/CPU:0"):
            a = tf.random.normal([10000, 10000])
            b = tf.random.normal([10000, 10000])
            c = tf.matmul(a, b)
        result = "Available - " + c.device
    except Exception:
        result = "Not available"

    print("CPU:", result)
except Exception:
    print("Tensorflow not available")
EOF

python - 2>/dev/null <<'EOF'
print("\n===== Testing Torch =====\n")

try:
    import torch

    if torch.cuda.is_available():
        print(f"GPU: Available - {torch.cuda.get_device_name(0)}")
    else:
        print("GPU:", "Not available")
    print("CPU:", "Available")
except Exception:
    print("Torch not available")
EOF

python - 2>/dev/null <<'EOF'
print("\n===== Testing onnxruntime =====\n")

try:
    import onnxruntime as ort

    providers = ort.get_available_providers()
    if 'CUDAExecutionProvider' in providers:
        print("GPU:", "Available")
    else:
        print("GPU:", "Not available")

    if 'CPUExecutionProvider' in providers:
        print("CPU:", "Available")
    else:
        print("CPU:", "Not available")

    if 'TensorrtExecutionProvider' in providers:
        print("TensorRT:", "Available")
    else:
        print("TensorRT:", "Not available")
except Exception:
    print("onnxruntime not available")
EOF

python - 2>/dev/null <<'EOF'
print("\n===== Testing micro-wake-word =====\n")

try:
    # The same imports the augmentation step relies on.
    import numpy as np
    import librosa
    from mmap_ninja.ragged import RaggedMmap
    from microwakeword.audio.augmentation import Augmentation
    from microwakeword.audio.clips import Clips
    from microwakeword.audio.spectrograms import SpectrogramGeneration
    from microwakeword.audio.audio_utils import save_clip

    print("micro-wake-word available")
except Exception:
    print("micro-wake-word not available")

print("")
EOF

echo -e "===== Testing piper-sample-generator =====\n"

# Absolute path under DATA_DIR, so the result does not depend on the
# directory this script happens to be started from.
PSG_SCRIPT="${DATA_DIR}/tools/piper-sample-generator/generate_samples.py"
if "${PSG_SCRIPT}" --help &>/dev/null ; then
  echo "piper-sample-generator available"
else
  echo "piper-sample-generator not available"
fi

echo
echo -e "\n===== Python Environment Testing Complete =====\n"
|
||||
125
cli/train_wake_word
Executable file
125
cli/train_wake_word
Executable file
@@ -0,0 +1,125 @@
|
||||
#!/bin/bash
#
# High-level driver: generate wake word samples, augment them, train the
# model, then print a timing summary. Option parsing is delegated to
# shell.functions, which is sourced below and is expected to provide
# POSITIONAL_ARGS, UNKNOWN_ARGS, HELP, DATA_DIR, SAMPLES, BATCH_SIZE,
# TRAINING_STEPS, CLEANUP_WORK_DIR, the DEFAULT_* values and
# print_elapsed_time (none of them are defined in this script).
set -e

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir )
source "${PROGDIR}/shell.functions"
WAKE_WORD=${POSITIONAL_ARGS[0]}

if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
  HELP=true
fi

if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then
  cat <<EOF >&2
Usage: train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
                       [ --training-steps=<steps> ] [ --cleanup-work-dir ]
                       <wake_word> [ <wake_word_title> ]

Options:
  --samples:           The number of samples to generate for the wake word.
                       Default: ${DEFAULT_SAMPLES}

  --batch-size:        How many samples should be generated at a time. The more
                       samples per batch, the more memory is needed.
                       Default: ${DEFAULT_BATCH_SIZE}

  --training-steps:    Number of training steps. More training steps means better
                       detection and false positive rates but also more time to train.
                       Default: ${DEFAULT_TRAINING_STEPS}

  --cleanup-work-dir:  Delete the /data/work directory after successful training.
                       Default: false

  <wake_word>          The word to train spelled phonetically.
                       Required.

  <wake_word_title>    An optional pretty name to save to the json metadata file.
                       Default: The wake word with individual words capitalized
                       and punctuation removed.

EOF
  exit 1
fi

# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"

cd "${DATA_DIR}"
mkdir -p "${DATA_DIR}/work" || :

# Count the array ELEMENTS: ${#POSITIONAL_ARGS} without [@] is the character
# length of the first element, which made the title argument be ignored.
if [ "${#POSITIONAL_ARGS[@]}" -ge 2 ] ; then
  WAKE_WORD_TITLE="${POSITIONAL_ARGS[1]}"
fi

if [ ! -v WAKE_WORD_TITLE ] ; then
  # No title given: turn every non-alphanumeric character into a separator
  # and capitalize the first letter of each resulting word.
  read -r -a WWNA <<<"${WAKE_WORD//[^a-zA-Z0-9]/ }"
  WAKE_WORD_TITLE="${WWNA[*]^}"
elif [ -z "$WAKE_WORD_TITLE" ] ; then
  WAKE_WORD_TITLE="$WAKE_WORD"
fi

printf "%-80s\n" "=" | tr ' ' "="
echo "===== Running '${WAKE_WORD}(${WAKE_WORD_TITLE})' generation, augmentation and training ====="
"${PROGDIR}/cudainfo"
echo
START_TS=$EPOCHSECONDS

export TF_CPP_MIN_LOG_LEVEL=9
export TF_FORCE_GPU_ALLOW_GROWTH=true
export TF_GPU_ALLOCATOR=cuda_malloc_async
export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
export NVIDIA_TF32_OVERRIDE=1
export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
export GLOG_minloglevel=2
export GRPC_VERBOSITY=ERROR

"${PROGDIR}/wake_word_sample_generator" \
  --samples="${SAMPLES}" \
  --batch-size="${BATCH_SIZE}" \
  --data-dir="${DATA_DIR}" "${WAKE_WORD}"

POST_GEN_TS=$EPOCHSECONDS

AUGMENT=false
GENERATED_DIR="${DATA_DIR}/work/wake_word_samples"
AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented"

# Augment when there is no augmented output yet, or when the first generated
# sample is newer than the augmented testing data.
[ -d "${AUGMENTED_DIR}" ] || AUGMENT=true
if [ "${GENERATED_DIR}/0.wav" -nt "${AUGMENTED_DIR}/testing/wakeword_mmap/data.ninja" ] ; then
  AUGMENT=true
fi

if [ "${AUGMENT}" == "true" ] ; then
  rm -rf "${AUGMENTED_DIR}" || :
  mkdir -p "${AUGMENTED_DIR}" || :
  # Do not leave a half-written augmented directory behind on failure; it
  # would make the next run believe augmentation is already done.
  "${PROGDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" \
    || { rm -rf "${AUGMENTED_DIR}" ; exit 1 ; }
else
  echo "Augmentation not required"
  echo
fi

POST_AUGMENT_TS=$EPOCHSECONDS

"${PROGDIR}/wake_word_sample_trainer" --samples="${SAMPLES}" \
  --training-steps="${TRAINING_STEPS}" --data-dir="${DATA_DIR}" \
  "${WAKE_WORD}" "${WAKE_WORD_TITLE}"

# Compare against the literal "true" instead of executing the variable: an
# unset or empty value would expand to an empty command, which succeeds and
# would delete the work files unasked.
if [ "${CLEANUP_WORK_DIR:-false}" == "true" ] ; then
  rm -rf "${DATA_DIR}/work/trained_models" "${DATA_DIR}/work/wake_word_samples" \
    "${DATA_DIR}/work/wake_word_samples_augmented" "${DATA_DIR}/work/last_wake_word" || :
fi
END_TS=$EPOCHSECONDS

python -c $'print(f"{\'=\' * 80}")'
printf "%44s\n\n" "Training Summary"
"${PROGDIR}/system_summary"
echo
print_elapsed_time --no-separators "${START_TS}" "${POST_GEN_TS}" "Generate ${SAMPLES} samples, ${BATCH_SIZE}/batch"
print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augment ${SAMPLES} samples"
print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps"
python -c $'msg="="*54 ; print(f"{msg:>80s}")'
print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total"
python -c $'print(f"{\'=\' * 80}")'
|
||||
215
cli/wake_word_sample_augmenter
Executable file
215
cli/wake_word_sample_augmenter
Executable file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python
"""Augment generated wake word samples and write spectrogram feature sets.

Reads the ``*.wav`` files of the input directory, applies the configured
augmentations (room impulse responses, background noise, EQ, ...) and writes
one RaggedMmap per split to ``<output-dir>/{training,validation,testing}/wakeword_mmap``.
"""

import sys, os, gc, glob, random
import types, shutil, json
from datetime import datetime, timezone
from pathlib import Path
from argparse import ArgumentParser as ArgParser, ArgumentError

default_data_dir = os.getcwd() if os.path.exists(".mww-data-dir") else "/data"

parser = ArgParser(exit_on_error=False)
parser.add_argument("--data-dir", type=str, help=f"Data directory. Default: {default_data_dir}", required=False, default=default_data_dir)
parser.add_argument("--input-dir", type=str, help="Sample input directory. Default: <data-dir>/work/wake_word_samples", required=False)
parser.add_argument("--output-dir", type=str, help="Sample output directory. Default: <input-dir>_augmented", required=False)
parser.add_argument("--mit-rirs-16k-dir", type=str, help="MIT RIR input directory. Default: <data-dir>/training_datasets/mit_rirs_16k", required=False)
parser.add_argument("--fma-16k-dir", type=str, help="FMA input directory. Default: <data-dir>/training_datasets/fma_16k", required=False)
parser.add_argument("--audioset-16k-dir", type=str, help="Audioset input directory. Default: <data-dir>/training_datasets/audioset_16k", required=False)

try:
    args = parser.parse_args()
except ArgumentError:
    parser.print_help()
    sys.exit(1)


def _resolve_dir(value, default):
    """Return the real path of an explicitly given directory, else the default as-is."""
    return os.path.realpath(value) if value else default


args.data_dir = os.path.realpath(args.data_dir)
work_dir = args.data_dir + "/work"

args.input_dir = _resolve_dir(args.input_dir, work_dir + "/wake_word_samples")
# The output default is derived from the (already resolved) input directory.
args.output_dir = _resolve_dir(args.output_dir, args.input_dir + "_augmented")
args.mit_rirs_16k_dir = _resolve_dir(args.mit_rirs_16k_dir, args.data_dir + "/training_datasets/mit_rirs_16k")
args.fma_16k_dir = _resolve_dir(args.fma_16k_dir, args.data_dir + "/training_datasets/fma_16k")
args.audioset_16k_dir = _resolve_dir(args.audioset_16k_dir, args.data_dir + "/training_datasets/audioset_16k")


def validate_directories(paths):
    """Return True when every path exists; report the first missing one on stderr."""
    for path in paths:
        if not os.path.exists(path):
            print(f"Error: Directory {path} does not exist. Please ensure preprocessing is complete.", file=sys.stderr)
            return False
    return True


# Check the inputs BEFORE touching the output directory, so that a missing
# work directory produces the friendly message instead of a mkdir traceback.
paths = [ work_dir, args.input_dir, args.mit_rirs_16k_dir, args.fma_16k_dir, args.audioset_16k_dir ]
if not validate_directories(paths):
    parser.print_help()
    sys.exit(1)

out_path = Path(args.output_dir)
out_path.mkdir(parents=True, exist_ok=True)

# Sorted once; the split below depends on a stable file order.
files = sorted(glob.glob(args.input_dir + "/*.wav"))
if not files:
    raise RuntimeError("❌ No WAVs in wake_word_samples.")
max_samples = len(files)

print(f"\n===== Augmenting {max_samples} wake word samples =====")

print(" Initializing libraries")

# These must be in the environment before tensorflow is imported below.
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
os.environ["TF_XLA_FLAGS"]="--tf_xla_auto_jit=0"
os.environ["NVIDIA_TF32_OVERRIDE"]="1"
os.environ["TF_CUDNN_WORKSPACE_LIMIT_IN_MB"]="512"
os.environ["GLOG_minloglevel"]="9"
os.environ["GRPC_VERBOSITY"]="ERROR"

print(" Loading Tensorflow")
import tensorflow as tf

print(" GPU memory config")
# Per-device memory growth (belt + suspenders)
for g in tf.config.list_physical_devices("GPU"):
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception:
        # Best effort: the environment variable above requests the same thing.
        pass
print(f" GPUs: {tf.config.list_physical_devices('GPU')}")
gc.collect()

import numpy as np
import librosa
from mmap_ninja.ragged import RaggedMmap
from microwakeword.audio.augmentation import Augmentation
from microwakeword.audio.clips import Clips
from microwakeword.audio.spectrograms import SpectrogramGeneration
from microwakeword.audio.audio_utils import save_clip

START_TIME = datetime.now(timezone.utc).replace(microsecond=0)

# Paths to augmented data
impulse_paths = [ args.mit_rirs_16k_dir ]
background_paths = [ args.fma_16k_dir, args.audioset_16k_dir]

clips = Clips(
    input_directory=args.input_dir,
    file_pattern='*.wav',
    max_clip_duration_s=5,
    remove_silence=True,
    random_split_seed=10,
    split_count=0.1,
)

augmenter = Augmentation(
    augmentation_duration_s=3.2,
    augmentation_probabilities={
        "SevenBandParametricEQ": 0.1,
        "TanhDistortion": 0.05,
        "PitchShift": 0.15,
        "BandStopFilter": 0.1,
        "AddColorNoise": 0.1,
        "AddBackgroundNoise": 0.7,
        "Gain": 0.8,
        "RIR": 0.7,
    },
    impulse_paths=impulse_paths,
    background_paths=background_paths,
    background_min_snr_db=5,
    background_max_snr_db=10,
    min_jitter_s=0.2,
    max_jitter_s=0.3,
)

# Augment samples and save the training, validation, and testing sets.

def audio_generator_from_wavs(self, split="train", repeat=1):
    """
    Yield 1-D float32 arrays loaded via librosa from input_dir/*.wav.

    The sorted file list is shuffled with a fixed seed (10) and cut into
    train / validation / test, where validation and test each get 10% of the
    files (at least one) and train gets the remainder. `split` selects which
    part to yield ("train", "validation" or "test"; anything else yields
    nothing) and `repeat` is the number of passes over it (at least one).
    """
    rng = random.Random(10)  # deterministic shuffling like Clips(random_split_seed=10)
    files_shuf = files[:]
    rng.shuffle(files_shuf)

    n = len(files_shuf)
    n_val = max(1, int(0.10 * n))
    n_test = max(1, int(0.10 * n))
    n_train = max(0, n - n_val - n_test)
    splits = {
        "train": files_shuf[:n_train],
        "validation": files_shuf[n_train:n_train + n_val],
        "test": files_shuf[n_train + n_val:],
    }
    file_list = splits.get(split, [])
    if not file_list:
        return  # nothing to yield

    for _ in range(max(1, int(repeat))):
        for p in file_list:
            # Resample to 16 kHz mono; the returned sample rate is not needed.
            y, _sr = librosa.load(p, sr=16000, mono=True)
            yield y.astype(np.float32, copy=False)

# Bind the patched generator to the existing `clips` instance
clips.audio_generator = types.MethodType(audio_generator_from_wavs, clips)

# ---- Split config ----
# Key: output sub-directory. "name": split passed to the generator.
split_cfg = {
    "training": {"name": "train", "repetition": 2, "slide_frames": 10},
    "validation": {"name": "validation", "repetition": 1, "slide_frames": 10},
    "testing": {"name": "test", "repetition": 1, "slide_frames": 1},
}

# ---- Generate features ----
for split, cfg in split_cfg.items():
    out_dir = out_path / split
    out_dir.mkdir(parents=True, exist_ok=True)
    print(f" Augmenting {split}")

    print(" Generating spectrograms")
    spectros = SpectrogramGeneration(
        clips=clips,          # backed by the WAV loader bound above
        augmenter=augmenter,
        slide_frames=cfg["slide_frames"],
        step_ms=10,
    )

    print(" Generating files")
    RaggedMmap.from_generator(
        out_dir=str(out_dir / "wakeword_mmap"),
        sample_generator=spectros.spectrogram_generator(
            split=cfg["name"], repeat=cfg["repetition"]
        ),
        batch_size=100,
        verbose=False,
    )
    print(f" {split} augmentation complete")

END_TIME = datetime.now(timezone.utc).replace(microsecond=0)
et = END_TIME - START_TIME
print(f"\n{'=' * 80}")
msg=f"Augmented {max_samples} wake word samples."
print(f"{msg:>50s} Elapsed time: {et!s}")
print(f"{'=' * 80}\n")
|
||||
112
cli/wake_word_sample_generator
Executable file
112
cli/wake_word_sample_generator
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/bin/bash
#
# Generate wake word audio samples with piper-sample-generator.
# With --samples=1 a single test sample is written to
# <data-dir>/work/test_sample; otherwise the samples go to
# <data-dir>/work/wake_word_samples and are only regenerated when the wake
# word, sample count or model differ from the previous run.
# Option parsing is delegated to shell.functions, which is expected to
# provide POSITIONAL_ARGS, UNKNOWN_ARGS, HELP, DATA_DIR, SAMPLES, BATCH_SIZE,
# the DEFAULT_* values and print_elapsed_time.
set -e

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

KNOWN_ARGS=( samples batch-size data-dir )
source "${PROGDIR}/shell.functions"
WAKE_WORD="${POSITIONAL_ARGS[0]}"

if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
  HELP=true
fi

if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then
  cat <<EOF >&2
Usage: $0 [ --samples=<samples> ] [ --batch-size=<batch_size> ] <wake_word>

  --samples:     The number of samples to generate for the wake word.
                 Default: ${DEFAULT_SAMPLES}

  --batch-size:  How many samples should be generated at a time. The more
                 samples, the more memory is needed.
                 Default: ${DEFAULT_BATCH_SIZE}

  <wake_word>    The word to generate samples for.
                 Required.

EOF
  exit 1
fi

# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"

WORK_DIR="${DATA_DIR}/work"
mkdir -p "${WORK_DIR}" || :
cd "${WORK_DIR}"

PSG="${DATA_DIR}/tools/piper-sample-generator"
MODELS_DIR="${PSG}/models"
MODEL_NAME=en_US-libritts_r-medium.pt
MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}"
SAMPLES_DIR="${WORK_DIR}/wake_word_samples"

mkdir -p "${SAMPLES_DIR}" || :

#######################################
# Run piper-sample-generator for WAKE_WORD and tidy up its log prefixes.
# Globals:   PSG, WAKE_WORD, MODEL_FILE, SAMPLES, BATCH_SIZE (read)
# Arguments: $1 - output directory; remaining arguments are passed through
# Returns:   the exit status of generate_samples.py (not of the sed filter)
#######################################
run_sample_generator() {
  local out_dir=$1
  shift
  local status
  "${PSG}/generate_samples.py" "${WAKE_WORD}" \
    --model "${MODEL_FILE}" \
    --max-samples "${SAMPLES}" \
    --batch-size "${BATCH_SIZE}" \
    --output-dir "${out_dir}" "$@" 2>&1 \
    | sed -r -e "s/(DEBUG|INFO):__main__:/ /g"
  # Without this the pipeline reports only sed's status and a failed
  # generator run would go unnoticed.
  status=${PIPESTATUS[0]}
  return "${status}"
}

REGENERATE=false

if [ "${SAMPLES}" -eq 1 ] ; then
  echo "===== Generating ${SAMPLES} sample of '${WAKE_WORD}' ====="
  wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"

  mkdir -p "${WORK_DIR}/test_sample" || :
  run_sample_generator "${WORK_DIR}/test_sample" --max-speakers 100 \
    || { echo "ERROR: sample generation failed" >&2 ; exit 1 ; }
  mv "${WORK_DIR}/test_sample/0.wav" "${WORK_DIR}/test_sample/${wake_word_filename}.wav"
  echo "Sample available at ${WORK_DIR}/test_sample/${wake_word_filename}.wav"
  echo "Play it from your host."
  exit 0
fi

# Compare the whole recorded line as a fixed string: the wake word is user
# text and must neither be interpreted as a regular expression nor be allowed
# to match as a mere substring of a previous run's record.
grep -qxF -- "${WAKE_WORD}:${SAMPLES}:${MODEL_NAME}" "${WORK_DIR}/last_wake_word" 2>/dev/null \
  || REGENERATE=true

# Double check that the number of existing samples matches SAMPLES
existing_samples=$(find "${SAMPLES_DIR}" -name '*.wav' | wc -l)
[ "${existing_samples}" -eq "${SAMPLES}" ] || REGENERATE=true

START_TS=$EPOCHSECONDS

if [ "${REGENERATE}" != "true" ] ; then
  echo "Sample generation not required"
  echo
  exit 0
fi

echo -e "\n===== Generating ${SAMPLES} wake word samples in batches of ${BATCH_SIZE} ====="
export TF_CPP_MIN_LOG_LEVEL=9
export TF_FORCE_GPU_ALLOW_GROWTH=true
export TF_GPU_ALLOCATOR=cuda_malloc_async
export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
export NVIDIA_TF32_OVERRIDE=1
export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
export GLOG_minloglevel=2
export GRPC_VERBOSITY=ERROR

echo " Generating samples"
rm -rf "${SAMPLES_DIR}" || :
mkdir -p "${SAMPLES_DIR}" || :
run_sample_generator "${SAMPLES_DIR}" \
  || { echo "ERROR: sample generation failed" >&2 ; exit 1 ; }

generated_files=$(find "${SAMPLES_DIR}" -name '*.wav' | wc -l)
if [ "${generated_files}" -ne "${SAMPLES}" ] ; then
  echo "ERROR: only generated ${generated_files} files" >&2
  exit 1
fi
# Record what was generated so an identical request can be skipped next time.
echo "${WAKE_WORD}:${SAMPLES}:${MODEL_NAME}" > "${WORK_DIR}/last_wake_word"
echo
END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Generated ${SAMPLES} wake word samples."

exit 0
|
||||
241
cli/wake_word_sample_trainer
Executable file
241
cli/wake_word_sample_trainer
Executable file
@@ -0,0 +1,241 @@
|
||||
#!/bin/bash
#
# Train a microWakeWord model from the augmented samples and the negative
# datasets, then copy the quantized streaming tflite model, its logs and a
# JSON metadata file to a timestamped directory under <data-dir>/output.
# Option parsing is delegated to shell.functions, which is expected to
# provide POSITIONAL_ARGS, UNKNOWN_ARGS, HELP, DATA_DIR, SAMPLES,
# TRAINING_STEPS, DEFAULT_TRAINING_STEPS and print_elapsed_time.
set -e

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

KNOWN_ARGS=( training-steps samples data-dir )
source "${PROGDIR}/shell.functions"
WAKE_WORD="${POSITIONAL_ARGS[0]}"

if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
  HELP=true
fi

if [ "${HELP}" == "true" ] || [ -z "${WAKE_WORD}" ] ; then
  cat <<EOF >&2
Usage: $0 [ --samples=<samples> ] [ --training-steps=<steps> ]
          <wake_word> [ <wake_word_title> ]

       $0 -h/--help

  --samples:           The number of samples to generate for the wake word.
                       Used only to generate output file names.

  --training-steps:    Number of training steps.
                       Default: ${DEFAULT_TRAINING_STEPS}

  <wake_word>:         The word to train spelled phonetically.
                       Required.

  <wake_word_title>:   A pretty name to save to the json metadata file.
                       Default: The wake word with individual words capitalized.

EOF
  exit 1
fi

WORK_DIR="${DATA_DIR}/work"
TRAINING_DS="${DATA_DIR}/training_datasets"

# Count the array ELEMENTS: ${#POSITIONAL_ARGS} without [@] is the character
# length of the first element, which made the title argument be ignored.
if [ "${#POSITIONAL_ARGS[@]}" -ge 2 ] ; then
  WAKE_WORD_TITLE="${POSITIONAL_ARGS[1]}"
fi

if [ ! -v WAKE_WORD_TITLE ] ; then
  # No title given: turn every non-alphanumeric character into a separator
  # and capitalize the first letter of each resulting word.
  read -r -a WWNA <<<"${WAKE_WORD//[^a-zA-Z0-9]/ }"
  WAKE_WORD_TITLE="${WWNA[*]^}"
elif [ -z "$WAKE_WORD_TITLE" ] ; then
  WAKE_WORD_TITLE="$WAKE_WORD"
fi

# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"

#######################################
# Abort the script unless every argument is an existing directory.
# Arguments: directories to check
#######################################
check_directories() {
  local d
  for d in "$@" ; do
    [ -d "$d" ] || { echo "ERROR: Directory $d not found" >&2 ; exit 1 ; }
  done
}

check_directories "${WORK_DIR}/wake_word_samples_augmented" \
  "${TRAINING_DS}/negative_datasets/"{speech,dinner_party,no_speech,dinner_party_eval}

cd "${WORK_DIR}"

echo "===== Starting ${TRAINING_STEPS} training steps ====="

START_TS=$EPOCHSECONDS

mkdir -p "${WORK_DIR}/trained_models" || :
# Unquoted delimiter on purpose: the directory names and the number of
# training steps are expanded into the configuration.
cat <<EOF >"${WORK_DIR}/trained_models/training_parameters.yaml"
batch_size: 16
clip_duration_ms: 1500
eval_step_interval: 500
features:
- features_dir: ${WORK_DIR}/wake_word_samples_augmented
  penalty_weight: 1.0
  sampling_weight: 2.0
  truncation_strategy: truncate_start
  truth: true
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/speech
  penalty_weight: 1.0
  sampling_weight: 12.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party
  penalty_weight: 1.0
  sampling_weight: 12.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/no_speech
  penalty_weight: 1.0
  sampling_weight: 5.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party_eval
  penalty_weight: 1.0
  sampling_weight: 0.0
  truncation_strategy: split
  truth: false
  type: mmap
freq_mask_count:
- 0
freq_mask_max_size:
- 0
learning_rates:
- 0.001
maximization_metric: average_viable_recall
minimization_metric: null
negative_class_weight:
- 20
positive_class_weight:
- 1
target_minimization: 0.9
time_mask_count:
- 0
time_mask_max_size:
- 0
train_dir: ${WORK_DIR}/trained_models/wakeword
training_steps:
- ${TRAINING_STEPS}
window_step_ms: 10

EOF

echo " Wrote training_parameters.yaml"
# Always start from an empty training directory.
rm -rf "${WORK_DIR}/trained_models/wakeword"

export TF_CPP_MIN_LOG_LEVEL=9
export TF_FORCE_GPU_ALLOW_GROWTH=true
export TF_GPU_ALLOCATOR=cuda_malloc_async
export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
export NVIDIA_TF32_OVERRIDE=1
export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
export GLOG_minloglevel=9
export GRPC_VERBOSITY=ERROR

echo " Loading Tensorflow"

wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
OUTPUT_DIR="${DATA_DIR}/output/$(date +'%Y-%m-%d-%H-%M-%S')-${wake_word_filename}-${SAMPLES}-${TRAINING_STEPS}"
mkdir -p "${OUTPUT_DIR}/logs" || :

# Arguments for microwakeword.model_train_eval; everything after "mixednet"
# configures that model architecture.
train_args=(
  --training_config="${WORK_DIR}/trained_models/training_parameters.yaml"
  --train 1
  --restore_checkpoint 1
  --test_tf_nonstreaming 0
  --test_tflite_nonstreaming 0
  --test_tflite_nonstreaming_quantized 0
  --test_tflite_streaming 0
  --test_tflite_streaming_quantized 1
  --use_weights "best_weights"
  mixednet
  --pointwise_filters "64,64,64,64"
  --repeat_in_block "1,1,1,1"
  --mixconv_kernel_sizes "[5], [7,11], [9,15], [23]"
  --residual_connection "0,0,0,0"
  --first_conv_filters 32
  --first_conv_kernel_size 5
  --stride 2
)

# Output handling: carriage-return progress updates become separate lines,
# "Validation Batch" lines are dropped, the remainder goes to training.log,
# and only a trimmed selection of the INFO:absl lines reaches the console.
# The heredoc delimiter is quoted because the Python source is literal text.
python - "${train_args[@]}" <<'EOF' 2>&1 \
  | tr '\r' '\n' \
  | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" \
  | tee "${OUTPUT_DIR}/logs/training.log" \
  | sed -r -e '/^INFO:absl:/!d' \
      -e "/None|Sharding|unsupported characters|AUC|fingerprint/d" \
      -e 's/INFO:absl:/ /g' \
      -e "s/, (recall =|estimated false|average viable recall)/,\n \1/g"

import sys, os, gc
import runpy
import yaml
print(" Loading Tensorflow")
import tensorflow as tf

print(" GPU memory config")
# Per-device memory growth (belt + suspenders)
for g in tf.config.list_physical_devices("GPU"):
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception:
        pass
# Prefixed with INFO:absl: so that the console filter lets the line through.
print(f"INFO:absl:GPUs: {tf.config.list_physical_devices('GPU')}")
gc.collect()

print()
try:
    runpy.run_module("microwakeword.model_train_eval", run_name="__main__", alter_sys=True)
except Exception as e:
    print(e, file=sys.stderr)
    sys.exit(1)
EOF

source_path="${WORK_DIR}/trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite"

# The pipeline above reports only the status of its last stage, so the
# presence of the model file is the actual success check.
if [ ! -f "${source_path}" ] ; then
  echo "Output model not found! Training didn't complete successfully. See ${OUTPUT_DIR}/logs/training.log" >&2
  exit 1
fi

cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/"
cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/"
cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/"

echo -e "\n Training complete!"
echo " Full log: ${OUTPUT_DIR}/logs/training.log"

tflite_filename="${wake_word_filename}.tflite"
tflite_path="${OUTPUT_DIR}/${tflite_filename}"

cp "${source_path}" "${tflite_path}"

# --- Write JSON metadata file with matching model name ---
# Escape backslashes and double quotes so that a title containing them still
# produces valid JSON.
json_title=${WAKE_WORD_TITLE//\\/\\\\}
json_title=${json_title//\"/\\\"}
json_path="${OUTPUT_DIR}/${wake_word_filename}.json"
cat <<EOF > "${json_path}"
{
  "type": "micro",
  "wake_word": "${json_title}",
  "author": "Tater Totterson",
  "website": "https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git",
  "model": "${tflite_filename}",
  "trained_languages": ["en"],
  "version": 2,
  "micro": {
    "probability_cutoff": 0.97,
    "sliding_window_size": 5,
    "feature_step_size": 10,
    "tensor_arena_size": 30000,
    "minimum_esphome_version": "2024.7.0"
  }
}
EOF

echo "Name: ${WAKE_WORD_TITLE}"
echo "Model: ${tflite_path}"
echo "Metadata: ${json_path}"
echo
END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
echo
|
||||
|
||||
Reference in New Issue
Block a user