mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
Merge pull request #18 from gtjoseph/main-cli
Train from the command line
This commit is contained in:
135
cli/.bashrc
Normal file
135
cli/.bashrc
Normal file
@@ -0,0 +1,135 @@
|
||||
# ~/.bashrc: read by bash(1) when a non-login shell starts.
# Example startup files are in /usr/share/doc/bash/examples/startup-files
# (shipped in the bash-doc package).

# Nothing below applies to a non-interactive shell, so stop right away.
if [[ -z "$PS1" ]]; then
    return
fi

# History: drop consecutive duplicates and commands that start with a space
# (see bash(1) for the other HISTCONTROL options).
HISTCONTROL=ignoredups:ignorespace

# Append to the history file rather than overwriting it.
shopt -s histappend

# History length; see HISTSIZE and HISTFILESIZE in bash(1).
HISTSIZE=1000
HISTFILESIZE=2000

# Re-check the window size after every command and refresh LINES and COLUMNS
# when it has changed.
shopt -s checkwinsize

# Let less cope with non-text input files; see lesspipe(1).
if [[ -x /usr/bin/lesspipe ]]; then
    eval "$(SHELL=/bin/sh lesspipe)"
fi

# Remember which chroot we are working in (shown in the prompt below).
if [[ -z "$debian_chroot" && -r /etc/debian_chroot ]]; then
    debian_chroot=$(< /etc/debian_chroot)
fi

# Prompt: plain by default, colored only when we know color is wanted.
if [[ "$TERM" == "xterm-color" ]]; then
    color_prompt=yes
fi

# Uncomment to force a colored prompt when the terminal supports it. It is
# off by default so the prompt does not distract from command output.
#force_color_prompt=yes

if [[ -n "$force_color_prompt" ]]; then
    if [[ -x /usr/bin/tput ]] && tput setaf 1 > /dev/null 2>&1; then
        # Color is supported; assume Ecma-48 (ISO/IEC-6429) compliance.
        # Terminals lacking it are extremely rare and would tend to
        # support setf rather than setaf.
        color_prompt=yes
    else
        color_prompt=
    fi
fi

if [[ "$color_prompt" == "yes" ]]; then
    PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
else
    PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
fi
unset color_prompt force_color_prompt

# On xterm and rxvt style terminals, also put user@host:dir in the title.
if [[ "$TERM" == xterm* || "$TERM" == rxvt* ]]; then
    PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
fi

# Colorized ls and grep, plus a few handy aliases.
if [[ -x /usr/bin/dircolors ]]; then
    # Prefer the user's ~/.dircolors; fall back to the built-in database.
    [[ -r ~/.dircolors ]] && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
    alias ls='ls --color=auto'
    #alias dir='dir --color=auto'
    #alias vdir='vdir --color=auto'

    alias grep='grep --color=auto'
    alias fgrep='fgrep --color=auto'
    alias egrep='egrep --color=auto'
fi

# A few more ls shortcuts.
alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CF'

# Personal alias definitions belong in ~/.bash_aliases rather than in this
# file. See /usr/share/doc/bash-doc/examples in the bash-doc package.
if [[ -f ~/.bash_aliases ]]; then
    source ~/.bash_aliases
fi

# Programmable completion. There is no need to enable it here when
# /etc/bash.bashrc already does so and /etc/profile sources /etc/bash.bashrc.
#if [ -f /etc/bash_completion ] && ! shopt -oq posix; then
#    . /etc/bash_completion
#fi
|
||||
|
||||
# Pull in the user's own shell customizations from /data/.bashrc when that
# file exists.
if [[ -f /data/.bashrc ]]; then
    source /data/.bashrc
fi
|
||||
|
||||
# Print a banner on stderr when /data is not a mount point. The here-document
# delimiter is quoted because the text contains nothing to expand.
mountpoint -q /data || cat >&2 <<'EOF'
=======================================================
WARNING: The /data directory is NOT mounted.
Running the training process without /data mounted
could add over 140Gb of python packages and training
files to this container's storage which is probably
NOT what you want.

You should remove this container and re-create it with
a 'docker run' option like '-v <host_work_dir>:/data'
making sure the host directory is on a device that has
enough free space.
=======================================================
EOF
|
||||
|
||||
# Activate the Python virtual environment in /data/.venv when it is usable.
#
# The check is on the activate script itself rather than on the directory:
# a /data/.venv directory that exists without bin/activate (for example a
# partially created venv) would otherwise make the '.' below fail with a raw
# "No such file or directory" error instead of the explanatory banner.
if [ -f /data/.venv/bin/activate ]; then
    . /data/.venv/bin/activate
else
    # Nothing in the banner needs expansion, hence the quoted delimiter.
    cat >&2 <<'EOF'
=======================================================
WARNING: A python virtual environment wasn't found
at /data/.venv. You'll need to run 'setup_python_venv'
before you'll be able to use this container for
training.
=======================================================
EOF
fi

# 'venv' (re)activates the virtual environment on demand. An explicit
# if/else is used instead of 'a && b || c' so that a failing activation is
# not reported as "/data/.venv does not exist yet".
alias venv='if [ -f /data/.venv/bin/activate ]; then source /data/.venv/bin/activate; else echo "/data/.venv does not exist yet"; fi'
|
||||
27
cli/Dockerfile
Normal file
27
cli/Dockerfile
Normal file
@@ -0,0 +1,27 @@
|
||||
# Since this is a pure python environment, we don't need to start
|
||||
# with a huge CUDA image. A standard Ubuntu image will do.
|
||||
FROM ubuntu:24.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
PIP_ROOT_USER_ACTION=ignore \
|
||||
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
|
||||
PATH="/root/mww-scripts:${PATH}"
|
||||
|
||||
# System deps
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
|
||||
git wget curl unzip ca-certificates nano less \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& mkdir -p /data
|
||||
|
||||
COPY --chown=root:root --chmod=0755 .bashrc /root/
|
||||
COPY --chown=root:root --chmod=0755 setup_* wake_word_sample* train_wake_word \
|
||||
test_python cudainfo system_summary shell.functions requirements.txt /root/mww-scripts/
|
||||
|
||||
# Docker and Podman send the CMD a SIGTERM when you "stop" the container. Unfortunately, bash
|
||||
# normally doesn't exit when it receives a SIGTERM so docker/podman has to wait for the "stop"
|
||||
# to timeout then SIGKILL the container.
|
||||
# This little scriptlet causes bash to exit immediately when it receives the SIGTERM.
|
||||
CMD ["/usr/bin/bash", "-c", "exec /usr/bin/bash --rcfile <(echo '[ -f ~/.bashrc ] && source ~/.bashrc ; trap exit SIGTERM ;')" ]
|
||||
507
cli/README.md
Normal file
507
cli/README.md
Normal file
@@ -0,0 +1,507 @@
|
||||
# Run training from the command line
|
||||
|
||||
## Overview
|
||||
|
||||
With these scripts and Dockerfile, you can train new wake words from the
|
||||
command line without using a Jupyter notebook.
|
||||
|
||||
Differences between this Docker image and the Jupyter notebook image:
|
||||
|
||||
* The Python training environment isn't included in the image. Instead, a
|
||||
"virtual environment" (venv) is created in the `/data` directory which you
|
||||
will have mounted to a host directory. This cuts about 7gb from the image
|
||||
and allows the virtualenv to persist across container instances.
|
||||
|
||||
* The logic from the Jupyter notebook is contained in individual Python
|
||||
and shell scripts
|
||||
|
||||
* No ports need to be exposed since the Jupyter notebook server isn't being
|
||||
run.
|
||||
|
||||
## TL;DR
|
||||
|
||||
For the impatient among you...
|
||||
|
||||
```shell
|
||||
$ mkdir /some/work/directory # On a device with more than 150GB free space
|
||||
$ docker build -t microwakeword-cli:latest .
|
||||
$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# setup_python_venv
|
||||
##### You have about 4 minutes to drink coffee
|
||||
|
||||
root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
|
||||
##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
|
||||
|
||||
root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
|
||||
##### You have about 30-45 minutes for a nap depending on available system resources.
|
||||
##### You'll be informed of where to find your trained model.
|
||||
```
|
||||
|
||||
Load the trained model on your device and give it a try but don't be surprised
|
||||
if you get a lot of missed or false activations. Read on to find out why.
|
||||
|
||||
## Get Started
|
||||
|
||||
Good, you stuck around! Now read the rest of the document before doing
|
||||
anything.
|
||||
|
||||
### Using a GPU
|
||||
|
||||
Having an Nvidia GPU available can cut the training time by up to half. The
|
||||
open-source nouveau driver shipped with Linux kernels doesn't support CUDA
|
||||
however so if you have an Nvidia GPU and want to use it for training, you'll
|
||||
need to install the official Nvidia driver from
|
||||
https://www.nvidia.com/en-in/drivers/unix/
|
||||
|
||||
### Build the image
|
||||
|
||||
You can use either Docker or Podman as your container management tool.
|
||||
`docker` is used in the examples but if you have podman, just substitute
|
||||
the command.
|
||||
|
||||
Start by navigating to the directory that contains this README file and
|
||||
the accompanying Dockerfile. Then...
|
||||
|
||||
|
||||
```shell
|
||||
docker build -t microwakeword-cli:latest .
|
||||
```
|
||||
|
||||
This should be fairly quick and result in an image that's about 320mb in size
|
||||
as it's basically a standard Ubuntu 24.04 image with a few added tools.
|
||||
|
||||
So why isn't a pre-built image available for download? Because it'll probably
|
||||
take longer to download a pre-built image than for you to create it locally.
|
||||
GitHub's container registry is notoriously erratic when it comes to download
|
||||
throughput.
|
||||
|
||||
### Create a host work directory
|
||||
|
||||
This directory will contain the Python virtual environment plus all of the
|
||||
downloaded and generated data needed for training and the final trained
|
||||
models. A full environment will need about 150gb of free space but read
|
||||
further to see how to reduce this.
|
||||
|
||||
Your `<host_data_dir>` will be mounted inside the container as `/data`.
|
||||
|
||||
The training container will start a Bash shell so if you have Bash
|
||||
aliases or Bashy things you like, create a `.bashrc` file in your
|
||||
`<host_data_dir>` and put them in there. It'll automatically be included
|
||||
any time you enter the container.
|
||||
|
||||
### Create and start the container
|
||||
|
||||
There are lots of options that control container creation. The simplest example
|
||||
will create the container and give you an interactive shell. When you exit the
|
||||
shell, the container will be stopped and removed leaving your `<host_data_dir>`
|
||||
intact.
|
||||
|
||||
```shell
|
||||
$ docker run -it --rm --gpus=all -v <host_data_dir>:/data microwakeword-cli:latest
|
||||
```
|
||||
|
||||
Options:
|
||||
|
||||
* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
|
||||
* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
|
||||
around and give it a name for training more than one wake word. You
|
||||
can stop and remove it when you're ready.
|
||||
* Add a `-d` option to start the container in the background and use `docker
|
||||
attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
|
||||
|
||||
When the container starts, you'll see:
|
||||
|
||||
```text
|
||||
=======================================================
|
||||
WARNING: A python virtual environment wasn't found
|
||||
at /data/.venv. You'll need to run 'setup_python_venv'
|
||||
before you'll be able to use this container for
|
||||
training.
|
||||
=======================================================
|
||||
root@mww-cli:/#
|
||||
```
|
||||
|
||||
Don't worry about the python WARNING right now. You'll be creating the
|
||||
virtualenv in the next step.
|
||||
|
||||
If you've forgotten to create and/or mount your host data directory, you'll
|
||||
see an additional warning:
|
||||
|
||||
```text
|
||||
=======================================================
|
||||
WARNING: The /data directory is NOT mounted.
|
||||
Running the training process without /data mounted
|
||||
could add over 140Gb of python packages and training
|
||||
files to this container's storage which is probably
|
||||
NOT what you want.
|
||||
|
||||
You should remove this container and re-create it with
|
||||
a 'docker run' option like '-v <host_work_dir>:/data'
|
||||
making sure the host directory is on a device that has
|
||||
enough free space.
|
||||
=======================================================
|
||||
```
|
||||
|
||||
You can certainly continue but it's a "really bad idea"™ because your
|
||||
container storage could grow from a few hundred mb to over 140gb.
|
||||
|
||||
At this point, you're in a Bash shell.
|
||||
|
||||
### Create the Python virtual environment
|
||||
|
||||
The Python virtual environment will contain all the software needed to train.
|
||||
It gets created as `/data/.venv` and will take up about 11gb of disk space.
|
||||
|
||||
The scripts that do all the work will be in the container's PATH so to setup
|
||||
the virtual environment and install all of the packages, just run:
|
||||
|
||||
```text
|
||||
setup_python_venv [ --verbose ]
|
||||
|
||||
Options:
|
||||
|
||||
--verbose: Print the detailed "pip install" output.
|
||||
|
||||
```
|
||||
|
||||
When the installation is finished, a test of the major components will be
|
||||
run.
|
||||
|
||||
Once the process is done, you should change to the `/data` directory and
|
||||
activate the virtual environment with:
|
||||
|
||||
```shell
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# source .venv/bin/activate
|
||||
(.venv) root@mww-cli:/data#
|
||||
```
|
||||
|
||||
Technically, you don't need to do either of these since the scripts
|
||||
are in the PATH and they know to use the `/data` directory for everything.
|
||||
It's more of an "if you're interested" thing.
|
||||
|
||||
At this point, you have a container with all software installed.
|
||||
|
||||
## Get the reference data
|
||||
|
||||
The training process itself relies on a significant amount of audio reference
|
||||
data that creates a simulated "audio environment" that your wake word will be
|
||||
trained in. These "training datasets" include things like varying amounts of
|
||||
reverberation, background music, background conversations, background noise,
|
||||
etc. All said and done, it amounts to about 30gb of audio but with the
|
||||
downloaded archives and extracted intermediate files, you'll need about 85gb
|
||||
of free space. Thankfully, you only need to download the files once no
|
||||
matter how many wake words you want to train and since it's stored in
|
||||
`/data`, you can even remove the docker container and recreate it without
|
||||
losing any of it. There are 4 datasets that are required.
|
||||
|
||||
This is a three stage process...
|
||||
|
||||
1. Download zipfiles or tarballs. (about 30gb)
|
||||
2. Extract them. (about 50gb)
|
||||
3. Convert them into the final form. (about 31gb)
|
||||
|
||||
NOTE: The sizes add up to more than the 85gb stated earlier because one
|
||||
of the datasets doesn't need to be converted and is counted in both
|
||||
steps 2 and 3. You really do only need 85gb.
|
||||
|
||||
To download the archives, unpack them, and convert the audio to what's needed
|
||||
by the training process, run:
|
||||
|
||||
```text
|
||||
setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
|
||||
|
||||
Options:
|
||||
--cleanup-archives: Automatically delete the tarballs or zipfiles after
|
||||
they've been extracted.
|
||||
|
||||
--cleanup-intermediate-files: Automatically delete the intermediate files
|
||||
after they've been converted.
|
||||
|
||||
```
|
||||
|
||||
On a 1gb/sec Internet connection, this will take about 25 minutes.
|
||||
|
||||
The script detects if the datasets have already been downloaded, extracted
|
||||
and/or converted and skips those steps as appropriate so if you've run the
|
||||
script without the cleanup options, you can just run it again with those
|
||||
options to clean them up.
|
||||
|
||||
Now you're ready to train a wake word. Almost.
|
||||
|
||||
## Train a Wake Word
|
||||
|
||||
Training is done in 3 stages.
|
||||
|
||||
1. Generate thousands of samples of the wake word with various voices,
|
||||
pitches, speeds, inflections, etc.
|
||||
2. Augment the samples with the training datasets to add background noise, etc.
|
||||
3. Run the Tensorflow training.
|
||||
|
||||
### Generate a sample for verification
|
||||
|
||||
Before you start the full process, you're going to want to generate a single
|
||||
wake word sample and play it back to ensure it sounds right. The wake word
|
||||
should be spelled phonetically to give the sample generator the best chance
|
||||
of success.
|
||||
|
||||
```text
|
||||
root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
|
||||
===== Generating 1 sample of 'hey buster' =====
|
||||
Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
|
||||
Successfully loaded the model
|
||||
Batch 1/0 complete
|
||||
Done
|
||||
Sample available at /data/work/test_sample/hey_buster.wav
|
||||
Play it from your host.
|
||||
```
|
||||
|
||||
You should then play that file from your host. The reason I used "hey buster"
|
||||
as the wake word is to demonstrate why it's important to generate and listen
|
||||
to a sample. If you try that exact input and play it back, you'll notice
|
||||
that the generator didn't capture the "er" at the end very well. To get it to
|
||||
do so, I had to add a period on the end as a "spacer".
|
||||
"hey buster." worked much better.
|
||||
|
||||
When you're happy with the sample, you can run the full process.
|
||||
|
||||
### Run the full training process
|
||||
|
||||
```text
|
||||
train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
|
||||
[ --training-steps=<steps> ] [ --cleanup-work-dir ]
|
||||
<wake_word> [ <wake_word_title> ]
|
||||
|
||||
Options:
|
||||
--samples: The number of samples to generate for the wake word.
|
||||
Default: 20000
|
||||
|
||||
--batch-size: How many samples should be generated at a time. The more
|
||||
samples, the more memory is needed.
|
||||
Default: 100
|
||||
|
||||
--training-steps: Number of training steps. More training steps means better
|
||||
detection and false positive rates but also more time to train.
|
||||
Default: 25000
|
||||
|
||||
--cleanup-work-dir: Delete the /data/work directory after successful training.
|
||||
Default: false
|
||||
|
||||
<wake_word> The word to train spelled phonetically.
|
||||
Required.
|
||||
|
||||
<wake_word_title> An optional pretty name to save to the json metadata file.
|
||||
Default: The wake word with individual words capitalized
|
||||
and punctuation removed.
|
||||
|
||||
```
|
||||
|
||||
By default, the training process creates 20,000 samples of your wake word and
|
||||
runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results)
|
||||
in the [Extra Credit](#extra-credit) section below for
|
||||
why these are the defaults. Depending on resources available, this could take
|
||||
between 30 and 60 minutes.
|
||||
|
||||
The resulting tflite model files and logs will be placed in the
|
||||
`/data/output/<timestamp>-<wake_word>-<samples>-<training-steps>` directory
|
||||
and will therefore be available from your host in the directory you mapped
|
||||
`/data` to. File names will have non-filename-friendly characters in your
|
||||
wake word changed to underscores to make things easier. You'll need both the
|
||||
tflite and json files to load on your device. Exactly how you load them
|
||||
depends on the device and is beyond the scope of this project.
|
||||
|
||||
The only real measure of success is how well the resulting model works
|
||||
on a real device. If you encounter too many missed or false activations,
|
||||
increasing the number of samples would probably improve the results more
|
||||
than increasing the number of training steps. See
|
||||
[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
|
||||
|
||||
The output from the last step is filtered some by the script but still quite
|
||||
verbose. The full log will be available in the output directory as
|
||||
`training.log` if you're interested. Interpreting the log is beyond the scope
|
||||
of this project however.
|
||||
|
||||
You can train additional wake words or change the number of samples and
|
||||
training steps by simply running `train_wake_word` again. No need to repeat
|
||||
any of the earlier setup steps. If you change the wake word or the number of
|
||||
wake word samples, the work directory will be deleted and all 3 steps re-run.
|
||||
If you only change the number of training steps, the data from the first two
|
||||
steps is still valid and only the 3rd step is run.
|
||||
|
||||
All of the intermediate data is stored in the `/data/work` directory which will
|
||||
grow to about 17gb with 20,000 wake word samples. Once the tflite model is
|
||||
successfully generated and you're happy with the results, you can delete the
|
||||
`/data/work` directory.
|
||||
|
||||
### Training more than one wake word
|
||||
|
||||
Once you have a container running, you
|
||||
can easily train multiple wake words from your host:
|
||||
|
||||
```shell
|
||||
for wp in "hey_alexa" "hey_jenkins" ; do
|
||||
docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
|
||||
done
|
||||
```
|
||||
|
||||
### Training time examples
|
||||
|
||||
Training times depend on lots of things. These are examples only.
|
||||
Your Mileage May Vary!!!
|
||||
|
||||
```text
|
||||
===============================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 10000 samples, 100/batch Elapsed time: 0:06:17
|
||||
Augment 10000 samples Elapsed time: 0:04:05
|
||||
10000 training steps Elapsed time: 0:15:04
|
||||
==================================================
|
||||
Total Elapsed time: 0:25:26
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 10000 samples, 100/batch Elapsed time: 0:00:29
|
||||
Augment 10000 samples Elapsed time: 0:03:40
|
||||
10000 training steps Elapsed time: 0:08:00
|
||||
======================================================
|
||||
Total Elapsed time: 0:12:09
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 20000 samples, 100/batch Elapsed time: 0:10:38
|
||||
Augment 20000 samples Elapsed time: 0:07:04
|
||||
25000 training steps Elapsed time: 0:25:21
|
||||
======================================================
|
||||
Total Elapsed time: 0:43:03
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 20000 samples, 100/batch Elapsed time: 0:00:53
|
||||
Augment 20000 samples Elapsed time: 0:07:05
|
||||
25000 training steps Elapsed time: 0:19:13
|
||||
======================================================
|
||||
Total Elapsed time: 0:27:11
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 50000 samples, 100/batch Elapsed time: 0:30:47
|
||||
Augment 50000 samples Elapsed time: 0:20:22
|
||||
40000 training steps Elapsed time: 1:01:51
|
||||
==================================================
|
||||
Total Elapsed time: 1:53:00
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 50000 samples, 100/batch Elapsed time: 0:02:08
|
||||
Augment 50000 samples Elapsed time: 0:19:13
|
||||
40000 training steps Elapsed time: 0:42:23
|
||||
======================================================
|
||||
Total Elapsed time: 1:03:44
|
||||
================================================================================
|
||||
|
||||
|
||||
```
|
||||
|
||||
The sample generation process is really the only one that uses multiple CPUs so
|
||||
having fewer CPU threads available will probably make little difference.
|
||||
|
||||
## Extra Credit
|
||||
|
||||
### Training defaults
|
||||
|
||||
If you plan on training multiple wake words, you can set your own default
|
||||
training parameters by creating a `/data/.defaults.env` file with the
|
||||
following contents:
|
||||
|
||||
```shell
|
||||
# Variable names follow the command line parameters converted to upper case
|
||||
# and with the dashes ('-') converted to underscores ('_').
|
||||
export SAMPLES=10000
|
||||
export TRAINING_STEPS=10000
|
||||
|
||||
# Don't use the GPU for any operations. Stick with the CPU only.
|
||||
##export CUDA_VISIBLE_DEVICES=-1
|
||||
|
||||
```
|
||||
|
||||
### Examine your model with Tensorboard
|
||||
|
||||
Tensorboard is a web-based graphical model viewer. You can use it to get an
|
||||
idea of how many training steps are needed before accuracy results stop
|
||||
improving. To use it, you'll have to expose port 6006 by adding `-p
|
||||
6006:6006` to your `docker run` command line. If you didn't, don't worry.
|
||||
Remember, the /data directory is mapped to a directory on your host so you
|
||||
can simply stop and delete the current container and recreate it with the new
|
||||
`docker run` command. No need to re-run any of the setup or training steps.
|
||||
|
||||
To start Tensorboard, run:
|
||||
|
||||
```shell
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# source .venv/bin/activate
|
||||
(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
|
||||
```
|
||||
|
||||
Now on your host, point your browser at `http://localhost:6006/`,
|
||||
click "SCALARS" at the top and take a look at the various charts. You'll see
|
||||
a "train" and "validation" item for each training run you've performed. It's
|
||||
the "train" items you're interested in.
|
||||
|
||||
<a id="tensorboard-results"></a>
|
||||
|
||||
You have to be a Tensorflow expert to decipher most of the charts but
|
||||
the "Accuracy" chart for this particular wake word and 50,000 samples would
|
||||
seem to indicate that there's very little improvement after about 20,000
|
||||
training steps.
|
||||
|
||||

|
||||
|
||||
In contrast, with only 5,000 wake word samples, there's still improvement to be had after
|
||||
20,000 training steps.
|
||||
|
||||

|
||||
|
||||
Given that it's faster to generate wake word samples than it is to train,
|
||||
20,000 samples and 25,000 training steps seems like a good compromise. This
|
||||
chart has a bit less smoothing to show a bit more detail and includes the
|
||||
50,000 sample run as well. This run took only 27 minutes as opposed to the
|
||||
63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
|
||||
25,000 are the defaults for these scripts.
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
53
cli/cudainfo
Executable file
53
cli/cudainfo
Executable file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python
"""Print a short summary of the current CUDA device.

When no /dev/nvidia[0-9] device node exists, or when querying the device
through numba fails for any reason, a single "not available" line is printed
instead. The script exits with status 0 in every case.
"""

import glob
import sys

NO_CUDA_MESSAGE = "CUDA not available or no CUDA-capable GPU found."

# CUDA cores per streaming multiprocessor (SM), keyed by the device's
# (major, minor) compute capability.
cc_cores_per_SM_dict = {
    (2, 0): 32,
    (2, 1): 48,
    (3, 0): 192,
    (3, 2): 192,
    (3, 5): 192,
    (3, 7): 192,
    (5, 0): 128,
    (5, 2): 128,
    (5, 3): 128,
    (6, 0): 64,
    (6, 1): 128,
    (6, 2): 128,
    (7, 0): 64,
    (7, 2): 64,
    (7, 5): 64,
    (8, 0): 64,
    (8, 6): 128,
    (8, 7): 128,
    (8, 9): 128,
    (9, 0): 128,
    (10, 0): 128,
    (12, 0): 128,
}


def lookup_cuda_cores(compute_capability, sm_count):
    """Return (cores_per_sm, total_cores) for a device.

    compute_capability: (major, minor) pair identifying the architecture.
    sm_count: number of streaming multiprocessors on the device.

    Both elements are the string "unknown" when the compute capability is
    not present in cc_cores_per_SM_dict.
    """
    cores_per_sm = cc_cores_per_SM_dict.get(tuple(compute_capability))
    if not cores_per_sm:
        return "unknown", "unknown"
    return cores_per_sm, cores_per_sm * sm_count


def main():
    """Print the device summary (or the "not available" line); return 0."""
    if not glob.glob("/dev/nvidia[0-9]"):
        print(NO_CUDA_MESSAGE)
        return 0

    try:
        # Imported here so a missing or broken numba falls into the same
        # "not available" path as a failing device query.
        from numba import cuda

        device = cuda.get_current_device()
        meminfo = cuda.current_context().get_memory_info()
        compute_capability = device.compute_capability
        sms = device.MULTIPROCESSOR_COUNT
        cores_per_sm, total_cores = lookup_cuda_cores(compute_capability, sms)

        # The device name is decoded when it is not already a str.
        gpu_name = device.name if isinstance(device.name, str) else device.name.decode()
        capability_text = ".".join(map(str, compute_capability))

        # NOTE(review): label text is reproduced exactly as found in the
        # source; confirm any intended column alignment against the original.
        print(f" GPU Name: {gpu_name}")
        print(f" Compute Capability: {capability_text:>7}")
        print(f"Streaming Multiprocessors: {sms:>7}")
        print(f" CUDA Cores per SM: {cores_per_sm:>7}")
        print(f" Total CUDA Cores: {total_cores:>7}")
        print(f" Total Memory: {meminfo.total / 1024 / 1024:>7.0f} mb")
        print(f" Free Memory: {meminfo.free / 1024 / 1024:>7.0f} mb")
    except Exception:
        # Best effort by design: any failure is reported as "no CUDA".
        print(NO_CUDA_MESSAGE)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
10
cli/requirements.txt
Normal file
10
cli/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
# --- Packages needed by our scripts ---
|
||||
|
||||
numpy==1.26.4
|
||||
scipy==1.12.0
|
||||
librosa==0.10.2.post1
|
||||
soundfile==0.12.1
|
||||
tqdm==4.67.1
|
||||
scikit-learn==1.6.0
|
||||
numba==0.63.1
|
||||
PyYAML==6.0.3
|
||||
175
cli/setup_audioset
Executable file
175
cli/setup_audioset
Executable file
@@ -0,0 +1,175 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
if [ "${HELP}" == "true" ] ; then
|
||||
cat <<EOF >&2
|
||||
Usage: $0 [ --cleanup-archives ] [ --cleanup-input-files ] [ --data-dir=<data_dir> ]
|
||||
|
||||
--cleanup-archives : Automatically clean up any downloaded archvies after
|
||||
extraction.
|
||||
--cleanup-intermediate-files
|
||||
: Automatically clean up the intermediate files after they've
|
||||
: converted to 16k.
|
||||
<data_dir> : Path to the data directory.
|
||||
: Default: ${DATA_DIR}
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
|
||||
cd "${DATA_DIR}/training_datasets"
|
||||
|
||||
echo "***** Checking audioset *****"
|
||||
|
||||
AUDIO_URL="https://huggingface.co/datasets/agkphysics/AudioSet/resolve"
|
||||
AUDIO_DIR="./audioset"
|
||||
mkdir -p "${AUDIO_DIR}"
|
||||
AUDIO16K_DIR="./audioset_16k"
|
||||
mkdir -p "${AUDIO16K_DIR}"
|
||||
AUDIO_FILECOUNT="./downloads/audioset_filecount"
|
||||
AUDIO_IN_GLOB="*.flac"
|
||||
|
||||
declare -A filecounts
|
||||
for i in {0..9} ; do
|
||||
fname="bal_train0${i}.tar"
|
||||
filecounts[${fname}]=0
|
||||
done
|
||||
|
||||
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
|
||||
REV_CANDIDATES=(
|
||||
"6762f044d1c88619c7f2006486036192128fb07e"
|
||||
"0049167e89f259a010c3f070fe3666d9e5242836"
|
||||
"ceb9eaaa7844c9ad7351e659c84a572e376ad06d"
|
||||
"main"
|
||||
)
|
||||
|
||||
TAR_PATTERNS=(
|
||||
"data/bal_train0"
|
||||
"data/bal_train/bal_train0"
|
||||
)
|
||||
|
||||
#######################################
# Locate a dataset revision that still hosts the AudioSet tarballs.
# Every revision / path-prefix combination is probed with an HTTP HEAD
# request for "<prefix>0.tar"; the search stops at the first combination
# that answers successfully.
# Globals:   REV_CANDIDATES (read), TAR_PATTERNS (read), AUDIO_URL (read)
# Outputs:   "<rev>,<pattern>" on stdout for the first reachable combination,
#            or an empty line when none is reachable.
# Returns:   0 always, so a caller running under 'set -e' can inspect the
#            (possibly empty) output itself.
#######################################
find_rev() {
  local base="${AUDIO_URL:-https://huggingface.co/datasets/agkphysics/AudioSet/resolve}"
  local rev pattern url
  for rev in "${REV_CANDIDATES[@]}" ; do
    for pattern in "${TAR_PATTERNS[@]}" ; do
      url="${base}/${rev}/${pattern}0.tar"
      if curl -I -L --fail -s "${url}" > /dev/null ; then
        # Report exactly one pair. The caller splits the output with
        # ${dl%%,*} / ${dl##*,}; if several pairs were printed it would
        # combine the first revision with the last pattern.
        echo "${rev},${pattern}"
        return 0
      fi
    done
  done
  echo ""
  return 0
}
|
||||
|
||||
#######################################
# Convert every FLAC file under AUDIO_DIR to a 16 kHz mono 16-bit WAV in
# AUDIO16K_DIR, skipping outputs that already exist.
# Globals:   DATA_DIR (read), AUDIO_DIR (read), AUDIO16K_DIR (read)
# Outputs:   progress on stdout/stderr; when at least one file fails,
#            writes "${AUDIO16K_DIR}/audioset_corrupted_files.log" with one
#            "<path>:<error>" record per line.
# Returns:   exit status of the Python converter.
#######################################
converter() {
  # Quoted so a data directory containing spaces still works.
  source "${DATA_DIR}/.venv/bin/activate"
  # Quoted delimiter: the Python source is passed through verbatim.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import sys
from pathlib import Path

import numpy as np
import scipy.io.wavfile
import librosa
from tqdm import tqdm


def write_wav(dst: Path, data: np.ndarray, sr: int):
    """Clip to [-1, 1] and write as 16-bit PCM."""
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))


audioset_dir = Path(sys.argv[1])
audioset_out = Path(sys.argv[2])

# convert FLAC → 16k mono WAV
flacs = list(audioset_dir.rglob("*.flac"))
print(f" FLAC files: {len(flacs)}")
audioset_bad = []
ok = 0
for p in tqdm(flacs, desc=" AudioSet→WAV (resample 16k mono)"):
    try:
        outfile = audioset_out / (p.stem + ".wav")
        if outfile.exists():
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # Keep each record on a single line; see the note below.
        audioset_bad.append(f"{p}:{' '.join(str(e).split())}")

if audioset_bad:
    # The calling script counts failures with "wc -l", which counts newline
    # characters. Terminate every record (including the last) with "\n" so
    # the count equals the number of failed files.
    (audioset_out / "audioset_corrupted_files.log").write_text(
        "\n".join(audioset_bad) + "\n"
    )
print(f" AudioSet complete ({ok} ok, {len(audioset_bad)} failed)")
EOF
}
|
||||
|
||||
expected_filecount=$(get_total_filecount filecounts)
|
||||
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
|
||||
write_filecount=false
|
||||
|
||||
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
|
||||
echo " Existing Audioset valid"
|
||||
else
|
||||
dl=$(find_rev)
|
||||
[ -n "$dl" ] || { echo " Could not locate an AudioSet revision with FLAC tarballs still present on HF." ; exit 1 ; }
|
||||
rev=${dl%%,*}
|
||||
pattern=${dl##*,}
|
||||
echo " Checking 10 tarballs"
|
||||
for i in {0..9} ; do
|
||||
fname="downloads/bal_train0${i}.tar"
|
||||
if [ ! -f "${fname}" ] ; then
|
||||
echo " Downloading bal_train0${i}.tar"
|
||||
url="${AUDIO_URL}/${rev}/${pattern}${i}.tar"
|
||||
curl -L -s --fail "${url}" -o "${fname}" || { echo "Could not fetch ${fname} at rev ${rev}; continuing." ; continue ; }
|
||||
fi
|
||||
|
||||
tarball_filecount=$(tar -tvf "${fname}" | wc -l )
|
||||
filecounts["bal_train0${i}.tar"]=${tarball_filecount}
|
||||
write_filecount=true
|
||||
|
||||
echo " Untarring bal_train0${i}.tar"
|
||||
tar -xf "${fname}" -C "${AUDIO_DIR}"
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${fname}" ] ; then
|
||||
echo " Cleaning up bal_train0${i}.tar"
|
||||
rm -rf "${fname}"
|
||||
fi
|
||||
done
|
||||
rm -rf "${AUDIO16K_DIR}/audioset_corrupted_files.log" || :
|
||||
converter
|
||||
if [ -f "${AUDIO16K_DIR}/audioset_corrupted_files.log" ] ; then
|
||||
failed=$(cat "${AUDIO16K_DIR}/audioset_corrupted_files.log" | wc -l)
|
||||
filecounts[failed]=-${failed}
|
||||
fi
|
||||
expected_filecount=$(get_total_filecount filecounts)
|
||||
actual_filecount=$(find ${AUDIO16K_DIR} -name "*.wav" 2>/dev/null | wc -l) || :
|
||||
if [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
|
||||
echo " Converted file count(${actual_filecount}) != expected file count(${expected_filecount})" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if ${write_filecount} ; then
|
||||
write_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" ; then
|
||||
for i in {0..9} ; do
|
||||
fname="downloads/bal_train0${i}.tar"
|
||||
if [ -f "${fname}" ] ; then
|
||||
echo " Cleaning up bal_train0${i}.tar"
|
||||
rm -rf "${fname}"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_DIR}"
|
||||
rm -rf "${AUDIO_DIR}"
|
||||
fi
|
||||
|
||||
echo " Audioset complete"
|
||||
exit 0
|
||||
|
||||
131
cli/setup_fma
Executable file
131
cli/setup_fma
Executable file
@@ -0,0 +1,131 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
if [ "${HELP}" == "true" ] ; then
|
||||
cat <<EOF >&2
|
||||
Usage: $0 [ --cleanup-archives ] [ --cleanup-input-files ] [ --data-dir=<data_dir> ]
|
||||
|
||||
--cleanup-archives : Automatically clean up any downloaded archvies after
|
||||
extraction.
|
||||
--cleanup-intermediate-files
|
||||
: Automatically clean up the intermediate files after they've
|
||||
: converted to 16k.
|
||||
<data_dir> : Path to the data directory.
|
||||
: Default: ${DATA_DIR}
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
|
||||
cd "${DATA_DIR}/training_datasets"
|
||||
|
||||
echo "***** Checking FMA *****"
|
||||
|
||||
AUDIO_URL="https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/fma_xs.zip"
|
||||
AUDIO_ZIPFILE="fma_xs.zip"
|
||||
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
|
||||
AUDIO_DIR="fma"
|
||||
mkdir -p "${AUDIO_DIR}" || :
|
||||
AUDIO16K_DIR="fma_16k"
|
||||
mkdir -p "${AUDIO16K_DIR}" || :
|
||||
AUDIO_FILECOUNT="./downloads/fma_filecount"
|
||||
AUDIO_IN_GLOB="*.mp3"
|
||||
|
||||
declare -A filecounts=( [${AUDIO_ZIPFILE}]=0 )
|
||||
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
#######################################
# Convert every MP3 file under AUDIO_DIR to a 16 kHz mono 16-bit WAV in
# AUDIO16K_DIR, skipping outputs that already exist.
# Globals:   DATA_DIR (read), AUDIO_DIR (read), AUDIO16K_DIR (read)
# Outputs:   progress on stdout/stderr; when at least one file fails,
#            writes "${AUDIO16K_DIR}/fma_corrupted_files.log" with one
#            "<path>:<error>" record per line.
# Returns:   exit status of the Python converter.
#######################################
converter() {
  # Quoted so a data directory containing spaces still works.
  source "${DATA_DIR}/.venv/bin/activate"
  # Quoted delimiter: the Python source is passed through verbatim.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import sys
from pathlib import Path

import numpy as np
import scipy.io.wavfile
import librosa
from tqdm import tqdm


def write_wav(dst: Path, data: np.ndarray, sr: int):
    """Clip to [-1, 1] and write as 16-bit PCM."""
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))


fma_dir = Path(sys.argv[1])
fma_out = Path(sys.argv[2])

# convert MP3 → 16k mono WAV
mp3s = list(fma_dir.rglob("*.mp3"))
print(f" MP3 files: {len(mp3s)}")
fma_bad = []
ok = 0
for p in tqdm(mp3s, desc=" FMA→WAV (resample 16k mono)"):
    try:
        outfile = fma_out / (p.stem + ".wav")
        if outfile.exists():
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # Collapse whitespace so each failure stays on one log line.
        fma_bad.append(f"{p}:{' '.join(str(e).split())}")

if fma_bad:
    # One newline-terminated record per failed file.
    (fma_out / "fma_corrupted_files.log").write_text("\n".join(fma_bad) + "\n")
print(f" FMA complete ({ok} ok, {len(fma_bad)} failed)")
EOF
}
|
||||
|
||||
expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
|
||||
actual_filecount=$(find ${AUDIO16K_DIR} -name '*.wav' 2>/dev/null | wc -l) || :
|
||||
write_filecount=false
|
||||
|
||||
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
|
||||
echo " Existing FMA valid"
|
||||
else
|
||||
actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
|
||||
if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
|
||||
if [ ! -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Downloading ${AUDIO_ZIPFILE}"
|
||||
curl -sfL "${AUDIO_URL}" -o "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
rm -rf "${AUDIO_DIR}" || :
|
||||
mkdir "${AUDIO_DIR}"
|
||||
echo " Unzipping ${AUDIO_ZIPFILE}"
|
||||
unzip -q -d "${AUDIO_DIR}" "${AUDIO_ZIP}"
|
||||
fi
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
converter
|
||||
|
||||
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
|
||||
filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
|
||||
write_filecount=true
|
||||
fi
|
||||
|
||||
if ${write_filecount} ; then
|
||||
write_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ]; then
|
||||
echo " Cleaning up ${AUDIO_DIR}"
|
||||
rm -rf "${AUDIO_DIR}"
|
||||
fi
|
||||
|
||||
echo " FMA complete"
|
||||
exit 0
|
||||
|
||||
124
cli/setup_mit_audio
Executable file
124
cli/setup_mit_audio
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
if [ "${HELP}" == "true" ] ; then
|
||||
cat <<EOF >&2
|
||||
Usage: $0 [ --cleanup-archives ] [ --cleanup-input-files ] [ --data-dir=<data_dir> ]
|
||||
|
||||
--cleanup-archives : Automatically clean up any downloaded archvies after
|
||||
extraction.
|
||||
--cleanup-intermediate-files
|
||||
: Automatically clean up the intermediate files after they've
|
||||
: converted to 16k.
|
||||
<data_dir> : Path to the data directory.
|
||||
: Default: ${DATA_DIR}
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
|
||||
cd "${DATA_DIR}/training_datasets"
|
||||
|
||||
AUDIO_URL="https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip"
|
||||
AUDIO_ZIPFILE="MIT_RIR_Audio.zip"
|
||||
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
|
||||
AUDIO_DIR="./mit_rirs"
|
||||
mkdir -p "${AUDIO_DIR}" || :
|
||||
AUDIO16K_DIR="./mit_rirs_16k"
|
||||
mkdir -p "${AUDIO16K_DIR}" || :
|
||||
AUDIO_FILECOUNT="./downloads/mit_rir_filecount"
|
||||
AUDIO_IN_GLOB="*.wav"
|
||||
|
||||
declare -A filecounts=( [${AUDIO_ZIPFILE}]=0 )
|
||||
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
echo "===== Checking MIT_RIR ====="
|
||||
|
||||
#######################################
# Normalize every WAV file under AUDIO_DIR to a 16 kHz mono 16-bit WAV of
# the same file name in AUDIO16K_DIR, skipping outputs that already exist.
# Globals:   DATA_DIR (read), AUDIO_DIR (read), AUDIO16K_DIR (read)
# Outputs:   progress on stdout/stderr.
# Returns:   exit status of the Python converter (non-zero when any file
#            raises, because the exception is re-raised).
#######################################
converter() {
  # Quoted so a data directory containing spaces still works.
  source "${DATA_DIR}/.venv/bin/activate"
  # Quoted delimiter: the Python source is passed through verbatim.
  python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" <<'EOF'
import sys
from pathlib import Path

import numpy as np
import scipy.io.wavfile
import soundfile as sf
import librosa
from tqdm import tqdm


def write_wav(dst: Path, data: np.ndarray, sr: int):
    """Clip to [-1, 1] and write as 16-bit PCM."""
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))


rir_in = Path(sys.argv[1])
rir_out = Path(sys.argv[2])

waves = list(rir_in.rglob("*.wav"))
try:
    print(" MIT RIR normalizing to 16k…")
    # Normalize to 16k mono
    for p in tqdm(waves, desc=" MIT_RIR (resample 16k mono)"):
        outfile = rir_out / p.name
        if outfile.exists():
            continue
        a, sr = sf.read(p, always_2d=False)
        if a.ndim > 1:
            # Multi-channel input already at 16 kHz: keep the first channel.
            a = a[:, 0]
        if sr != 16000:
            # Any other rate: let librosa reload, resample and downmix.
            a, _ = librosa.load(p, sr=16000, mono=True)
        write_wav(outfile, a, 16000)
    print(" MIT RIR normalization complete")
except Exception as e2:
    print(f" MIT RIR fallback failed: {e2}")
    raise
EOF
}
|
||||
|
||||
expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
|
||||
actual_filecount=$(find "${AUDIO16K_DIR}" -name '*.wav' 2>/dev/null | wc -l) || :
|
||||
write_filecount=false
|
||||
|
||||
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
|
||||
echo " Existing ${AUDIO16K_DIR} valid"
|
||||
else
|
||||
actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
|
||||
if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
|
||||
if [ ! -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Downloading ${AUDIO_ZIPFILE}"
|
||||
curl -sfL "${AUDIO_URL}" -o "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
rm -rf "${AUDIO_DIR}" || :
|
||||
echo " Unzipping ${AUDIO_ZIPFILE}"
|
||||
unzip -u -q -d "${AUDIO_DIR}" "${AUDIO_ZIP}"
|
||||
fi
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
converter
|
||||
actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
|
||||
filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
|
||||
write_filecount=true
|
||||
fi
|
||||
|
||||
if ${write_filecount} ; then
|
||||
write_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_INTERMEDIATE_FILES}" && [ -d "${AUDIO_DIR}" ]; then
|
||||
echo " Cleaning up ${AUDIO_DIR}"
|
||||
rm -rf "${AUDIO_DIR}"
|
||||
fi
|
||||
|
||||
echo " MIT_RIR complete"
|
||||
exit 0
|
||||
85
cli/setup_negative_datasets
Executable file
85
cli/setup_negative_datasets
Executable file
@@ -0,0 +1,85 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
if [ "${HELP}" == "true" ] ; then
|
||||
cat <<EOF >&2
|
||||
Usage: $0 [ --cleanup-archives ] [ --data-dir=<data_dir> ]
|
||||
|
||||
--cleanup-archives : Automatically clean up any downloaded archvies after
|
||||
extraction.
|
||||
<data_dir> : Path to the data directory.
|
||||
: Default: ${DATA_DIR}
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p "${DATA_DIR}/training_datasets/downloads" || :
|
||||
cd "${DATA_DIR}/training_datasets"
|
||||
|
||||
mkdir -p ./negative_datasets || :
|
||||
|
||||
NEGATIVE_DATASET_URL="https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main"
|
||||
declare -a NEGATIVE_DATASETS=( dinner_party dinner_party_eval no_speech speech )
|
||||
AUDIO_FILECOUNT="./downloads/negative_filecount"
|
||||
|
||||
declare -A filecounts=( [dinner_party.zip]=0 [dinner_party_eval.zip]=0 [no_speech.zip]=0 [speech.zip]=0 )
|
||||
get_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
|
||||
echo "===== Checking negative datasets: ${NEGATIVE_DATASETS[*]} ====="
|
||||
write_filecount=false
|
||||
|
||||
for ds in "${NEGATIVE_DATASETS[@]}" ; do
|
||||
AUDIO_ZIPFILE="${ds}.zip"
|
||||
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
|
||||
AUDIO_DIR="./negative_datasets/${ds}"
|
||||
mkdir -p "${AUDIO_DIR}" || :
|
||||
|
||||
expected_filecount=${filecounts[${AUDIO_ZIPFILE}]}
|
||||
actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || :
|
||||
|
||||
if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
|
||||
echo " Existing ${ds} valid"
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ ! -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Downloading ${AUDIO_ZIPFILE}"
|
||||
curl -sfL "${NEGATIVE_DATASET_URL}/${ds}.zip" -o "${AUDIO_ZIP}"
|
||||
fi
|
||||
|
||||
rm -rf "${AUDIO_DIR}" || :
|
||||
echo " Unzipping ${AUDIO_ZIPFILE}"
|
||||
unzip -q -d "./negative_datasets" "${AUDIO_ZIP}"
|
||||
actual_filecount=$(find "${AUDIO_DIR}" -name '*.ninja' 2>/dev/null | wc -l) || :
|
||||
filecounts[${AUDIO_ZIPFILE}]="${actual_filecount}"
|
||||
write_filecount=true
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
done
|
||||
|
||||
if ${write_filecount} ; then
|
||||
write_filecounts filecounts "${AUDIO_FILECOUNT}"
|
||||
fi
|
||||
|
||||
if "${CLEANUP_ARCHIVES}" ; then
|
||||
for ds in "${NEGATIVE_DATASETS[@]}" ; do
|
||||
AUDIO_ZIPFILE="${ds}.zip"
|
||||
AUDIO_ZIP="./downloads/${AUDIO_ZIPFILE}"
|
||||
if [ -f "${AUDIO_ZIP}" ] ; then
|
||||
echo " Cleaning up ${AUDIO_ZIPFILE}"
|
||||
rm -rf "${AUDIO_ZIP}"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
echo " Negative datasets complete"
|
||||
|
||||
183
cli/setup_python_venv
Executable file
183
cli/setup_python_venv
Executable file
@@ -0,0 +1,183 @@
|
||||
#!/bin/bash
|
||||
PROGDIR="$(dirname $(realpath $0))"
|
||||
|
||||
KNOWN_ARGS=( data-dir python gpu no-gpu )
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
|
||||
echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
|
||||
HELP=true
|
||||
fi
|
||||
|
||||
if [ "${HELP}" == "true" ] ; then
|
||||
cat <<EOF >&2
|
||||
Usage: setup_python_venv [ --gpu | --no-gpu ] [ --verbose ]
|
||||
|
||||
Options:
|
||||
--gpu: Install the GPU-capable versions of packages if available. This
|
||||
is the default if the script detects that a GPU is available.
|
||||
|
||||
--no-gpu: Install the non-GPU-capable versions of packages even if
|
||||
GPU-capable packages are available. This is the default if the script
|
||||
detects that a GPU is NOT available.
|
||||
|
||||
--verbose: Print the detailed "pip install" output.
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath ${DATA_DIR})"
|
||||
[ -d "${DATA_DIR}" ] || {
|
||||
echo "Data directory '${DATA_DIR}' doesn't exist." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
cd "${DATA_DIR}"
|
||||
|
||||
[ -z "${GPU}" ] && {
|
||||
GPU=false
|
||||
[ -c /dev/nvidiactl ] && {
|
||||
GPU=true
|
||||
echo " Nvidia GPU detected"
|
||||
}
|
||||
}
|
||||
|
||||
"${GPU}" || export CUDA_VISIBLE_DEVICES=-1
|
||||
|
||||
VENV="${DATA_DIR}/.venv"
|
||||
[ -n "${VIRTUAL_ENV}" ] && deactivate
|
||||
|
||||
if [ -n "${PYTHON}" ] ; then
|
||||
PYTHONS=( "${PYTHON}" )
|
||||
unset PYTHON
|
||||
else
|
||||
PYTHONS=( python3.12 python3.10 )
|
||||
fi
|
||||
|
||||
for p in "${PYTHONS[@]}" ; do
|
||||
"${p}" --version &>/dev/null && { PYTHON="${p}" ; break ; }
|
||||
done
|
||||
|
||||
[ -n "${PYTHON}" ] || {
|
||||
echo "A python 3.12 or 3.10 interpreter wasn't found. You 'll need to install one before proceeding." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
if [ -d "${VENV}" ] ; then
|
||||
if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then
|
||||
source "${VENV}/bin/activate" || {
|
||||
echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
|
||||
exit 1
|
||||
}
|
||||
else
|
||||
rm -rf "${VENV}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "===== Setting up Python environment ${VENV} ====="
|
||||
|
||||
if [ -z "$VIRTUAL_ENV" ] ; then
|
||||
echo " ===== Creating new virtualenv at '${VENV}' ====="
|
||||
else
|
||||
echo " ===== Updating virtualenv at '${VENV}' ====="
|
||||
fi
|
||||
${PYTHON} -m venv --upgrade-deps "${VENV}"
|
||||
source "${VENV}/bin/activate"
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
declare -a progfiles=( $(find ${PROGDIR} -mindepth 1 -maxdepth 1 -executable -type f) )
|
||||
progfiles+=( "${PROGDIR}/shell.functions" )
|
||||
|
||||
for f in "${progfiles[@]}" ; do
|
||||
ln -sfr "${f}" ".venv/bin/$(basename ${f})"
|
||||
done
|
||||
|
||||
#
|
||||
# Pip doesn't process packages from requirements.txt in
|
||||
# order but order is important because tensorflow, torch,
|
||||
# onnxruntime and micro-wake-word all depend on CUDA packages
|
||||
# at various versions. They need to be installed in this specific
|
||||
# order or they may not be able to use the GPU.
|
||||
#
|
||||
export PIP_PROGRESS_BAR=off
|
||||
export PIP_NO_COLOR=1
|
||||
export PIP_QUIET=0
|
||||
|
||||
#######################################
# Run "pip install" with the given arguments.
# In verbose mode pip's output is shown unchanged. Otherwise every stdout
# line is reduced to a single "." as a compact progress indicator
# (stderr is left untouched).
# Globals:   VERBOSE (read; only the literal value "true" enables verbose)
# Arguments: passed straight through to "pip install"
# Outputs:   pip output or progress dots, followed by a newline
# Returns:   0 on success, otherwise the non-zero status of the failed
#            pipeline stage (normally pip's own exit status)
#######################################
pip_install() {
    local rc=0
    if [[ "${VERBOSE:-false}" == "true" ]] ; then
        pip install "$@" || rc=$?
    else
        # pipefail is enabled inside a subshell so that a pip failure is
        # reported even when the caller has not enabled it; without it the
        # pipeline status would be that of the final 'tr'.
        (
            set -o pipefail
            pip install "$@" \
                | stdbuf -i0 -o0 tr -d '[:print:]' \
                | stdbuf -i0 -o0 tr '\n' '.'
        ) || rc=$?
    fi
    echo
    return "${rc}"
}
|
||||
|
||||
START_TS=$EPOCHSECONDS
|
||||
|
||||
echo " ===== Installing common requirements ====="
|
||||
pip_install -r "${PROGDIR}/requirements.txt"
|
||||
|
||||
${GPU} && tfgpu='[and-cuda]' || tfgpu=""
|
||||
echo " ===== Installing Tensorflow${tfgpu} ====="
|
||||
pip_install ai_edge_litert "tensorflow${tfgpu}==2.20.0" "tensorboard==2.20.0" \
|
||||
"tensorboard-data-server==0.7.2"
|
||||
|
||||
${GPU} && torchgpu='--index-url https://download.pytorch.org/whl/cu129' || torchgpu=""
|
||||
echo " ===== Installing torch and torchaudio ${torchgpu:+[cuda]} ====="
|
||||
pip_install "torch==2.9.1" "torchaudio==2.9.1" ${torchgpu}
|
||||
|
||||
echo " ===== Checking microwakeword ====="
|
||||
MWW="${DATA_DIR}/tools/microWakeWord"
|
||||
if [ ! -d "${MWW}" ] || [ -n "$(git -C "${MWW}" status --porcelain)" ] ; then
|
||||
rm -rf "${MWW}" || :
|
||||
echo " Cloning micro-wake-word to ${DATA_DIR}/tools"
|
||||
git clone https://github.com/TaterTotterson/micro-wake-word "${MWW}" &>/dev/null
|
||||
fi
|
||||
echo " Installing microwakeword"
|
||||
pip_install -e "${MWW}"
|
||||
|
||||
echo " ===== Checking piper-sample-generator ====="
|
||||
PSG="${DATA_DIR}/tools/piper-sample-generator"
|
||||
if [ ! -d "${PSG}" ] || [ -n "$(git -C ${PSG} status --porcelain)" ] ; then
|
||||
rm -rf "${PSG}" || :
|
||||
echo " Cloning piper-sample-generator to ${DATA_DIR}/tools"
|
||||
git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null
|
||||
fi
|
||||
echo " Installing piper-sample-generator"
|
||||
pip_install -e "${PSG}"
|
||||
git -C tools/piper-sample-generator clean -fd &>/dev/null
|
||||
|
||||
MODELS_DIR="${PSG}/models"
|
||||
MODEL_NAME="en_US-libritts_r-medium.pt"
|
||||
MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}"
|
||||
MODEL_URL="https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/${MODEL_NAME}"
|
||||
if [ ! -f "${MODEL_FILE}" ] ; then
|
||||
echo " Downloading ${MODEL_NAME} for piper-sample-generator"
|
||||
curl -sfL "${MODEL_URL}" -o "${MODEL_FILE}"
|
||||
fi
|
||||
|
||||
if [ ! -f "${MODEL_FILE}.json" ] ; then
|
||||
echo " Downloading ${MODEL_NAME}.json for piper-sample-generator"
|
||||
curl -sfL "${MODEL_URL}.json" -o "${MODEL_FILE}.json"
|
||||
fi
|
||||
|
||||
${GPU} && onnxgpu='-gpu[cuda]' || onnxgpu=""
|
||||
echo " ===== Installing onnxruntime${onnxgpu} ====="
|
||||
pip_install "onnxruntime${onnxgpu}>=1.16.0"
|
||||
|
||||
echo " ===== Installing keras ====="
|
||||
# keras 3.13 has "issues" so we need to back down to 3.12.
|
||||
pip_install "keras==3.12.0"
|
||||
|
||||
${PROGDIR}/test_python --data-dir="${DATA_DIR}"
|
||||
|
||||
touch .mww-data-dir
|
||||
END_TS=$EPOCHSECONDS
|
||||
|
||||
echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
|
||||
|
||||
print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
|
||||
|
||||
|
||||
48
cli/setup_training_datasets
Executable file
48
cli/setup_training_datasets
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
|
||||
KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files )
|
||||
source "${PROGDIR}/shell.functions"
|
||||
|
||||
if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
|
||||
echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
|
||||
HELP=true
|
||||
fi
|
||||
|
||||
if [ "${HELP}" == "true" ] ; then
|
||||
cat <<EOF >&2
|
||||
Usage: setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
|
||||
|
||||
Options:
|
||||
--cleanup-archives: Automatically delete the tarballs or zipfiles after
|
||||
they've been extracted.
|
||||
|
||||
--cleanup-intermediate-files: Automatically delete the intermediate files
|
||||
after they've been converted.
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd "${DATA_DIR}"

START_TS=$EPOCHSECONDS
echo -e "\n===== Setting up Training Datasets =====\n"

# Run each dataset helper in turn with the same cleanup settings. The script
# runs under 'set -e', so the first helper that fails aborts the whole setup.
# The command path is quoted so an install location containing spaces works.
for setup_script in setup_negative_datasets setup_mit_audio setup_audioset setup_fma ; do
  "${PROGDIR}/${setup_script}" --cleanup-archives="${CLEANUP_ARCHIVES}" \
    --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" --data-dir="${DATA_DIR}"
done

# Same clock and unit (whole seconds) as START_TS.
END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup"
|
||||
150
cli/shell.functions
Normal file
150
cli/shell.functions
Normal file
@@ -0,0 +1,150 @@
|
||||
|
||||
if [ "$0" == "${BASH_SOURCE[0]}" ] ; then
|
||||
echo "${BASH_SOURCE[0]} is meant to be 'sourced' not run directly" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -v DATA_DIR ] ; then
|
||||
[ -f .mww-data-dir ] && DATA_DIR="${PWD}" || DATA_DIR="/data"
|
||||
fi
|
||||
|
||||
DEFAULT_SAMPLES=20000
|
||||
DEFAULT_BATCH_SIZE=100
|
||||
DEFAULT_TRAINING_STEPS=25000
|
||||
|
||||
[ -f "${DATA_DIR}/.defaults.env" ] && source "${DATA_DIR}/.defaults.env" || :
|
||||
|
||||
: "${SAMPLES:=${DEFAULT_SAMPLES}}"
|
||||
: "${BATCH_SIZE:=${DEFAULT_BATCH_SIZE}}"
|
||||
: "${TRAINING_STEPS:=${DEFAULT_TRAINING_STEPS}}"
|
||||
: "${CLEANUP_WORK_DIR:=false}"
|
||||
: "${CLEANUP_ARCHIVES:=false}"
|
||||
: "${CLEANUP_INTERMEDIATE_FILES:=false}"
|
||||
: "${QUIET:=false}"
|
||||
: "${VERBOSE:=false}"
|
||||
|
||||
HELP=false
|
||||
|
||||
if [ -v KNOWN_ARGS ] ; then
|
||||
KNOWN_ARGS+=( help verbose quiet h v q )
|
||||
fi
|
||||
# ---------------------------------------------------------------------------
# Command-line parsing. Runs at source time against the "$@" visible here.
#
# Results:
#   OPTION_COUNT     number of arguments processed before a literal "--"
#                    (positional arguments included)
#   POSITIONAL_ARGS  arguments that match none of the option forms below
#   EXTRA_ARGS       everything after a literal "--"
#   UNKNOWN_ARGS     options whose name is not in KNOWN_ARGS (only filled in
#                    when the sourcing script defined KNOWN_ARGS)
#   -h | --help      HELP=true and parsing stops
#   -q | --quiet     QUIET=true
#   -v | --verbose   VERBOSE=true
#   --name=value     NAME=value   ("-" in the name becomes "_", upper-cased)
#   --no-name        NAME=false
#   --name           NAME=true
# ---------------------------------------------------------------------------
declare -gi OPTION_COUNT=0
declare -ga POSITIONAL_ARGS=()
declare -ga EXTRA_ARGS=()
declare -ga UNKNOWN_ARGS=()
declare -i __stop_parsing=0
for a in "$@"; do
  if [ "$a" == "--" ] ; then
    __stop_parsing=1
    # NOTE(review): this shift (and the one below) drops arguments from the
    # FRONT of "$@", not the one being examined; kept so the positional
    # parameters seen after sourcing are unchanged from before.
    shift
    continue
  fi
  if [ $__stop_parsing == 1 ] ; then
    EXTRA_ARGS+=( "$a" )
    shift
    continue
  fi

  # Validate the option name against the caller's whitelist, if it has one.
  if [ -v KNOWN_ARGS ] && [[ "${a}" =~ ^--?([^=]+)=?.* ]] ; then
    _arg=${BASH_REMATCH[1]}
    known=false
    for _k in "${KNOWN_ARGS[@]}" ; do
      [ "${_arg}" == "${_k}" ] && { known=true ; break ; } || :
    done
    $known || UNKNOWN_ARGS+=( "${a}" )
  fi
  OPTION_COUNT+=1
  case "$a" in
    -h | --help)
      HELP=true
      # Nothing after a help request matters.
      break
      ;;
    -q | --quiet)
      # No 'break' here: later options must still be parsed.
      QUIET=true
      ;;
    -v | --verbose)
      # No 'break' here: later options must still be parsed.
      VERBOSE=true
      ;;
    --*=*)
      [[ $a =~ --([^=]+)=(.*) ]]
      l=${BASH_REMATCH[1]//-/_}
      declare -n var="${l^^}"
      var="${BASH_REMATCH[2]}"
      ;;
    --no-*)
      [[ $a =~ --no-(.+) ]]
      l=${BASH_REMATCH[1]//-/_}
      declare -n var="${l^^}"
      var=false
      ;;
    --*)
      [[ $a =~ --(.+) ]]
      l=${BASH_REMATCH[1]//-/_}
      declare -n var="${l^^}"
      var=true
      ;;
    *)
      POSITIONAL_ARGS+=( "$a" )
      ;;
  esac
done
|
||||
|
||||
|
||||
#######################################
# Print "<message> Elapsed time: H:MM:SS" right-justified in 80 columns,
# optionally framed by two lines of 80 "=" characters.
# Arguments: [--no-separators]  omit the "=" lines
#            $1 - start timestamp (seconds since the epoch; a fractional
#                 part is accepted and truncated)
#            $2 - end timestamp (same format)
#            $3 - optional message prefix
# Outputs:   the formatted line(s) on stdout
# Returns:   exit status of the Python interpreter
#######################################
print_elapsed_time() {
  local print_seps=True
  if [ "${1:-}" == "--no-separators" ] ; then
    shift
    print_seps=False
  fi
  local START_TS=${1:?"Usage: $0 <start_timestamp> <end_timestamp>"}
  local END_TS=${2:?"Usage: $0 <start_timestamp> <end_timestamp>"}
  local message="${3:-}"
  # Values are handed to Python as arguments rather than spliced into the
  # program text, so a message containing quotes or braces cannot break it.
  python - "${START_TS}" "${END_TS}" "${message}" "${print_seps}" <<'EOF'
import sys
from datetime import datetime

start, end, message, seps = sys.argv[1:5]
st = datetime.fromtimestamp(int(float(start)))
et = datetime.fromtimestamp(int(float(end)))
msg = f"{message} Elapsed time: {et - st!s}"
if seps == "True":
    print("=" * 80)
print(f"{msg:>80s}")
if seps == "True":
    print("=" * 80)
EOF
}
|
||||
|
||||
#######################################
# Print a string centred within a field of the given width by right-aligning
# it in a field of (string length + width) / 2 columns.
# Arguments: $1 - text to print (must be non-empty)
#            $2 - total field width
# Outputs:   the padded text followed by a newline
#######################################
justify_text() {
  # 'local' keeps these from overwriting same-named variables in the caller.
  local msg="${1:?Need a string}"
  local len="${2:?Need a length}"
  printf "%*s\n" $(( (${#msg} + len) / 2 )) "${msg}"
}
|
||||
|
||||
#######################################
# Load "<name>:<count>" records from a file into an associative array.
# Lines that do not match that shape are ignored; a missing file leaves the
# array untouched.
# Arguments: $1 - NAME of the caller's associative array to update
#            $2 - path of the file-count file
# Returns:   0
#######################################
get_filecounts() {
  # 'local -n' makes a nameref. (The '-l' flag of declare is the lowercase
  # attribute, not "local", so it is not used here.)
  local -n __gfc_counts=${1}
  local __gfc_file=${2}
  local __gfc_line
  [ -f "${__gfc_file}" ] || return 0
  while IFS= read -r __gfc_line || [ -n "${__gfc_line}" ] ; do
    if [[ "${__gfc_line}" =~ ^([^:]+):([0-9-]+)$ ]] ; then
      __gfc_counts["${BASH_REMATCH[1]}"]=${BASH_REMATCH[2]}
    fi
  done < "${__gfc_file}"
  return 0
}
|
||||
|
||||
#######################################
# Sum all values of an associative array of file counts.
# Arguments: $1 - NAME of the caller's associative array
# Outputs:   the integer total on stdout
#######################################
get_total_filecount() {
  # 'local -n' / 'local -i': nameref and integer attributes. (The '-l' flag
  # of declare is the lowercase attribute, not "local".)
  local -n __gtf_counts=${1}
  local -i __gtf_total=0
  local __gtf_value
  for __gtf_value in "${__gtf_counts[@]}" ; do
    # Integer attribute: += performs arithmetic addition, so negative
    # entries subtract from the total.
    __gtf_total+=${__gtf_value}
  done
  echo "${__gtf_total}"
}
|
||||
|
||||
#######################################
# Replace a file-count file with one "<name>:<count>" line per entry of an
# associative array.
# Arguments: $1 - NAME of the caller's associative array
#            $2 - path of the file to (re)write
#######################################
write_filecounts() {
  # 'local -n' makes a nameref. (The '-l' flag of declare is the lowercase
  # attribute, not "local".)
  local -n __wfc_counts=${1}
  local __wfc_file=${2}
  local __wfc_key
  # Start from scratch so entries from an earlier run do not linger.
  rm -rf "${__wfc_file}" || :
  for __wfc_key in "${!__wfc_counts[@]}" ; do
    echo "${__wfc_key}:${__wfc_counts[${__wfc_key}]}" >> "${__wfc_file}"
  done
}
|
||||
18
cli/system_summary
Executable file
18
cli/system_summary
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
# Print a short summary of the host: CPU model, core count and system
# memory, plus GPU name, CUDA core count and GPU memory when the sibling
# 'cudainfo' helper reports a GPU.
PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

CUDA_INFO=$("${PROGDIR}/cudainfo")
# Here-strings are quoted so the multi-line report reaches sed unchanged.
# CUDA_CORES stays empty when cudainfo prints a non-numeric total.
CUDA_CORES="$(sed -n -r -e 's/\s*Total\s+CUDA\s+Cores:\s+([0-9]+)$/\1/gp' <<<"${CUDA_INFO}")"
GPU_NAME="$(sed -n -r -e 's/\s*GPU\s+Name:\s+(.+)$/\1/gp' <<<"${CUDA_INFO}")"
GPU_MEMORY="$(sed -n -r -e 's/\s*Total\s+Memory:\s*([0-9.]+).*/\1/gp' <<<"${CUDA_INFO}")"
CPU_NAME="$(sed -n -r -e 's/model\s+name\s*:\s*(.+)$/\1/gp' /proc/cpuinfo | head -1)"
CPU_CORES="$(nproc)"
SYS_MEMORY="$(free -m | sed -n -r -e 's/Mem:\s+([0-9.]+)\s+.*/\1/gp')"

printf "CPU: %s (%d cores) Memory: %s mb\n" "${CPU_NAME}" "${CPU_CORES}" "${SYS_MEMORY}"
if [ -z "${GPU_NAME}" ] ; then
  printf "GPU: N/A\n"
else
  # %s with a fallback: an empty core count would otherwise be rendered by
  # %d as a misleading "0 cores".
  printf "GPU: %s (%s cores) Memory: %s mb\n" "${GPU_NAME}" "${CUDA_CORES:-unknown}" "${GPU_MEMORY}"
fi
|
||||
BIN
cli/tensorboard1.png
Normal file
BIN
cli/tensorboard1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 20 KiB |
BIN
cli/tensorboard2.png
Normal file
BIN
cli/tensorboard2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
BIN
cli/tensorboard3.png
Normal file
BIN
cli/tensorboard3.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 43 KiB |
129
cli/test_python
Executable file
129
cli/test_python
Executable file
@@ -0,0 +1,129 @@
|
||||
#!/bin/bash
# Smoke-test the Python environment in ${DATA_DIR}/.venv: reports whether
# CUDA, TensorFlow, Torch, onnxruntime, micro-wake-word and
# piper-sample-generator can be loaded, and whether a GPU is usable by each.
# Every probe prints a human-readable line; nothing here aborts the script.

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")
# Set before shell.functions is sourced.
# NOTE(review): presumably these act as presets for it — confirm.
TRAINING_STEPS=40000
DATA_DIR=/data
source "${PROGDIR}/shell.functions"

source "${DATA_DIR}/.venv/bin/activate"

# Keep TensorFlow / glog / gRPC chatter out of the report.
export TF_CPP_MIN_LOG_LEVEL=9
export GLOG_minloglevel=2
export GRPC_VERBOSITY="ERROR"

echo -e "\n===== Testing Python Environment =====\n"

echo -e "\n===== Testing Cuda =====\n"
"${PROGDIR}/cudainfo"

# The Python probes use quoted heredoc delimiters so the shell never expands
# anything inside the Python source.  They catch Exception rather than using a
# bare "except:", so Ctrl-C is no longer reported as "not available".
python - 2>/dev/null <<'EOF'
import os, sys

print("\n===== Testing Tensorflow =====\n")
try:
    from ai_edge_litert.interpreter import Interpreter
    import tensorflow as tf

    try:
        with tf.device("/GPU:0"):
            a = tf.random.normal([10000, 10000])
            b = tf.random.normal([10000, 10000])
            c = tf.matmul(a, b)
        # The op may silently be placed elsewhere; trust the result's device.
        if c.device.find("GPU") >= 0:
            result = "Available - " + c.device
        else:
            result = "Not available"
    except Exception:
        result = "Not available"

    print("GPU:", result)

    try:
        with tf.device("/CPU:0"):
            a = tf.random.normal([10000, 10000])
            b = tf.random.normal([10000, 10000])
            c = tf.matmul(a, b)
        result = "Available - " + c.device
    except Exception:
        result = "Not available"

    print("CPU:", result)
except Exception:
    print("Tensorflow not available")
EOF


python - 2>/dev/null <<'EOF'
import os, sys
print("\n===== Testing Torch =====\n")

try:
    import torch

    if torch.cuda.is_available():
        print(f"GPU: Available - {torch.cuda.get_device_name(0)}")
    else:
        print("GPU:", "Not available")
    print("CPU:", "Available")
except Exception:
    print("Torch not available")
EOF

python - 2>/dev/null <<'EOF'
import os, sys
print("\n===== Testing onnxruntime =====\n")

try:
    import onnxruntime as ort

    providers = ort.get_available_providers()
    if 'CUDAExecutionProvider' in providers:
        print("GPU:", "Available")
    else:
        print("GPU:", "Not available")

    if 'CPUExecutionProvider' in providers:
        print("CPU:", "Available")
    else:
        print("CPU:", "Not available")

    if 'TensorrtExecutionProvider' in providers:
        print("TensorRT:", "Available")
    else:
        print("TensorRT:", "Not available")
except Exception:
    print("onnxruntime not available")
EOF

python - 2>/dev/null <<'EOF'
import os, sys

print("\n===== Testing micro-wake-word =====\n")

try:
    import numpy as np
    import librosa
    from mmap_ninja.ragged import RaggedMmap
    from microwakeword.audio.augmentation import Augmentation
    from microwakeword.audio.clips import Clips
    from microwakeword.audio.spectrograms import SpectrogramGeneration
    from microwakeword.audio.audio_utils import save_clip

    print("micro-wake-word available")
except Exception:
    print("micro-wake-word not available")

print("")
EOF

echo -e "===== Testing piper-sample-generator =====\n"

# Resolve the generator under DATA_DIR instead of the caller's current
# directory, so the check works wherever the script is launched from.
if "${DATA_DIR}/tools/piper-sample-generator/generate_samples.py" --help &>/dev/null ; then
  echo "piper-sample-generator available"
else
  echo "piper-sample-generator not available"
fi

echo
echo -e "\n===== Python Environment Testing Complete =====\n"
|
||||
125
cli/train_wake_word
Executable file
125
cli/train_wake_word
Executable file
@@ -0,0 +1,125 @@
|
||||
#!/bin/bash
# Generate samples for a wake word, augment them and train a model.
# This first section handles the command line, activates the Python virtual
# environment and makes sure the work directory exists.
set -e

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# KNOWN_ARGS is set before shell.functions is sourced.  The script then reads
# POSITIONAL_ARGS, UNKNOWN_ARGS, HELP, DATA_DIR and the DEFAULT_* values
# without defining them itself.
# NOTE(review): presumably shell.functions parses the command line and
# provides those names — confirm.
KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir )
source "${PROGDIR}/shell.functions"
WAKE_WORD=${POSITIONAL_ARGS[0]}

# Print the help text on stderr.
show_usage() {
  cat >&2 <<EOF
Usage: train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
[ --training-steps=<steps> ] [ --cleanup-work-dir ]
<wake_word> [ <wake_word_title> ]

Options:
--samples: The number of samples to generate for the wake word.
Default: ${DEFAULT_SAMPLES}

--batch-size: How many samples should be generated at a time. The more
samples per batch, the more memory is needed.
Default: ${DEFAULT_BATCH_SIZE}

--training-steps: Number of training steps. More training steps means better
detection and false positive rates but also more time to train.
Default: ${DEFAULT_TRAINING_STEPS}

--cleanup-work-dir: Delete the /data/work directory after successful training.
Default: false

<wake_word> The word to train spelled phonetically.
Required.

<wake_word_title> An optional pretty name to save to the json metadata file.
Default: The wake word with individual words capitalized
and punctuation removed.

EOF
}

# Unrecognized options are reported and then treated like a help request.
if (( ${#UNKNOWN_ARGS[@]} > 0 )) ; then
  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
  HELP=true
fi

# Help was requested (or forced above), or the mandatory wake word is missing.
if [[ "${HELP}" == "true" || -z "${WAKE_WORD}" ]] ; then
  show_usage
  exit 1
fi

# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"

cd "${DATA_DIR}"
mkdir -p "${DATA_DIR}/work" || true
|
||||
|
||||
#######################################
# Decide the display title that is passed on to the trainer.
# The original test used ${#POSITIONAL_ARGS}, which is the LENGTH OF THE
# FIRST ELEMENT (the wake word), not the number of arguments, so a title
# given on the command line was only honoured for 2-character wake words.
# Globals:
#   WAKE_WORD_TITLE - honoured if already set by the caller; always set on return
# Arguments:
#   $1 - the wake word
#   $2 - optional title; an explicitly empty title selects the raw wake word
#######################################
resolve_wake_word_title() {
  local wake_word="${1}"
  local -a words

  if [ "$#" -ge 2 ] ; then
    WAKE_WORD_TITLE="${2}"
  fi

  if [ ! -v WAKE_WORD_TITLE ] ; then
    # Every non-alphanumeric character becomes a space, the result is split
    # into words, and each word gets its first letter capitalized.
    # shellcheck disable=SC2206 # word splitting is the intent here
    words=( ${wake_word//[^a-zA-Z0-9]/ } )
    WAKE_WORD_TITLE="${words[*]^}"
  elif [ -z "${WAKE_WORD_TITLE}" ] ; then
    WAKE_WORD_TITLE="${wake_word}"
  fi
}

resolve_wake_word_title "${WAKE_WORD}" "${POSITIONAL_ARGS[@]:1}"
|
||||
|
||||
# ----- Banner ---------------------------------------------------------------
# An 80 column rule of '=' characters followed by the run description.
printf '=%.0s' {1..80}
printf '\n'
echo "===== Running '${WAKE_WORD}(${WAKE_WORD_TITLE})' generation, augmentation and training ====="
"${PROGDIR}/cudainfo"
echo
START_TS=$EPOCHSECONDS

# ----- Environment exported to the generator/augmenter/trainer --------------
export TF_CPP_MIN_LOG_LEVEL=9 \
  TF_FORCE_GPU_ALLOW_GROWTH=true \
  TF_GPU_ALLOCATOR=cuda_malloc_async \
  TF_XLA_FLAGS="--tf_xla_auto_jit=0" \
  NVIDIA_TF32_OVERRIDE=1 \
  TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512 \
  GLOG_minloglevel=2 \
  GRPC_VERBOSITY=ERROR


# ----- Step 1: sample generation --------------------------------------------
"${PROGDIR}/wake_word_sample_generator" \
  --samples=${SAMPLES} \
  --batch-size=${BATCH_SIZE} \
  --data-dir="${DATA_DIR}" "${WAKE_WORD}"

POST_GEN_TS=$EPOCHSECONDS

ww="${WAKE_WORD// /_}"
ww="${ww//./}"

# ----- Step 2: augmentation, only when needed -------------------------------
GENERATED_DIR="${DATA_DIR}/work/wake_word_samples"
AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented"

# Augmentation is skipped only when the augmented directory already exists
# AND the first generated sample is not newer than the testing mmap marker.
AUGMENT=true
if [[ -d "${AUGMENTED_DIR}" \
  && ! "${GENERATED_DIR}/0.wav" -nt "${AUGMENTED_DIR}/testing/wakeword_mmap/data.ninja" ]] ; then
  AUGMENT=false
fi

if [[ "${AUGMENT}" == "true" ]] ; then
  rm -rf "${AUGMENTED_DIR}" || true
  mkdir -p "${AUGMENTED_DIR}" || true
  if ! "${PROGDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" ; then
    # Remove partial output so the next run augments again.
    rm -rf "${AUGMENTED_DIR}"
    exit 1
  fi
else
  echo "Augmentation not required"
  echo
fi
|
||||
|
||||
POST_AUGMENT_TS=$EPOCHSECONDS

# ----- Step 3: training -----------------------------------------------------
"${PROGDIR}/wake_word_sample_trainer" \
  --samples="${SAMPLES}" \
  --training-steps="${TRAINING_STEPS}" \
  --data-dir="${DATA_DIR}" \
  "${WAKE_WORD}" "${WAKE_WORD_TITLE}"

# Compare against the literal "true" rather than executing the variable as a
# command: an empty or unset value used to count as success and would have
# deleted the work directory contents.
if [ "${CLEANUP_WORK_DIR:-false}" == "true" ] ; then
  rm -rf "${DATA_DIR}/work/trained_models" "${DATA_DIR}/work/wake_word_samples" \
    "${DATA_DIR}/work/wake_word_samples_augmented" "${DATA_DIR}/work/last_wake_word" || :
fi
END_TS=$EPOCHSECONDS

# ----- Summary --------------------------------------------------------------
python -c $'print(f"{\'=\' * 80}")'
printf "%44s\n\n" "Training Summary"
"${PROGDIR}/system_summary"
echo
print_elapsed_time --no-separators "${START_TS}" "${POST_GEN_TS}" "Generate ${SAMPLES} samples, ${BATCH_SIZE}/batch"
print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augment ${SAMPLES} samples"
print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps"
# A 54 character rule, right-aligned to the 80 column width, above the total.
python -c $'msg="="*54 ; print(f"{msg:>80s}")'
print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total"
python -c $'print(f"{\'=\' * 80}")'
|
||||
215
cli/wake_word_sample_augmenter
Executable file
215
cli/wake_word_sample_augmenter
Executable file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import sys, os, gc, glob, random
|
||||
import types, shutil, json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from argparse import ArgumentParser as ArgParser, ArgumentError
|
||||
|
||||
# The current directory is the default data directory when it carries the
# ".mww-data-dir" marker file; otherwise "/data" is used.
if os.path.exists(".mww-data-dir"):
    default_data_dir = os.getcwd()
else:
    default_data_dir = "/data"

parser = ArgParser(exit_on_error=False)
parser.add_argument(
    "--data-dir",
    type=str,
    required=False,
    default=default_data_dir,
    help=f"Data directory. Default: {default_data_dir}",
)
# The remaining options are all optional directories without a static default.
for _flag, _help in (
    ("--input-dir", "Sample input directory. Default: <data-dir>/work/wake_word_samples"),
    ("--output-dir", "Sample output directory. Default: <input-dir>_augmented"),
    ("--mit-rirs-16k-dir", "MIT RIR input directory. Default: <data-dir>/training_datasets/mit_rirs_16k"),
    ("--fma-16k-dir", "FMA input directory. Default: <data-dir>/training_datasets/fma_16k"),
    ("--audioset-16k-dir", "Audioset input directory. Default: <data-dir>/training_datasets/audioset_16k"),
):
    parser.add_argument(_flag, type=str, help=_help, required=False)

try:
    args = parser.parse_args()
except ArgumentError:
    parser.print_help()
    sys.exit(1)


def _given_or_default(given, fallback):
    """Return the real path of `given`, or `fallback` unchanged when `given` is empty."""
    if given:
        return os.path.realpath(given)
    return fallback


args.data_dir = os.path.realpath(args.data_dir)
work_dir = args.data_dir + "/work"
datasets_dir = args.data_dir + "/training_datasets"

# Order matters: the output default is derived from the resolved input dir.
args.input_dir = _given_or_default(args.input_dir, work_dir + "/wake_word_samples")
args.output_dir = _given_or_default(args.output_dir, args.input_dir + "_augmented")
args.mit_rirs_16k_dir = _given_or_default(args.mit_rirs_16k_dir, datasets_dir + "/mit_rirs_16k")
args.fma_16k_dir = _given_or_default(args.fma_16k_dir, datasets_dir + "/fma_16k")
args.audioset_16k_dir = _given_or_default(args.audioset_16k_dir, datasets_dir + "/audioset_16k")

# The output directory is created here (its parent must already exist).
out_path = Path(args.output_dir)
out_path.mkdir(exist_ok=True)
|
||||
|
||||
def validate_directories(paths):
    """Check that every entry of `paths` is an existing directory.

    Each offending path is reported on stdout, so one run lists everything
    that is missing rather than only the first problem.  A path that exists
    but is not a directory (e.g. a regular file) is rejected as well.

    Args:
        paths: iterable of directory paths to check.

    Returns:
        True when all paths are directories (or `paths` is empty), else False.
    """
    all_present = True
    for path in paths:
        if not os.path.isdir(path):
            print(f"Error: Directory {path} does not exist. Please ensure preprocessing is complete.")
            all_present = False
    return all_present
|
||||
|
||||
# Every directory the run depends on must be present before any heavy
# library is loaded; otherwise show the help text and stop.
paths = [
    work_dir,
    args.input_dir,
    args.output_dir,
    args.mit_rirs_16k_dir,
    args.fma_16k_dir,
    args.audioset_16k_dir,
]
if not validate_directories(paths):
    parser.print_help()
    sys.exit(1)

# The number of input WAV files is what the banners and summary report.
files = glob.glob(f"{args.input_dir}/*.wav")
max_samples = len(files)
if max_samples == 0:
    raise RuntimeError("❌ No WAVs in wake_word_samples.")

print(f"\n===== Augmenting {max_samples} wake word samples =====")
|
||||
|
||||
print(" Initializing libraries")

# These must be in the environment before TensorFlow is imported below.
os.environ.update({
    "TF_CPP_MIN_LOG_LEVEL": "3",
    "TF_FORCE_GPU_ALLOW_GROWTH": "true",
    "TF_GPU_ALLOCATOR": "cuda_malloc_async",
    "TF_XLA_FLAGS": "--tf_xla_auto_jit=0",
    "NVIDIA_TF32_OVERRIDE": "1",
    "TF_CUDNN_WORKSPACE_LIMIT_IN_MB": "512",
    "GLOG_minloglevel": "9",
    "GRPC_VERBOSITY": "ERROR",
})

print(" Loading Tensorflow")
import tensorflow as tf

print(" GPU memory config")
# Memory growth is also requested per device, in addition to the
# TF_FORCE_GPU_ALLOW_GROWTH variable above.  Failures are deliberately
# ignored: this is best-effort.
for gpu_device in tf.config.list_physical_devices("GPU"):
    try:
        tf.config.experimental.set_memory_growth(gpu_device, True)
    except Exception:
        continue
print(f" GPUs: {tf.config.list_physical_devices('GPU')}")
gc.collect()
|
||||
|
||||
import numpy as np
|
||||
import librosa
|
||||
from mmap_ninja.ragged import RaggedMmap
|
||||
from microwakeword.audio.augmentation import Augmentation
|
||||
from microwakeword.audio.clips import Clips
|
||||
from microwakeword.audio.spectrograms import SpectrogramGeneration
|
||||
from microwakeword.audio.audio_utils import save_clip
|
||||
|
||||
# Wall-clock start (whole seconds, UTC) for the elapsed-time summary.
START_TIME = datetime.now(timezone.utc).replace(microsecond=0)

# Room impulse responses and background noise used by the augmenter.
impulse_paths = [args.mit_rirs_16k_dir]
background_paths = [args.fma_16k_dir, args.audioset_16k_dir]

clip_settings = {
    "input_directory": args.input_dir,
    "file_pattern": '*.wav',
    "max_clip_duration_s": 5,
    "remove_silence": True,
    "random_split_seed": 10,
    "split_count": 0.1,
}
clips = Clips(**clip_settings)

# Probability with which each augmentation is applied to a clip.
augmentation_probabilities = {
    "SevenBandParametricEQ": 0.1,
    "TanhDistortion": 0.05,
    "PitchShift": 0.15,
    "BandStopFilter": 0.1,
    "AddColorNoise": 0.1,
    "AddBackgroundNoise": 0.7,
    "Gain": 0.8,
    "RIR": 0.7,
}

augmenter = Augmentation(
    augmentation_duration_s=3.2,
    augmentation_probabilities=augmentation_probabilities,
    impulse_paths=impulse_paths,
    background_paths=background_paths,
    background_min_snr_db=5,
    background_max_snr_db=10,
    min_jitter_s=0.2,
    max_jitter_s=0.3,
)
|
||||
|
||||
# Augment samples and save the training, validation, and testing sets.
|
||||
|
||||
def audio_generator_from_wavs(self, split="train", repeat=1):
    """Yield the WAV files of one dataset split as 1-D float32 arrays.

    The files matching <input_dir>/*.wav are sorted, then shuffled with a
    fixed seed (10) so the partition is the same on every call.  About 10%
    of the files (at least one) go to "validation", the same number to
    "test", and whatever is left to "train".

    Args:
        self: the object this function is bound to as a method; not used.
        split: "train", "validation" or "test"; any other name yields nothing.
        repeat: number of passes over the split (values below 1 count as 1).

    Yields:
        Audio loaded by librosa at 16 kHz mono, as float32.

    Raises:
        RuntimeError: when the input directory has no WAV files (raised on
            first iteration, as this is a generator).
    """
    wav_files = sorted(glob.glob(args.input_dir + "/*.wav"))
    if len(wav_files) == 0:
        raise RuntimeError("❌ No WAVs in wake_word_samples.")

    # Fixed seed: deterministic shuffling.
    shuffled = list(wav_files)
    random.Random(10).shuffle(shuffled)

    total = len(shuffled)
    val_count = max(1, int(0.10 * total))
    test_count = max(1, int(0.10 * total))
    train_count = max(0, total - val_count - test_count)
    val_end = train_count + val_count

    if split == "train":
        selected = shuffled[:train_count]
    elif split == "validation":
        selected = shuffled[train_count:val_end]
    elif split == "test":
        selected = shuffled[val_end:]
    else:
        selected = []

    if not selected:
        return  # nothing to yield

    passes = max(1, int(repeat))
    for _pass in range(passes):
        for wav_file in selected:
            samples, _rate = librosa.load(wav_file, sr=16000, mono=True)
            yield samples.astype(np.float32, copy=False)
|
||||
|
||||
# Replace the Clips instance's audio_generator with the WAV-backed generator.
clips.audio_generator = types.MethodType(audio_generator_from_wavs, clips)

# One entry per output folder: which split feeds it, how many passes are made
# over that split, and the slide_frames value used for its spectrograms.
split_cfg = {
    "training": {"name": "train", "repetition": 2, "slide_frames": 10},
    "validation": {"name": "validation", "repetition": 1, "slide_frames": 10},
    "testing": {"name": "test", "repetition": 1, "slide_frames": 1},
}

# Write <output-dir>/<folder>/wakeword_mmap for every configured split.
for folder_name in split_cfg:
    settings = split_cfg[folder_name]
    target_dir = out_path / folder_name
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f" Augmenting {folder_name}")

    print(" Generating spectrograms")
    spectrograms = SpectrogramGeneration(
        clips=clips,
        augmenter=augmenter,
        slide_frames=settings["slide_frames"],
        step_ms=10,
    )
    feature_stream = spectrograms.spectrogram_generator(
        split=settings["name"],
        repeat=settings["repetition"],
    )

    print(" Generating files")
    RaggedMmap.from_generator(
        out_dir=str(target_dir / "wakeword_mmap"),
        sample_generator=feature_stream,
        batch_size=100,
        verbose=False,
    )
    print(f" {folder_name} augmentation complete")

# Closing banner with the elapsed wall-clock time.
END_TIME = datetime.now(timezone.utc).replace(microsecond=0)
et = END_TIME - START_TIME
rule = "=" * 80
msg = f"Augmented {max_samples} wake word samples."
print("\n" + rule)
print(msg.rjust(50) + " Elapsed time: " + str(et))
print(rule + "\n")
|
||||
112
cli/wake_word_sample_generator
Executable file
112
cli/wake_word_sample_generator
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/bin/bash
# Generate wake word samples with piper-sample-generator.
# This first section handles the command line.
set -e

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# KNOWN_ARGS is set before shell.functions is sourced.  The script then reads
# POSITIONAL_ARGS, UNKNOWN_ARGS, HELP, SAMPLES, BATCH_SIZE, DATA_DIR and the
# DEFAULT_* values without defining them itself.
# NOTE(review): presumably shell.functions parses the command line and
# provides those names — confirm.
KNOWN_ARGS=( samples batch-size data-dir )
source "${PROGDIR}/shell.functions"
WAKE_WORD="${POSITIONAL_ARGS[0]}"

# Print the help text on stderr.
show_usage() {
  cat >&2 <<EOF
Usage: $0 [ --samples=<samples> ] [ --batch-size=<batch_size> ] <wake_word>

--samples: The number of samples to generate for the wake word.
Default: ${DEFAULT_SAMPLES}

--batch-size: How many samples should be generated at a time. The more
samples, the more memory is needed.
Default: ${DEFAULT_BATCH_SIZE}

<wake_word> The word to generate samples for.
Required.

EOF
}

# Unrecognized options are reported and then treated like a help request.
if (( ${#UNKNOWN_ARGS[@]} > 0 )) ; then
  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
  HELP=true
fi

# Help was requested (or forced above), or the mandatory wake word is missing.
if [[ "${HELP}" == "true" || -z "${WAKE_WORD}" ]] ; then
  show_usage
  exit 1
fi
|
||||
|
||||
# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"

# ----- Locations ------------------------------------------------------------
WORK_DIR="${DATA_DIR}/work"
mkdir -p "${WORK_DIR}" || true
cd "${WORK_DIR}"

PSG="${DATA_DIR}/tools/piper-sample-generator"
MODELS_DIR="${PSG}/models"
MODEL_NAME=en_US-libritts_r-medium.pt
MODEL_FILE="${MODELS_DIR}/${MODEL_NAME}"
SAMPLES_DIR="${WORK_DIR}/wake_word_samples"

mkdir -p "${SAMPLES_DIR}" || true

REGENERATE=false

# ----- Single-sample mode ---------------------------------------------------
# With --samples=1 a single audition clip is written to work/test_sample,
# named after the wake word, and the script stops; the training sample set is
# left untouched.
if [ "${SAMPLES}" -eq 1 ] ; then
  echo "===== Generating ${SAMPLES} sample of '${WAKE_WORD}' ====="
  # Shell and filename metacharacters in the wake word become underscores.
  wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
  test_sample_dir="${WORK_DIR}/test_sample"
  test_sample_file="${test_sample_dir}/${wake_word_filename}.wav"

  mkdir -p "${test_sample_dir}" || true
  single_sample_args=(
    --model "${MODEL_FILE}"
    --max-samples ${SAMPLES}
    --batch-size ${BATCH_SIZE}
    --output-dir "${test_sample_dir}"
    --max-speakers 100
  )
  # Generator log prefixes are replaced with a space.
  "${PSG}/generate_samples.py" "${WAKE_WORD}" "${single_sample_args[@]}" 2>&1 \
    | sed -r -e "s/(DEBUG|INFO):__main__:/ /g"
  mv "${test_sample_dir}/0.wav" "${test_sample_file}"
  echo "Sample available at ${test_sample_file}"
  echo "Play it from your host."
  exit 0
fi
|
||||
|
||||
#######################################
# Tell whether a record file holds exactly the given generation request.
# The request is compared as a fixed string against whole lines.  A plain
# "grep -q" treated the wake word as an unanchored regular expression, so
# "jarvis" matched a recorded "hey jarvis:..." and stale samples were reused.
# Arguments:
#   $1 - request line, "<wake word>:<samples>:<model name>"
#   $2 - path of the record file (may be missing)
# Returns:
#   0 when the file contains the request as a complete line, non-zero otherwise.
#######################################
last_request_matches() {
  local request="${1}"
  local record_file="${2}"

  [ -f "${record_file}" ] || return 1
  grep -qxF -- "${request}" "${record_file}" &>/dev/null
}

last_request_matches "${WAKE_WORD}:${SAMPLES}:${MODEL_NAME}" "${WORK_DIR}/last_wake_word" || REGENERATE=true

# Double check that the number of existing samples matches SAMPLES
existing_samples=$(find "${SAMPLES_DIR}" -name '*.wav' | wc -l)
[ "${existing_samples}" -eq "${SAMPLES}" ] || REGENERATE=true

START_TS=$EPOCHSECONDS

if ! ${REGENERATE} ; then
  echo "Sample generation not required"
  echo
  exit 0
fi
|
||||
|
||||
echo -e "\n===== Generating ${SAMPLES} wake word samples in batches of ${BATCH_SIZE} ====="
# Environment exported to the generator process.
export TF_CPP_MIN_LOG_LEVEL=9
export TF_FORCE_GPU_ALLOW_GROWTH=true
export TF_GPU_ALLOCATOR=cuda_malloc_async
export TF_XLA_FLAGS="--tf_xla_auto_jit=0"
export NVIDIA_TF32_OVERRIDE=1
export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
export GLOG_minloglevel=2
export GRPC_VERBOSITY=ERROR

echo " Generating samples"
# Start from an empty directory so the file count below is meaningful.
rm -rf "${SAMPLES_DIR}" || :
mkdir -p "${SAMPLES_DIR}" || :
"${PSG}/generate_samples.py" "${WAKE_WORD}" \
  --model "${MODEL_FILE}" \
  --max-samples "${SAMPLES}" \
  --batch-size "${BATCH_SIZE}" \
  --output-dir "${SAMPLES_DIR}" 2>&1 | sed -r -e "s/(DEBUG|INFO):__main__:/ /g"

# The pipeline's status is sed's, so success is judged by counting the WAV
# files that were actually produced.
generated_files=$(find "${SAMPLES_DIR}" -name '*.wav' | wc -l)
if [ "${generated_files}" -ne "${SAMPLES}" ] ; then
  echo "ERROR: only generated ${generated_files} files" >&2
  exit 1
fi
# Record what was generated so an identical request can be skipped next time.
echo "${WAKE_WORD}:${SAMPLES}:${MODEL_NAME}" > "${WORK_DIR}/last_wake_word"
echo
# The earlier "END_TS=$(date +%s.%N)" was a dead store, overwritten here
# before use (and in a different unit), so it has been removed.
END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Generated ${SAMPLES} wake word samples."

exit 0
|
||||
241
cli/wake_word_sample_trainer
Executable file
241
cli/wake_word_sample_trainer
Executable file
@@ -0,0 +1,241 @@
|
||||
#!/bin/bash
# Train a wake word model from previously generated and augmented samples.
# This first section handles the command line and derives the work and
# dataset directories.
set -e

PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# KNOWN_ARGS is set before shell.functions is sourced.  The script then reads
# POSITIONAL_ARGS, UNKNOWN_ARGS, HELP, SAMPLES, TRAINING_STEPS, DATA_DIR and
# DEFAULT_TRAINING_STEPS without defining them itself.
# NOTE(review): presumably shell.functions parses the command line and
# provides those names — confirm.
KNOWN_ARGS=( training-steps samples data-dir )
source "${PROGDIR}/shell.functions"
WAKE_WORD="${POSITIONAL_ARGS[0]}"

# Print the help text on stderr.
show_usage() {
  cat >&2 <<EOF
Usage: $0 [ --samples=<samples> ] [ --training-steps=<steps> ]
<wake_word> [ <wake_word_title> ]

$0 -h/--help

--samples: The number of samples to generate for the wake word.
Used only to generate output file names.

--training-steps: Number of training steps.
Default: ${DEFAULT_TRAINING_STEPS}

<wake_word>: The word to train spelled phonetically.
Required.

<wake_word_title>: A pretty name to save to the json metadata file.
Default: The wake word with individual words capitalized.

EOF
}

# Unrecognized options are reported and then treated like a help request.
if (( ${#UNKNOWN_ARGS[@]} > 0 )) ; then
  echo "Unknown argument(s): ${UNKNOWN_ARGS[*]}" >&2
  HELP=true
fi

# Help was requested (or forced above), or the mandatory wake word is missing.
if [[ "${HELP}" == "true" || -z "${WAKE_WORD}" ]] ; then
  show_usage
  exit 1
fi

WORK_DIR="${DATA_DIR}/work"
TRAINING_DS="${DATA_DIR}/training_datasets"
|
||||
|
||||
#######################################
# Decide the display title written to the json metadata file.
# The original test used ${#POSITIONAL_ARGS}, which is the LENGTH OF THE
# FIRST ELEMENT (the wake word), not the number of arguments, so a title
# given on the command line was only honoured for 2-character wake words.
# Globals:
#   WAKE_WORD_TITLE - honoured if already set by the caller; always set on return
# Arguments:
#   $1 - the wake word
#   $2 - optional title; an explicitly empty title selects the raw wake word
#######################################
resolve_wake_word_title() {
  local wake_word="${1}"
  local -a words

  if [ "$#" -ge 2 ] ; then
    WAKE_WORD_TITLE="${2}"
  fi

  if [ ! -v WAKE_WORD_TITLE ] ; then
    # Every non-alphanumeric character becomes a space, the result is split
    # into words, and each word gets its first letter capitalized.
    # shellcheck disable=SC2206 # word splitting is the intent here
    words=( ${wake_word//[^a-zA-Z0-9]/ } )
    WAKE_WORD_TITLE="${words[*]^}"
  elif [ -z "${WAKE_WORD_TITLE}" ] ; then
    WAKE_WORD_TITLE="${wake_word}"
  fi
}

resolve_wake_word_title "${WAKE_WORD}" "${POSITIONAL_ARGS[@]:1}"

# shellcheck source=/dev/null
source "${DATA_DIR}/.venv/bin/activate"
|
||||
|
||||
#######################################
# Verify that every argument is an existing directory.
# All missing directories are reported before the script is terminated, so a
# single run shows everything that has to be fixed.
# Arguments:
#   $@ - directories to check
# Outputs:
#   One "ERROR: Directory ... not found" line on stderr per missing directory.
# Returns:
#   0 when all directories exist; otherwise EXITS the shell with status 1.
#######################################
check_directories() {
  # Locals keep the loop variable from leaking into the caller's scope.
  local d
  local missing=0

  for d in "$@" ; do
    if [ ! -d "$d" ] ; then
      echo "ERROR: Directory $d not found" >&2
      missing=1
    fi
  done

  [ "${missing}" -eq 0 ] || exit 1
}
|
||||
|
||||
# The augmented positive samples and all four negative datasets must exist.
# The variable parts are quoted so a data directory containing spaces or glob
# characters is passed through intact; the brace list stays outside the
# quotes so it still expands into four separate arguments.
check_directories "${WORK_DIR}/wake_word_samples_augmented" \
  "${TRAINING_DS}/negative_datasets/"{speech,dinner_party,no_speech,dinner_party_eval}

cd "${WORK_DIR}"
|
||||
|
||||
echo "===== Starting ${TRAINING_STEPS} training steps ====="

START_TS=$EPOCHSECONDS

# ----- Training configuration -----------------------------------------------
# The YAML below lists the augmented wake word samples as the only "truth"
# feature set, followed by four negative datasets.  Paths and the step count
# are filled in from this script's variables.
models_dir="${WORK_DIR}/trained_models"
mkdir -p "${models_dir}" || true
cat > "${models_dir}/training_parameters.yaml" <<EOF
batch_size: 16
clip_duration_ms: 1500
eval_step_interval: 500
features:
- features_dir: ${WORK_DIR}/wake_word_samples_augmented
  penalty_weight: 1.0
  sampling_weight: 2.0
  truncation_strategy: truncate_start
  truth: true
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/speech
  penalty_weight: 1.0
  sampling_weight: 12.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party
  penalty_weight: 1.0
  sampling_weight: 12.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/no_speech
  penalty_weight: 1.0
  sampling_weight: 5.0
  truncation_strategy: random
  truth: false
  type: mmap
- features_dir: ${TRAINING_DS}/negative_datasets/dinner_party_eval
  penalty_weight: 1.0
  sampling_weight: 0.0
  truncation_strategy: split
  truth: false
  type: mmap
freq_mask_count:
- 0
freq_mask_max_size:
- 0
learning_rates:
- 0.001
maximization_metric: average_viable_recall
minimization_metric: null
negative_class_weight:
- 20
positive_class_weight:
- 1
target_minimization: 0.9
time_mask_count:
- 0
time_mask_max_size:
- 0
train_dir: ${WORK_DIR}/trained_models/wakeword
training_steps:
- ${TRAINING_STEPS}
window_step_ms: 10

EOF

echo " Wrote training_parameters.yaml"
# Discard the output of any previous training run.
rm -rf "${models_dir}/wakeword"
|
||||
|
||||
# ----- Environment exported to the training process -------------------------
export TF_CPP_MIN_LOG_LEVEL=9 \
  TF_FORCE_GPU_ALLOW_GROWTH=true \
  TF_GPU_ALLOCATOR=cuda_malloc_async \
  TF_XLA_FLAGS="--tf_xla_auto_jit=0" \
  NVIDIA_TF32_OVERRIDE=1 \
  TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512 \
  GLOG_minloglevel=9 \
  GRPC_VERBOSITY=ERROR

echo " Loading Tensorflow"

# Shell and filename metacharacters in the wake word become underscores.
wake_word_filename="${WAKE_WORD//[ \`~\!\$&*\(\)\{\}\[\]\|\;\'\"<>.?\/]/_}"
OUTPUT_DIR="${DATA_DIR}/output/$(date +'%Y-%m-%d-%H-%M-%S')-${wake_word_filename}-${SAMPLES}-${TRAINING_STEPS}"
mkdir -p "${OUTPUT_DIR}/logs" || true

# Command line handed (via sys.argv) to microwakeword.model_train_eval, which
# the Python stub below runs as __main__.
trainer_args=(
  --training_config="${WORK_DIR}/trained_models/training_parameters.yaml"
  --train 1
  --restore_checkpoint 1
  --test_tf_nonstreaming 0
  --test_tflite_nonstreaming 0
  --test_tflite_nonstreaming_quantized 0
  --test_tflite_streaming 0
  --test_tflite_streaming_quantized 1
  --use_weights "best_weights"
  mixednet
  --pointwise_filters "64,64,64,64"
  --repeat_in_block "1,1,1,1"
  --mixconv_kernel_sizes "[5], [7,11], [9,15], [23]"
  --residual_connection "0,0,0,0"
  --first_conv_filters 32
  --first_conv_kernel_size 5
  --stride 2
)

# Output pipeline:
#   1. stdout and stderr are merged and carriage returns become newlines;
#   2. "Validation Batch" lines are dropped;
#   3. the result is saved as the full training log;
#   4. the console only shows "INFO:absl:" lines, minus a few noisy ones,
#      with the prefix removed and long metric lines wrapped.
python - "${trainer_args[@]}" <<'EOF' 2>&1 \
  | tr '\r' '\n' \
  | stdbuf -i0 -o0 sed -r -e "/^Validation Batch/d" \
  | tee "${OUTPUT_DIR}/logs/training.log" \
  | sed -r -e '/^INFO:absl:/!d' \
      -e "/None|Sharding|unsupported characters|AUC|fingerprint/d" \
      -e 's/INFO:absl:/ /g' \
      -e "s/, (recall =|estimated false|average viable recall)/,\n \1/g"

import sys, os, gc
import runpy
import yaml
print(" Loading Tensorflow")
import tensorflow as tf

print(" GPU memory config")
# Per-device memory growth (belt + suspenders)
for g in tf.config.list_physical_devices("GPU"):
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception:
        pass
print(f"INFO:absl:GPUs: {tf.config.list_physical_devices('GPU')}")
gc.collect()

print()
try:
    runpy.run_module("microwakeword.model_train_eval", run_name="__main__", alter_sys=True)
except Exception as e:
    print(e, file=sys.stderr)
    sys.exit(1)

EOF
|
||||
|
||||
# ----- Collect the results --------------------------------------------------
source_path="${WORK_DIR}/trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite"

# The training pipeline's exit status is that of its last filter, so success
# is judged by the presence of the quantized streaming model.
if [ ! -f "${source_path}" ] ; then
  # Point at the file the training output was actually written to.
  echo "Output model not found! Training didn't complete successfully. See ${OUTPUT_DIR}/logs/training.log" >&2
  exit 1
fi

cp "${WORK_DIR}/trained_models/wakeword/model_summary.txt" "${OUTPUT_DIR}/logs/"
cp -a "${WORK_DIR}/trained_models/wakeword/logs/train" "${OUTPUT_DIR}/logs/"
cp -a "${WORK_DIR}/trained_models/wakeword/logs/validation" "${OUTPUT_DIR}/logs/"

echo -e "\n Training complete!"
echo " Full log: ${OUTPUT_DIR}/logs/training.log"

tflite_filename="${wake_word_filename}.tflite"
tflite_path="${OUTPUT_DIR}/${tflite_filename}"

cp "${source_path}" "${tflite_path}"

# --- Write JSON metadata file with matching model name ---
json_path="${OUTPUT_DIR}/${wake_word_filename}.json"
# The title is free text from the command line: escape backslashes and double
# quotes so it cannot break the JSON document.  (The model file name needs no
# escaping; those characters were already replaced when it was derived.)
json_title=${WAKE_WORD_TITLE//\\/\\\\}
json_title=${json_title//\"/\\\"}
cat <<-EOF > "${json_path}"
{
"type": "micro",
"wake_word": "${json_title}",
"author": "Tater Totterson",
"website": "https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git",
"model": "${tflite_filename}",
"trained_languages": ["en"],
"version": 2,
"micro": {
"probability_cutoff": 0.97,
"sliding_window_size": 5,
"feature_step_size": 10,
"tensor_arena_size": 30000,
"minimum_esphome_version": "2024.7.0"
}
}
EOF

echo "Name: ${WAKE_WORD_TITLE}"
echo "Model: ${tflite_path}"
echo "Metadata: ${json_path}"
echo
END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Training completed."
echo
|
||||
|
||||
Reference in New Issue
Block a user