mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
cli + web recorder ui
This commit is contained in:
201
LICENSE
201
LICENSE
@@ -1,201 +0,0 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
540
README.md
540
README.md
@@ -1,123 +1,507 @@
|
||||
<div align="center">
|
||||
<img src="https://raw.githubusercontent.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker/refs/heads/main/mmw.png" alt="MicroWakeWord Trainer Logo" width="100" />
|
||||
<h1>microWakeWord Trainer Docker</h1>
|
||||
</div>
|
||||
# Run training from the command line
|
||||
|
||||
# 🥔 MicroWakeWord Trainer – Tater Approved
|
||||
## Overview
|
||||
|
||||
**✅ Tater Totterson tested & working on an NVIDIA RTX 3070 Laptop GPU (8 GB VRAM).**
|
||||
Easily train microWakeWord detection models with this pre-built Docker image and JupyterLab notebook.
|
||||
With these scripts and Dockerfile, you can train new wake words from the
|
||||
command line without using a Jupyter notebook.
|
||||
|
||||
---
|
||||
Differences between this Docker image and the Jupyter notebook image:
|
||||
|
||||
## 🚀 Quick Start
|
||||
* The Python training environment isn't included in the image. Instead, a
|
||||
"virtual environment" (venv) is created in the `/data` directory which you
|
||||
will have mounted to a host directory. This cuts about 7gb from the image
|
||||
and allows the virtualenv to persist across container instances.
|
||||
|
||||
Follow these steps to get up and running:
|
||||
* The logic from the Jupyter notebook is contained in individual Python
|
||||
and shell scripts
|
||||
|
||||
### 1️⃣ Pull the Pre-Built Docker Image
|
||||
* No ports need to be exposed since the Jupyter notebook server isn't being
|
||||
run.
|
||||
|
||||
```bash
|
||||
docker pull ghcr.io/tatertotterson/microwakeword:latest
|
||||
## TL;DR
|
||||
|
||||
For the impatient among you...
|
||||
|
||||
```shell
|
||||
$ mkdir /some/work/directory # On a device with more than 150GB free space
|
||||
$ docker build -t microwakeword-cli:latest .
|
||||
$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# setup_python_venv
|
||||
##### You have about 4 minutes to drink coffee
|
||||
|
||||
root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
|
||||
##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
|
||||
|
||||
root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
|
||||
##### You have about 30-45 minutes for a nap depending on available system resources.
|
||||
##### You'll be informed of where to find your trained model.
|
||||
```
|
||||
|
||||
---
|
||||
Load the trained model on your device and give it a try but don't be surprised
|
||||
if you get a lot of missed or false activations. Read on to find out why.
|
||||
|
||||
### 2️⃣ Run the Docker Container
|
||||
## Get Started
|
||||
|
||||
```bash
|
||||
docker run --rm -it \
|
||||
--gpus all \
|
||||
-p 8888:8888 \
|
||||
-v $(pwd):/data \
|
||||
ghcr.io/tatertotterson/microwakeword:latest
|
||||
Good, you stuck around! Now read the rest of the document before doing
|
||||
anything.
|
||||
|
||||
### Using a GPU
|
||||
|
||||
Having an Nvidia GPU available can cut the training time by up to half. The
|
||||
open-source nouveau driver shipped with Linux kernels doesn't support CUDA
|
||||
however so if you have an Nvidia GPU and want to use it for training, you'll
|
||||
need to install the official Nvidia driver from
|
||||
https://www.nvidia.com/en-in/drivers/unix/
|
||||
|
||||
### Build the image
|
||||
|
||||
You can use either Docker or Podman as your container management tool.
|
||||
`docker` is used in the examples but if you have podman, just substitute
|
||||
the command.
|
||||
|
||||
Start by navigating to the directory that contains this README file and
|
||||
the accompanying Dockerfile. Then...
|
||||
|
||||
|
||||
```shell
|
||||
docker build -t microwakeword-cli:latest .
|
||||
```
|
||||
|
||||
**What these flags do:**
|
||||
- `--gpus all` → Enables GPU acceleration
|
||||
- `-p 8888:8888` → Exposes JupyterLab on port 8888
|
||||
- `-v $(pwd):/data` → Saves your work in the current folder
|
||||
This should be fairly quick and result in an image that's about 320mb in size
|
||||
as it's basically a standard Ubuntu 24.04 image with a few added tools.
|
||||
|
||||
---
|
||||
So why isn't a pre-built image available for download? Because it'll probably
|
||||
take longer to download a pre-built image than for you to create it locally.
|
||||
GitHub's container registry is notoriously erratic when it comes to download
|
||||
throughput.
|
||||
|
||||
### 3️⃣ Open JupyterLab
|
||||
### Create a host work directory
|
||||
|
||||
Visit [http://localhost:8888](http://localhost:8888) in your browser — the notebook UI will open.
|
||||
This directory will contain the Python virtual environment plus all of the
|
||||
downloaded and generated data needed for training and the final trained
|
||||
models. A full environment will need about 150gb of free space but read
|
||||
further to see how to reduce this.
|
||||
|
||||
---
|
||||
Your `<host_data_dir>` will be mounted inside the container as `/data`.
|
||||
|
||||
### 4️⃣ Set Your Wake Word
|
||||
The training container will start a Bash shell so if you have Bash
|
||||
aliases or Bashy things you like, create a `.bashrc` file in your
|
||||
`<host_data_dir>` and put them in there. It'll automatically be included
|
||||
any time you enter the container.
|
||||
|
||||
At the **top of the notebook**, find this line:
|
||||
### Create and start the container
|
||||
|
||||
```bash
|
||||
TARGET_WORD = "hey_tater" # Change this to your desired wake word
|
||||
There are lots of options that control container creation. The simplest example
|
||||
will create the container and give you an interactive shell. When you exit the
|
||||
shell, the container will be stopped and removed leaving your `<host_data_dir>`
|
||||
intact.
|
||||
|
||||
```shell
|
||||
$ docker run -it --rm --gpus=all -v <host_work_directory>:/data microwakeword-cli:latest
|
||||
```
|
||||
|
||||
Change `"hey_tater"` to your desired wake word (phonetic spellings often work best).
|
||||
Options:
|
||||
|
||||
---
|
||||
* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
|
||||
* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
|
||||
around and give it a name for training more than one wake word. You
|
||||
can stop and remove it when you're ready.
|
||||
* Add a `-d` option to start the container in the background and use `docker
|
||||
attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
|
||||
|
||||
### 5️⃣ Run the Notebook
|
||||
When the container starts, you'll see:
|
||||
|
||||
Run all cells in the notebook. This process will:
|
||||
- Generate wake word samples
|
||||
- Train a detection model
|
||||
- Output a quantized `.tflite` model ready for on-device use
|
||||
```text
|
||||
=======================================================
|
||||
WARNING: A python virtual environment wasn't found
|
||||
at /data/.venv. You'll need to run setup_python_venv
|
||||
before you'll be able to use this container for
|
||||
training.
|
||||
=======================================================
|
||||
root@mww-cli:/#
|
||||
```
|
||||
|
||||
---
|
||||
Don't worry about the python WARNING right now. You'll be creating the
|
||||
virtualenv in the next step.
|
||||
|
||||
### 6️⃣ Retrieve the Trained Model & JSON
|
||||
If you've forgotten to create and/or mount your host data directory, you'll
|
||||
see an additional warning:
|
||||
|
||||
When training finishes, download links for both the `.tflite` model and its `.json` manifest will be displayed in the last cell.
|
||||
```text
|
||||
=======================================================
|
||||
WARNING: The /data directory is NOT mounted.
|
||||
Running the training process without /data mounted
|
||||
could add over 140Gb of python packages and training
|
||||
files to this container's storage which is probably
|
||||
NOT what you want.
|
||||
|
||||
---
|
||||
You should remove this container and re-create it with
|
||||
a 'docker run' option like '-v <host_work_dir>:/data'
|
||||
making sure the host directory is on a device that has
|
||||
enough free space.
|
||||
=======================================================
|
||||
```
|
||||
|
||||
## 🔄 Resetting to a Clean State
|
||||
You can certainly continue but it's a "really bad idea"™ because your
|
||||
container storage could grow from a few hundred mb to over 140gb.
|
||||
|
||||
If you need to start fresh:
|
||||
At this point, you're in a Bash shell.
|
||||
|
||||
1. Delete the `data` folder that was mapped to your Docker container.
|
||||
2. Restart the container using the steps above.
|
||||
3. A fresh copy of the notebook will be placed into the `data` directory.
|
||||
### Create the Python virtual environment
|
||||
|
||||
---
|
||||
The Python virtual environment will contain all the software needed to train.
|
||||
It gets created as `/data/.venv` and will take up about 11gb of disk space.
|
||||
|
||||
## 🎤 Optional: Personal Voice Samples
|
||||
The scripts that do all the work will be in the container's PATH so to set up
|
||||
the virtual environment and install all of the packages, just run:
|
||||
|
||||
In addition to synthetic TTS samples, the trainer can optionally use your own real voice recordings to significantly improve accuracy for your voice and environment.
|
||||
```text
|
||||
setup_python_venv [ --verbose ]
|
||||
|
||||
### How it works
|
||||
- If a folder named personal_samples/ exists and contains .wav files, the trainer will:
|
||||
- Automatically extract features from those recordings
|
||||
- Include them during training alongside the synthetic TTS data
|
||||
- Up-weight your personal samples during training for better real-world performance
|
||||
Options:
|
||||
|
||||
No extra flags or configuration are required — it is detected automatically.
|
||||
--verbose: Print the detailed "pip install" output.
|
||||
|
||||
### How to use it
|
||||
1. Create a folder in the repo root:
|
||||
mkdir personal_samples
|
||||
```
|
||||
|
||||
2. Record yourself saying the wake word naturally and save the files as .wav:
|
||||
personal_samples/
|
||||
hey_tater_01.wav
|
||||
hey_tater_02.wav
|
||||
hey_tater_03.wav
|
||||
...
|
||||
When the installation is finished, a test of the major components will be
|
||||
run.
|
||||
|
||||
Once the process is done, you should change to the `/data` directory and
|
||||
activate the virtual environment with:
|
||||
|
||||
```shell
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# source .venv/bin/activate
|
||||
(.venv) root@mww-cli:/data#
|
||||
```
|
||||
|
||||
Technically, you don't need to do either of these since the scripts
|
||||
are in the PATH and they know to use the `/data` directory for everything.
|
||||
It's more of an "if you're interested" thing.
|
||||
|
||||
At this point, you have a container with all software installed.
|
||||
|
||||
## Get the reference data
|
||||
|
||||
The training process itself relies on a significant amount of audio reference
|
||||
data that creates a simulated "audio environment" that your wake word will be
|
||||
trained in. These "training datasets" include things like varying amounts of
|
||||
reverberation, background music, background conversations, background noise,
|
||||
etc. All said and done, it amounts to about 30gb of audio but with the
|
||||
downloaded archives and extracted intermediate files, you'll need about 85gb
|
||||
of free space. Thankfully, you only need to download the files once no
|
||||
matter how many wake words you want to train and since it's stored in
|
||||
`/data`, you can even remove the docker container and recreate it without
|
||||
losing any of it. There are 4 datasets that are required.
|
||||
|
||||
This is a three stage process...
|
||||
|
||||
1. Download zipfiles or tarballs. (about 30gb)
|
||||
2. Extract them. (about 50gb)
|
||||
3. Convert them into the final form. (about 31gb)
|
||||
|
||||
NOTE: The sizes add up to more than the 85gb stated earlier because one
|
||||
of the datasets doesn't need to be converted and is counted in both
|
||||
steps 2 and 3. You really do only need 85gb.
|
||||
|
||||
To download the archives, unpack them, and convert the audio to what's needed
|
||||
by the training process, run:
|
||||
|
||||
```text
|
||||
setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
|
||||
|
||||
Options:
|
||||
--cleanup-archives: Automatically delete the tarballs or zipfiles after
|
||||
they've been extracted.
|
||||
|
||||
--cleanup-intermediate-files: Automatically delete the intermediate files
|
||||
after they've been converted.
|
||||
|
||||
```
|
||||
|
||||
On a 1gb/sec Internet connection, this will take about 25 minutes.
|
||||
|
||||
The script detects if the datasets have already been downloaded, extracted
|
||||
and/or converted and skips those steps as appropriate so if you've run the
|
||||
script without the cleanup options, you can just run it again with those
|
||||
options to clean them up.
|
||||
|
||||
Now you're ready to train a wake word. Almost.
|
||||
|
||||
## Train a Wake Word
|
||||
|
||||
Training is done in 3 stages.
|
||||
|
||||
1. Generate thousands of samples of the wake word with various voices,
|
||||
pitches, speeds, inflections, etc.
|
||||
2. Augment the samples with the training datasets to add background noise, etc.
|
||||
3. Run the Tensorflow training.
|
||||
|
||||
### Generate a sample for verification
|
||||
|
||||
Before you start the full process, you're going to want to generate a single
|
||||
wake word sample and play it back to ensure it sounds right. The wake word
|
||||
should be spelled phonetically to give the sample generator the best chance
|
||||
of success.
|
||||
|
||||
```text
|
||||
root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
|
||||
===== Generating 1 sample of 'hey buster' =====
|
||||
Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
|
||||
Successfully loaded the model
|
||||
Batch 1/0 complete
|
||||
Done
|
||||
Sample available at /data/work/test_sample/hey_buster.wav
|
||||
Play it from your host.
|
||||
```
|
||||
|
||||
You should then play that file from your host. The reason I used "hey buster"
|
||||
as the wake word is to demonstrate why it's important to generate and listen
|
||||
to a sample. If you try that exact input and play it back, you'll notice
|
||||
that the generator didn't capture the "er" at the end very well. To get it to
|
||||
do so, I had to add a period on the end as a "spacer".
|
||||
"hey buster." worked much better.
|
||||
|
||||
When you're happy with the sample, you can run the full process.
|
||||
|
||||
### Run the full training process
|
||||
|
||||
```text
|
||||
train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
|
||||
[ --training-steps=<steps> ] [ --cleanup-work-dir ]
|
||||
<wake_word> [ <wake_word_title> ]
|
||||
|
||||
Options:
|
||||
--samples: The number of samples to generate for the wake word.
|
||||
Default: 20000
|
||||
|
||||
--batch-size: How many samples should be generated at a time. The more
|
||||
samples, the more memory is needed.
|
||||
Default: 100
|
||||
|
||||
--training-steps: Number of training steps. More training steps means better
|
||||
detection and false positive rates but also more time to train.
|
||||
Default: 25000
|
||||
|
||||
--cleanup-work-dir: Delete the /data/work directory after successful training.
|
||||
Default: false
|
||||
|
||||
<wake_word> The word to train spelled phonetically.
|
||||
Required.
|
||||
|
||||
<wake_word_title> An optional pretty name to save to the json metadata file.
|
||||
Default: The wake word with individual words capitalized
|
||||
and punctuation removed.
|
||||
|
||||
```
|
||||
|
||||
By default, the training process creates 20,000 samples of your wake word and
|
||||
runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results)
|
||||
in the [Extra Credit](#extra-credit) section below for
|
||||
why these are the defaults. Depending on resources available, this could take
|
||||
between 30 and 60 minutes.
|
||||
|
||||
The resulting tflite model files and logs will be placed in the
|
||||
`/data/output/<timestamp>-<wake_word>-<samples>-<training-steps>` directory
|
||||
and will therefore be available from your host in the directory you mapped
|
||||
`/data` to. File names will have non-filename-friendly characters in your
|
||||
wake word changed to underscores to make things easier. You'll need both the
|
||||
tflite and json files to load on your device. Exactly how you load them
|
||||
depends on the device and is beyond the scope of this project.
|
||||
|
||||
The only real measure of success is how well the resulting model works
|
||||
on a real device. If you encounter too many missed or false activations,
|
||||
increasing the number of samples would probably improve the results more
|
||||
than increasing the number of training steps. See
|
||||
[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
|
||||
|
||||
The output from the last step is filtered somewhat by the script but is still quite
|
||||
verbose. The full log will be available in the output directory as
|
||||
`training.log` if you're interested. Interpreting the log is beyond the scope
|
||||
of this project however.
|
||||
|
||||
You can train additional wake words or change the number of samples and
|
||||
training steps by simply running `train_wake_word` again. No need to repeat
|
||||
any of the earlier setup steps. If you change the wake word or the number of
|
||||
wake word samples, the work directory will be deleted and all 3 steps re-run.
|
||||
If you only change the number of training steps, the data from the first two
|
||||
steps is still valid and only the 3rd step is run.
|
||||
|
||||
All of the intermediate data is stored in the `/data/work` directory which will
|
||||
grow to about 17gb with 20,000 wake word samples. Once the tflite model is
|
||||
successfully generated and you're happy with the results, you can delete the
|
||||
`/data/work` directory.
|
||||
|
||||
### Training more than one wake word
|
||||
|
||||
Once you have a container running, you
|
||||
can easily train multiple wake words from your host:
|
||||
|
||||
```shell
|
||||
for wp in "hey_alexa" "hey_jenkins" ; do
|
||||
docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
|
||||
done
|
||||
```
|
||||
|
||||
### Training time examples
|
||||
|
||||
Training times depend on lots of things. These are examples only.
|
||||
Your Mileage May Vary!!!
|
||||
|
||||
```text
|
||||
===============================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 10000 samples, 100/batch Elapsed time: 0:06:17
|
||||
Augment 10000 samples Elapsed time: 0:04:05
|
||||
10000 training steps Elapsed time: 0:15:04
|
||||
==================================================
|
||||
Total Elapsed time: 0:25:26
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 10000 samples, 100/batch Elapsed time: 0:00:29
|
||||
Augment 10000 samples Elapsed time: 0:03:40
|
||||
10000 training steps Elapsed time: 0:08:00
|
||||
======================================================
|
||||
Total Elapsed time: 0:12:09
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 20000 samples, 100/batch Elapsed time: 0:10:38
|
||||
Augment 20000 samples Elapsed time: 0:07:04
|
||||
25000 training steps Elapsed time: 0:25:21
|
||||
======================================================
|
||||
Total Elapsed time: 0:43:03
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 20000 samples, 100/batch Elapsed time: 0:00:53
|
||||
Augment 20000 samples Elapsed time: 0:07:05
|
||||
25000 training steps Elapsed time: 0:19:13
|
||||
======================================================
|
||||
Total Elapsed time: 0:27:11
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 50000 samples, 100/batch Elapsed time: 0:30:47
|
||||
Augment 50000 samples Elapsed time: 0:20:22
|
||||
40000 training steps Elapsed time: 1:01:51
|
||||
==================================================
|
||||
Total Elapsed time: 1:53:00
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 50000 samples, 100/batch Elapsed time: 0:02:08
|
||||
Augment 50000 samples Elapsed time: 0:19:13
|
||||
40000 training steps Elapsed time: 0:42:23
|
||||
======================================================
|
||||
Total Elapsed time: 1:03:44
|
||||
================================================================================
|
||||
|
||||
|
||||
```
|
||||
|
||||
The sample generation process is really the only one that uses multiple CPUs so
|
||||
having fewer CPU threads available will probably make little difference.
|
||||
|
||||
## Extra Credit
|
||||
|
||||
### Training defaults
|
||||
|
||||
If you plan on training multiple wake words, you can set your own default
|
||||
training parameters by creating a `/data/.defaults.env` file with the
|
||||
following contents:
|
||||
|
||||
```shell
|
||||
# Variable names follow the command line parameters converted to upper case
|
||||
# and with the dashes ('-') converted to underscores ('_').
|
||||
export SAMPLES=10000
|
||||
export TRAINING_STEPS=10000
|
||||
|
||||
# Don't use the GPU for any operations. Stick with the CPU only.
|
||||
##export CUDA_VISIBLE_DEVICES=-1
|
||||
|
||||
```
|
||||
|
||||
### Examine your model with Tensorboard
|
||||
|
||||
Tensorboard is a web-based graphical model viewer. You can use it to get an
|
||||
idea of how many training steps are needed before accuracy results stop
|
||||
improving. To use it, you'll have to expose port 6006 by adding `-p
|
||||
6006:6006` to your `docker run` command line. If you didn't, don't worry.
|
||||
Remember, the /data directory is mapped to a directory on your host so you
|
||||
can simply stop and delete the current container and recreate it with the new
|
||||
`docker run` command. No need to re-run any of the setup or training steps.
|
||||
|
||||
To start Tensorboard, run:
|
||||
|
||||
```shell
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# source .venv/bin/activate
|
||||
(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
|
||||
```
|
||||
|
||||
Now on your host, point your browser at `http://localhost:6006/`,
|
||||
click "SCALARS" at the top and take a look at the various charts. You'll see
|
||||
a "train" and "validation" item for each training run you've performed. It's
|
||||
the "train" items you're interested in.
|
||||
|
||||
<a id="tensorboard-results"></a>
|
||||
|
||||
You have to be a Tensorflow expert to decipher most of the charts but
|
||||
the "Accuracy" chart for this particular wake word and 50,000 samples would
|
||||
seem to indicate that there's very little improvement after about 20,000
|
||||
training steps.
|
||||
|
||||

|
||||
|
||||
In contrast, with only 5,000 wake word samples, there's still improvement to be had after
|
||||
20,000 training steps.
|
||||
|
||||

|
||||
|
||||
Given that it's faster to generate wake word samples than it is to train,
|
||||
20,000 samples and 25,000 training steps seems like a good compromise. This
|
||||
chart has a bit less smoothing to show a bit more detail and includes the
|
||||
50,000 sample run as well. This run took only 27 minutes as opposed to the
|
||||
63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
|
||||
25,000 are the defaults for these scripts.
|
||||
|
||||

|
||||
|
||||
3. Run the training script as normal:
|
||||
|
||||
If personal samples are found, you’ll see a message during training indicating they are being included.
|
||||
|
||||
### Recording tips
|
||||
- 10–30 recordings is usually enough to see a noticeable improvement
|
||||
- Vary distance, volume, and tone slightly
|
||||
- Record in the same environment where the wake word will be used (room noise matters)
|
||||
- Use 16-bit WAV files if possible (most recorders do this by default)
|
||||
|
||||
---
|
||||
|
||||
## 🙌 Credits
|
||||
|
||||
This project builds upon the excellent work of [kahrendt/microWakeWord](https://github.com/kahrendt/microWakeWord).
|
||||
Huge thanks to the original authors for their contributions to the open-source community!
|
||||
|
||||
BIN
cli/.DS_Store
vendored
Normal file
BIN
cli/.DS_Store
vendored
Normal file
Binary file not shown.
@@ -1,27 +0,0 @@
|
||||
# Since this is a pure python environment, we don't need to start
|
||||
# with a huge CUDA image. A standard Ubuntu image will do.
|
||||
FROM ubuntu:24.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
PIP_ROOT_USER_ACTION=ignore \
|
||||
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
|
||||
PATH="/root/mww-scripts:${PATH}"
|
||||
|
||||
# System deps
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
|
||||
git wget curl unzip ca-certificates nano less \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& mkdir -p /data
|
||||
|
||||
COPY --chown=root:root --chmod=0755 .bashrc /root/
|
||||
COPY --chown=root:root --chmod=0755 setup_* wake_word_sample* train_wake_word \
|
||||
test_python cudainfo system_summary shell.functions requirements.txt /root/mww-scripts/
|
||||
|
||||
# Docker and Podman send the CMD a SIGTERM when you "stop" the container. Unfortunately, bash
|
||||
# normally doesn't exit when it receives a SIGTERM so docker/podman has to wait for the "stop"
|
||||
# to timeout then SIGKILL the container.
|
||||
# This little scriptlet causes bash to exit immediately when it receives the SIGTERM.
|
||||
CMD ["/usr/bin/bash", "-c", "exec /usr/bin/bash --rcfile <(echo '[ -f ~/.bashrc ] && source ~/.bashrc ; trap exit SIGTERM ;')" ]
|
||||
507
cli/README.md
507
cli/README.md
@@ -1,507 +0,0 @@
|
||||
# Run training from the command line
|
||||
|
||||
## Overview
|
||||
|
||||
With these scripts and Dockerfile, you can train new wake words from the
|
||||
command line without using a Jupyter notebook.
|
||||
|
||||
Differences between this Docker image and the Jupyter notebook image:
|
||||
|
||||
* The Python training environment isn't included in the image. Instead, a
|
||||
"virtual environment" (venv) is created in the `/data` directory which you
|
||||
will have mounted to a host directory. This cuts about 7gb from the image
|
||||
and allows the virtualenv to persist across container instances.
|
||||
|
||||
* The logic from the Jupyter notebook is contained in individual Python
|
||||
and shell scripts
|
||||
|
||||
* No ports need to be exposed since the Jupyter notebook server isn't being
|
||||
run.
|
||||
|
||||
## TL;DR
|
||||
|
||||
For the impatient among you...
|
||||
|
||||
```shell
|
||||
$ mkdir /some/work/directory # On a device with more than 150GB free space
|
||||
$ docker build -t microwakeword-cli:latest .
|
||||
$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# setup_python_venv
|
||||
##### You have about 4 minutes to drink coffee
|
||||
|
||||
root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
|
||||
##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
|
||||
|
||||
root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
|
||||
##### You have about 30-45 minutes for a nap depending on available system resources.
|
||||
##### You'll be informed of where to find your trained model.
|
||||
```
|
||||
|
||||
Load the trained model on your device and give it a try but don't be surprised
|
||||
if you get a lot of missed or false activations. Read on to find out why.
|
||||
|
||||
## Get Started
|
||||
|
||||
Good, you stuck around! Now read the rest of the document before doing
|
||||
anything.
|
||||
|
||||
### Using a GPU
|
||||
|
||||
Having an Nvidia GPU available can cut the training time by up to half. The
|
||||
open-source nouveau driver shipped with Linux kernels doesn't support CUDA
|
||||
however so if you have an Nvidia GPU and want to use it for training, you'll
|
||||
need to install the official Nvidia driver from
|
||||
https://www.nvidia.com/en-in/drivers/unix/
|
||||
|
||||
### Build the image
|
||||
|
||||
You can use either Docker or Podman as your container management tool.
|
||||
`docker` is used in the examples but if you have podman, just substitute
|
||||
the command.
|
||||
|
||||
Start by navigating to the directory that contains this README file and
|
||||
the accompanying Dockerfile. Then...
|
||||
|
||||
|
||||
```shell
|
||||
docker build -t microwakeword-cli:latest .
|
||||
```
|
||||
|
||||
This should be fairly quick and result in an image that's about 320mb in size
|
||||
as it's basically a standard Ubuntu 24.04 image with a few added tools.
|
||||
|
||||
So why isn't a pre-built image available for download? Because it'll probably
|
||||
take longer to download a pre-built image than for you to create it locally.
|
||||
GitHub's container registry is notoriously erratic when it comes to download
|
||||
throughput.
|
||||
|
||||
### Create a host work directory
|
||||
|
||||
This directory will contain the Python virtual environment plus all of the
|
||||
downloaded and generated data needed for training and the final trained
|
||||
models. A full environment will need about 150gb of free space but read
|
||||
further to see how to reduce this.
|
||||
|
||||
Your `<host_data_dir>` will be mounted inside the container as `/data`.
|
||||
|
||||
The training container will start a Bash shell so if you have Bash
|
||||
aliases or Bashy things you like, create a `.bashrc` file in your
|
||||
`<host_data_dir>` and put them in there. It'll automatically be included
|
||||
any time you enter the container.
|
||||
|
||||
### Create and start the container
|
||||
|
||||
There are lots of options that control container creation. The simplest example
|
||||
will create the container and give you an interactive shell. When you exit the
|
||||
shell, the container will be stopped and removed leaving your `<host_data_dir>`
|
||||
intact.
|
||||
|
||||
```shell
|
||||
$ docker run -it --rm --gpus=all -v <host_data_dir>:/data microwakeword-cli:latest
|
||||
```
|
||||
|
||||
Options:
|
||||
|
||||
* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
|
||||
* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
|
||||
around and give it a name for training more than one wake word. You
|
||||
can stop and remove it when you're ready.
|
||||
* Add a `-d` option to start the container in the background and use `docker
|
||||
attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
|
||||
|
||||
When the container starts, you'll see:
|
||||
|
||||
```text
|
||||
=======================================================
|
||||
WARNING: A python virtual environment wasn't found
|
||||
at /data/.venv. You'll need to run setup_python_venv
|
||||
before you'll be able to use this container for
|
||||
training.
|
||||
=======================================================
|
||||
root@mww-cli:/#
|
||||
```
|
||||
|
||||
Don't worry about the python WARNING right now. You'll be creating the
|
||||
virtualenv in the next step.
|
||||
|
||||
If you've forgotten to create and/or mount your host data directory, you'll
|
||||
see an additional warning:
|
||||
|
||||
```text
|
||||
=======================================================
|
||||
WARNING: The /data directory is NOT mounted.
|
||||
Running the training process without /data mounted
|
||||
could add over 140Gb of python packages and training
|
||||
files to this container's storage which is probably
|
||||
NOT what you want.
|
||||
|
||||
You should remove this container and re-create it with
|
||||
a 'docker run' option like '-v <host_work_dir>:/data'
|
||||
making sure the host directory is on a device that has
|
||||
enough free space.
|
||||
=======================================================
|
||||
```
|
||||
|
||||
You can certainly continue but it's a "really bad idea"™ because your
|
||||
container storage could grow from a few hundred mb to over 140gb.
|
||||
|
||||
At this point, you're in a Bash shell.
|
||||
|
||||
### Create the Python virtual environment
|
||||
|
||||
The Python virtual environment will contain all the software needed to train.
|
||||
It gets created as `/data/.venv` and will take up about 11gb of disk space.
|
||||
|
||||
The scripts that do all the work will be in the container's PATH so to set up
|
||||
the virtual environment and install all of the packages, just run:
|
||||
|
||||
```text
|
||||
setup_python_venv [ --verbose ]
|
||||
|
||||
Options:
|
||||
|
||||
--verbose: Print the detailed "pip install" output.
|
||||
|
||||
```
|
||||
|
||||
When the installation is finished, a test of the major components will be
|
||||
run.
|
||||
|
||||
Once the process is done, you should change to the `/data` directory and
|
||||
activate the virtual environment with:
|
||||
|
||||
```shell
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# source .venv/bin/activate
|
||||
(.venv) root@mww-cli:/data#
|
||||
```
|
||||
|
||||
Technically, you don't need to do either of these since the scripts
|
||||
are in the PATH and they know to use the `/data` directory for everything.
|
||||
It's more of an "if you're interested" thing.
|
||||
|
||||
At this point, you have a container with all software installed.
|
||||
|
||||
## Get the reference data
|
||||
|
||||
The training process itself relies on a significant amount of audio reference
|
||||
data that creates a simulated "audio environment" that your wake word will be
|
||||
trained in. These "training datasets" include things like varying amounts of
|
||||
reverberation, background music, background conversations, background noise,
|
||||
etc. All said and done, it amounts to about 30gb of audio but with the
|
||||
downloaded archives and extracted intermediate files, you'll need about 85gb
|
||||
of free space. Thankfully, you only need to download the files once no
|
||||
matter how many wake words you want to train and since it's stored in
|
||||
`/data`, you can even remove the docker container and recreate it without
|
||||
losing any of it. There are 4 datasets that are required.
|
||||
|
||||
This is a three-stage process...
|
||||
|
||||
1. Download zipfiles or tarballs. (about 30gb)
|
||||
2. Extract them. (about 50gb)
|
||||
3. Convert them into the final form. (about 31gb)
|
||||
|
||||
NOTE: The sizes add up to more than the 85gb stated earlier because one
|
||||
of the datasets doesn't need to be converted and is counted in both
|
||||
steps 2 and 3. You really do only need 85gb.
|
||||
|
||||
To download the archives, unpack them, and convert the audio to what's needed
|
||||
by the training process, run:
|
||||
|
||||
```text
|
||||
setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
|
||||
|
||||
Options:
|
||||
--cleanup-archives: Automatically delete the tarballs or zipfiles after
|
||||
they've been extracted.
|
||||
|
||||
--cleanup-intermediate-files: Automatically delete the intermediate files
|
||||
after they've been converted.
|
||||
|
||||
```
|
||||
|
||||
On a 1gb/sec Internet connection, this will take about 25 minutes.
|
||||
|
||||
The script detects if the datasets have already been downloaded, extracted
|
||||
and/or converted and skips those steps as appropriate so if you've run the
|
||||
script without the cleanup options, you can just run it again with those
|
||||
options to clean them up.
|
||||
|
||||
Now you're ready to train a wake word. Almost.
|
||||
|
||||
## Train a Wake Word
|
||||
|
||||
Training is done in 3 stages.
|
||||
|
||||
1. Generate thousands of samples of the wake word with various voices,
|
||||
pitches, speeds, inflections, etc.
|
||||
2. Augment the samples with the training datasets to add background noise, etc.
|
||||
3. Run the Tensorflow training.
|
||||
|
||||
### Generate a sample for verification
|
||||
|
||||
Before you start the full process, you're going to want to generate a single
|
||||
wake word sample and play it back to ensure it sounds right. The wake word
|
||||
should be spelled phonetically to give the sample generator the best chance
|
||||
of success.
|
||||
|
||||
```text
|
||||
root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
|
||||
===== Generating 1 sample of 'hey buster' =====
|
||||
Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
|
||||
Successfully loaded the model
|
||||
Batch 1/0 complete
|
||||
Done
|
||||
Sample available at /data/work/test_sample/hey_buster.wav
|
||||
Play it from your host.
|
||||
```
|
||||
|
||||
You should then play that file from your host. The reason I used "hey buster"
|
||||
as the wake word is to demonstrate why it's important to generate and listen
|
||||
to a sample. If you try that exact input and play it back, you'll notice
|
||||
that the generator didn't capture the "er" at the end very well. To get it to
|
||||
do so, I had to add a period on the end as a "spacer".
|
||||
"hey buster." worked much better.
|
||||
|
||||
When you're happy with the sample, you can run the full process.
|
||||
|
||||
### Run the full training process
|
||||
|
||||
```text
|
||||
train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
|
||||
[ --training-steps=<steps> ] [ --cleanup-work-dir ]
|
||||
<wake_word> [ <wake_word_title> ]
|
||||
|
||||
Options:
|
||||
--samples: The number of samples to generate for the wake word.
|
||||
Default: 20000
|
||||
|
||||
--batch-size: How many samples should be generated at a time. The more
|
||||
samples, the more memory is needed.
|
||||
Default: 100
|
||||
|
||||
--training-steps: Number of training steps. More training steps means better
|
||||
detection and false positive rates but also more time to train.
|
||||
Default: 25000
|
||||
|
||||
--cleanup-work-dir: Delete the /data/work directory after successful training.
|
||||
Default: false
|
||||
|
||||
<wake_word> The word to train spelled phonetically.
|
||||
Required.
|
||||
|
||||
<wake_word_title> An optional pretty name to save to the json metadata file.
|
||||
Default: The wake word with individual words capitalized
|
||||
and punctuation removed.
|
||||
|
||||
```
|
||||
|
||||
By default, the training process creates 20,000 samples of your wake word and
|
||||
runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results)
|
||||
in the [Extra Credit](#extra-credit) section below for
|
||||
why these are the defaults. Depending on resources available, this could take
|
||||
between 30 and 60 minutes.
|
||||
|
||||
The resulting tflite model files and logs will be placed in the
|
||||
`/data/output/<timestamp>-<wake_word>-<samples>-<training-steps>` directory
|
||||
and will therefore be available from your host in the directory you mapped
|
||||
`/data` to. File names will have non-filename-friendly characters in your
|
||||
wake word changed to underscores to make things easier. You'll need both the
|
||||
tflite and json files to load on your device. Exactly how you load them
|
||||
depends on the device and is beyond the scope of this project.
|
||||
|
||||
The only real measure of success is how well the resulting model works
|
||||
on a real device. If you encounter too many missed or false activations,
|
||||
increasing the number of samples would probably improve the results more
|
||||
than increasing the number of training steps. See
|
||||
[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
|
||||
|
||||
The output from the last step is filtered some by the script but still quite
|
||||
verbose. The full log will be available in the output directory as
|
||||
`training.log` if you're interested. Interpreting the log is beyond the scope
|
||||
of this project however.
|
||||
|
||||
You can train additional wake words or change the number of samples and
|
||||
training steps by simply running `train_wake_word` again. No need to repeat
|
||||
any of the earlier setup steps. If you change the wake word or the number of
|
||||
wake word samples, the work directory will be deleted and all 3 steps re-run.
|
||||
If you only change the number of training steps, the data from the first two
|
||||
steps is still valid and only the 3rd step is run.
|
||||
|
||||
All of the intermediate data is stored in the `/data/work` directory which will
|
||||
grow to about 17gb with 20,000 wake word samples. Once the tflite model is
|
||||
successfully generated and you're happy with the results, you can delete the
|
||||
`/data/work` directory.
|
||||
|
||||
### Training more than one wake word
|
||||
|
||||
Once you have a container running, you
|
||||
can easily train multiple wake words from your host:
|
||||
|
||||
```shell
|
||||
for wp in "hey_alexa" "hey_jenkins" ; do
|
||||
docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
|
||||
done
|
||||
```
|
||||
|
||||
### Training time examples
|
||||
|
||||
Training times depend on lots of things. These are examples only.
|
||||
Your Mileage May Vary!!!
|
||||
|
||||
```text
|
||||
===============================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 10000 samples, 100/batch Elapsed time: 0:06:17
|
||||
Augment 10000 samples Elapsed time: 0:04:05
|
||||
10000 training steps Elapsed time: 0:15:04
|
||||
==================================================
|
||||
Total Elapsed time: 0:25:26
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 10000 samples, 100/batch Elapsed time: 0:00:29
|
||||
Augment 10000 samples Elapsed time: 0:03:40
|
||||
10000 training steps Elapsed time: 0:08:00
|
||||
======================================================
|
||||
Total Elapsed time: 0:12:09
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 20000 samples, 100/batch Elapsed time: 0:10:38
|
||||
Augment 20000 samples Elapsed time: 0:07:04
|
||||
25000 training steps Elapsed time: 0:25:21
|
||||
======================================================
|
||||
Total Elapsed time: 0:43:03
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 20000 samples, 100/batch Elapsed time: 0:00:53
|
||||
Augment 20000 samples Elapsed time: 0:07:05
|
||||
25000 training steps Elapsed time: 0:19:13
|
||||
======================================================
|
||||
Total Elapsed time: 0:27:11
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: N/A
|
||||
|
||||
Generate 50000 samples, 100/batch Elapsed time: 0:30:47
|
||||
Augment 50000 samples Elapsed time: 0:20:22
|
||||
40000 training steps Elapsed time: 1:01:51
|
||||
==================================================
|
||||
Total Elapsed time: 1:53:00
|
||||
================================================================================
|
||||
|
||||
================================================================================
|
||||
Training Summary
|
||||
|
||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||
|
||||
Generate 50000 samples, 100/batch Elapsed time: 0:02:08
|
||||
Augment 50000 samples Elapsed time: 0:19:13
|
||||
40000 training steps Elapsed time: 0:42:23
|
||||
======================================================
|
||||
Total Elapsed time: 1:03:44
|
||||
================================================================================
|
||||
|
||||
|
||||
```
|
||||
|
||||
The sample generation process is really the only one that uses multiple CPUs so
|
||||
having fewer CPU threads available will probably make little difference.
|
||||
|
||||
## Extra Credit
|
||||
|
||||
### Training defaults
|
||||
|
||||
If you plan on training multiple wake words, you can set your own default
|
||||
training parameters by creating a `/data/.defaults.env` file with the
|
||||
following contents:
|
||||
|
||||
```shell
|
||||
# Variable names follow the command line parameters converted to upper case
|
||||
# and with the dashes ('-') converted to underscores ('_').
|
||||
export SAMPLES=10000
|
||||
export TRAINING_STEPS=10000
|
||||
|
||||
# Don't use the GPU for any operations. Stick with the CPU only.
|
||||
##export CUDA_VISIBLE_DEVICES=-1
|
||||
|
||||
```
|
||||
|
||||
### Examine your model with Tensorboard
|
||||
|
||||
Tensorboard is a web-based graphical model viewer. You can use it to get an
|
||||
idea of how many training steps are needed before accuracy results stop
|
||||
improving. To use it, you'll have to expose port 6006 by adding `-p
|
||||
6006:6006` to your `docker run` command line. If you didn't, don't worry.
|
||||
Remember, the /data directory is mapped to a directory on your host so you
|
||||
can simply stop and delete the current container and recreate it with the new
|
||||
`docker run` command. No need to re-run any of the setup or training steps.
|
||||
|
||||
To start Tensorboard, run:
|
||||
|
||||
```shell
|
||||
root@mww-cli:/# cd /data
|
||||
root@mww-cli:/data# source .venv/bin/activate
|
||||
(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
|
||||
```
|
||||
|
||||
Now on your host, point your browser at `http://localhost:6006/`,
|
||||
click "SCALARS" at the top and take a look at the various charts. You'll see
|
||||
a "train" and "validation" item for each training run you've performed. It's
|
||||
the "train" items you're interested in.
|
||||
|
||||
<a id="tensorboard-results"></a>
|
||||
|
||||
You have to be a Tensorflow expert to decipher most of the charts but
|
||||
the "Accuracy" chart for this particular wake word and 50,000 samples would
|
||||
seem to indicate that there's very little improvement after about 20,000
|
||||
training steps.
|
||||
|
||||

|
||||
|
||||
In contrast, with only 5,000 wake word samples, there's still improvement to be had after
|
||||
20,000 training steps.
|
||||
|
||||

|
||||
|
||||
Given that it's faster to generate wake word samples than it is to train,
|
||||
20,000 samples and 25,000 training steps seems like a good compromise. This
|
||||
chart has a bit less smoothing to show a bit more detail and includes the
|
||||
50,000 sample run as well. This run took only 27 minutes as opposed to the
|
||||
63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
|
||||
25,000 are the defaults for these scripts.
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
# --- Packages needed by our scripts ---
|
||||
|
||||
numpy==1.26.4
|
||||
scipy==1.12.0
|
||||
librosa==0.10.2.post1
|
||||
soundfile==0.12.1
|
||||
tqdm==4.67.1
|
||||
scikit-learn==1.6.0
|
||||
numba==0.63.1
|
||||
PyYAML==6.0.3
|
||||
@@ -1,5 +1,6 @@
|
||||
#!/bin/bash
|
||||
PROGDIR="$(dirname $(realpath $0))"
|
||||
PROGDIR="$(dirname "$(realpath "$0")")"
|
||||
ROOTDIR="$(dirname "${PROGDIR}")"
|
||||
|
||||
KNOWN_ARGS=( data-dir python gpu no-gpu )
|
||||
source "${PROGDIR}/shell.functions"
|
||||
@@ -27,7 +28,7 @@ EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath ${DATA_DIR})"
|
||||
[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath "${DATA_DIR}")"
|
||||
[ -d "${DATA_DIR}" ] || {
|
||||
echo "Data directory '${DATA_DIR}' doesn't exist." >&2
|
||||
exit 1
|
||||
@@ -52,7 +53,8 @@ if [ -n "${PYTHON}" ] ; then
|
||||
PYTHONS=( "${PYTHON}" )
|
||||
unset PYTHON
|
||||
else
|
||||
PYTHONS=( python3.12 python3.10 )
|
||||
# Add 3.11 as a common middle-ground (especially outside Ubuntu 24.04)
|
||||
PYTHONS=( python3.12 python3.11 python3.10 )
|
||||
fi
|
||||
|
||||
for p in "${PYTHONS[@]}" ; do
|
||||
@@ -60,14 +62,14 @@ for p in "${PYTHONS[@]}" ; do
|
||||
done
|
||||
|
||||
[ -n "${PYTHON}" ] || {
|
||||
echo "A python 3.12 or 3.10 interpreter wasn't found. You 'll need to install one before proceeding." >&2
|
||||
echo "A python 3.12/3.11/3.10 interpreter wasn't found. You'll need to install one before proceeding." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
if [ -d "${VENV}" ] ; then
|
||||
if [ -d "${VENV}" ] ; then
|
||||
if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then
|
||||
source "${VENV}/bin/activate" || {
|
||||
echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
|
||||
echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
|
||||
exit 1
|
||||
}
|
||||
else
|
||||
@@ -82,24 +84,28 @@ if [ -z "$VIRTUAL_ENV" ] ; then
|
||||
else
|
||||
echo " ===== Updating virtualenv at '${VENV}' ====="
|
||||
fi
|
||||
|
||||
${PYTHON} -m venv --upgrade-deps "${VENV}"
|
||||
source "${VENV}/bin/activate"
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
declare -a progfiles=( $(find ${PROGDIR} -mindepth 1 -maxdepth 1 -executable -type f) )
|
||||
# Symlink CLI scripts into .venv/bin
|
||||
declare -a progfiles=( $(find "${PROGDIR}" -mindepth 1 -maxdepth 1 -executable -type f) )
|
||||
progfiles+=( "${PROGDIR}/shell.functions" )
|
||||
|
||||
# Also symlink the top-level entrypoint if present
|
||||
[ -x "${ROOTDIR}/train_wake_word" ] && progfiles+=( "${ROOTDIR}/train_wake_word" )
|
||||
|
||||
for f in "${progfiles[@]}" ; do
|
||||
ln -sfr "${f}" ".venv/bin/$(basename ${f})"
|
||||
ln -sfr "${f}" ".venv/bin/$(basename "${f}")"
|
||||
done
|
||||
|
||||
#
|
||||
# Pip doesn't process packages from requirements.txt in
|
||||
# order but order is important because tensorflow, torch,
|
||||
# onnxruntime and micro-wake-word all depend on CUDA packages
|
||||
# at various versions. They need to be installed in this specific
|
||||
# order or they may not be able to use the GPU.
|
||||
# Pip doesn't process packages from requirements.txt in order but order is
|
||||
# important because tensorflow, torch, onnxruntime and micro-wake-word all
|
||||
# depend on CUDA packages at various versions. They need to be installed in
|
||||
# this specific order or they may not be able to use the GPU.
|
||||
#
|
||||
export PIP_PROGRESS_BAR=off
|
||||
export PIP_NO_COLOR=1
|
||||
@@ -117,7 +123,8 @@ pip_install() {
|
||||
START_TS=$EPOCHSECONDS
|
||||
|
||||
echo " ===== Installing common requirements ====="
|
||||
pip_install -r "${PROGDIR}/requirements.txt"
|
||||
# requirements.txt lives in repo root now
|
||||
pip_install -r "${ROOTDIR}/requirements.txt"
|
||||
|
||||
${GPU} && tfgpu='[and-cuda]' || tfgpu=""
|
||||
echo " ===== Installing Tensorflow${tfgpu} ====="
|
||||
@@ -140,7 +147,7 @@ pip_install -e "${MWW}"
|
||||
|
||||
echo " ===== Checking piper-sample-generator ====="
|
||||
PSG="${DATA_DIR}/tools/piper-sample-generator"
|
||||
if [ ! -d "${PSG}" ] || [ -n "$(git -C ${PSG} status --porcelain)" ] ; then
|
||||
if [ ! -d "${PSG}" ] || [ -n "$(git -C "${PSG}" status --porcelain)" ] ; then
|
||||
rm -rf "${PSG}" || :
|
||||
echo " Cloning piper-sample-generator to ${DATA_DIR}/tools"
|
||||
git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null
|
||||
@@ -171,7 +178,7 @@ echo " ===== Installing keras ====="
|
||||
# keras 3.13 has "issues" so we need to back down to 3.12.
|
||||
pip_install "keras==3.12.0"
|
||||
|
||||
${PROGDIR}/test_python --data-dir="${DATA_DIR}"
|
||||
"${PROGDIR}/test_python" --data-dir="${DATA_DIR}"
|
||||
|
||||
touch .mww-data-dir
|
||||
END_TS=$EPOCHSECONDS
|
||||
@@ -179,5 +186,3 @@ END_TS=$EPOCHSECONDS
|
||||
echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
|
||||
|
||||
print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
|
||||
|
||||
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
PROGPATH="$(realpath "$0")"
|
||||
PROGDIR="$(dirname "${PROGPATH}")"
|
||||
ROOTDIR="$(dirname "${PROGDIR}")" # repo root (train_wake_word, requirements.txt, etc.)
|
||||
|
||||
KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files )
|
||||
source "${PROGDIR}/shell.functions"
|
||||
@@ -27,22 +28,38 @@ EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Normalize + validate DATA_DIR (shell.functions typically sets a default,
|
||||
# but this makes the script standalone-safe)
|
||||
[ -n "${DATA_DIR:-}" ] && DATA_DIR="$(realpath "${DATA_DIR}")"
|
||||
[ -d "${DATA_DIR}" ] || {
|
||||
echo "Data directory '${DATA_DIR}' doesn't exist." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
cd "${DATA_DIR}"
|
||||
|
||||
START_TS=$EPOCHSECONDS
|
||||
echo -e "\n===== Setting up Training Datasets =====\n"
|
||||
|
||||
${PROGDIR}/setup_negative_datasets --cleanup-archives=${CLEANUP_ARCHIVES} \
|
||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
||||
"${PROGDIR}/setup_negative_datasets" \
|
||||
--cleanup-archives="${CLEANUP_ARCHIVES}" \
|
||||
--cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
|
||||
--data-dir="${DATA_DIR}"
|
||||
|
||||
${PROGDIR}/setup_mit_audio --cleanup-archives=${CLEANUP_ARCHIVES} \
|
||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
||||
"${PROGDIR}/setup_mit_audio" \
|
||||
--cleanup-archives="${CLEANUP_ARCHIVES}" \
|
||||
--cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
|
||||
--data-dir="${DATA_DIR}"
|
||||
|
||||
${PROGDIR}/setup_audioset --cleanup-archives=${CLEANUP_ARCHIVES} \
|
||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
||||
"${PROGDIR}/setup_audioset" \
|
||||
--cleanup-archives="${CLEANUP_ARCHIVES}" \
|
||||
--cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
|
||||
--data-dir="${DATA_DIR}"
|
||||
|
||||
${PROGDIR}/setup_fma --cleanup-archives=${CLEANUP_ARCHIVES} \
|
||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
||||
"${PROGDIR}/setup_fma" \
|
||||
--cleanup-archives="${CLEANUP_ARCHIVES}" \
|
||||
--cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
|
||||
--data-dir="${DATA_DIR}"
|
||||
|
||||
END_TS=$(date +%s.%N)
|
||||
END_TS=$EPOCHSECONDS
|
||||
print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup"
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 20 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 32 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 43 KiB |
0
cli/wake_word_sample_augmenter
Executable file → Normal file
0
cli/wake_word_sample_augmenter
Executable file → Normal file
0
cli/wake_word_sample_trainer
Executable file → Normal file
0
cli/wake_word_sample_trainer
Executable file → Normal file
76
dockerfile
76
dockerfile
@@ -1,59 +1,37 @@
|
||||
# Standard Ubuntu base image. CUDA base images not needed.
|
||||
FROM ubuntu:22.04
|
||||
# Base
|
||||
FROM ubuntu:24.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
PIP_ROOT_USER_ACTION=ignore \
|
||||
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
|
||||
XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/local/cuda" \
|
||||
PATH="/usr/local/cuda/bin:${PATH}" \
|
||||
LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# System deps (+dev headers for building C/C++ extensions)
|
||||
# System deps
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3.10 python3.10-venv python3.10-distutils python3.10-dev python3-pip \
|
||||
git wget curl unzip ca-certificates git-lfs \
|
||||
build-essential g++ cmake \
|
||||
libsndfile1 libsndfile1-dev libffi-dev \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
|
||||
git wget curl unzip ca-certificates nano less \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& mkdir -p /data
|
||||
|
||||
# Use python3.10 everywhere
|
||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \
|
||||
&& update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
|
||||
# Recorder port
|
||||
EXPOSE 8789
|
||||
|
||||
# ---- No cuDNN repo meddling needed if using TF 2.17.x ----
|
||||
# Script root
|
||||
WORKDIR /root/mww-scripts
|
||||
|
||||
# Python deps
|
||||
# Order is important. onnxruntime, tensorflow and torch have
|
||||
# to be installed in the order below or their cuda dependencies
|
||||
# will conflict.
|
||||
COPY requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --upgrade pip \
|
||||
&& pip install "numpy==1.26.4" "cython>=0.29.36" \
|
||||
&& pip install -r /tmp/requirements.txt \
|
||||
&& pip install "onnxruntime-gpu[cuda]>=1.16.0" \
|
||||
&& pip install "tensorflow[and-cuda]==2.18.0" \
|
||||
"tensorboard==2.18.0" \
|
||||
"tensorboard-data-server==0.7.2" \
|
||||
"tensorflow-io-gcs-filesystem==0.37.1" \
|
||||
&& pip install \
|
||||
torch==2.7.1 \
|
||||
torchaudio==2.7.1 \
|
||||
--index-url https://download.pytorch.org/whl/cu128
|
||||
# Bash environment
|
||||
COPY --chown=root:root --chmod=0755 .bashrc /root/
|
||||
|
||||
# Workspace + notebook fallback
|
||||
RUN mkdir -p /data
|
||||
WORKDIR /data
|
||||
COPY microWakeWord_training_notebook.ipynb /root/
|
||||
# Root-level entrypoints
|
||||
COPY --chown=root:root --chmod=0755 \
|
||||
train_wake_word \
|
||||
run_recorder.sh \
|
||||
recorder_server.py \
|
||||
requirements.txt \
|
||||
/root/mww-scripts/
|
||||
|
||||
# Startup script (copies default notebook if missing)
|
||||
COPY startup.sh /usr/local/bin/startup.sh
|
||||
RUN chmod +x /usr/local/bin/startup.sh
|
||||
# CLI folder (THIS IS THE IMPORTANT CHANGE)
|
||||
COPY --chown=root:root cli/ /root/mww-scripts/cli/
|
||||
|
||||
EXPOSE 8888
|
||||
# Static UI for recorder
|
||||
COPY --chown=root:root --chmod=0644 static/index.html /root/mww-scripts/static/index.html
|
||||
|
||||
CMD ["/bin/bash", "-lc", "/usr/local/bin/startup.sh && \
|
||||
exec jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root \
|
||||
--ServerApp.token='' --ServerApp.password='' --ServerApp.root_dir=/data"]
|
||||
# recorder server
|
||||
CMD ["/bin/bash", "-lc", "/root/mww-scripts/run_recorder.sh"]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
593
recorder_server.py
Normal file
593
recorder_server.py
Normal file
@@ -0,0 +1,593 @@
|
||||
# recorder_server.py
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
|
||||
from fastapi import FastAPI, UploadFile, File, Form, Query
|
||||
from fastapi.responses import HTMLResponse, JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
# Directory holding this script; default anchor for the UI and CLI folders.
ROOT_DIR = Path(__file__).resolve().parent


def _env_path(name: str, default) -> Path:
    """Return environment variable *name* (or *default*) as a resolved Path."""
    return Path(os.environ.get(name, str(default))).resolve()


def _env_flag(name: str) -> bool:
    """Interpret environment variable *name* as a boolean (default: false)."""
    return os.environ.get(name, "false").lower() in ("1", "true", "yes", "y")


def _env_int(name: str, default: str) -> int:
    """Return environment variable *name* (or *default*) parsed as an int."""
    return int(os.environ.get(name, default))


# In Docker CLI world, DATA_DIR should be /data
DATA_DIR = _env_path("DATA_DIR", "/data")

# UI files live next to this script by default
STATIC_DIR = _env_path("STATIC_DIR", ROOT_DIR / "static")

# Personal samples MUST land in /data/personal_samples for the CLI pipeline
PERSONAL_DIR = _env_path("PERSONAL_DIR", DATA_DIR / "personal_samples")

# CLI folder inside repo
CLI_DIR = _env_path("CLI_DIR", ROOT_DIR / "cli")

# Cleanup defaults for the automatic dataset setup, controlled through:
# REC_DATASET_CLEANUP_ARCHIVES=true/false
# REC_DATASET_CLEANUP_INTERMEDIATE_FILES=true/false
DATASET_CLEANUP_ARCHIVES = _env_flag("REC_DATASET_CLEANUP_ARCHIVES")
DATASET_CLEANUP_INTERMEDIATE = _env_flag("REC_DATASET_CLEANUP_INTERMEDIATE_FILES")

# "Start training" triggers the CLI entrypoint from inside the training venv
# (train_wake_word should be in /data/.venv/bin via setup_python_venv).
# The wake word (and optional title) are appended to this command string.
TRAIN_CMD = os.environ.get(
    "TRAIN_CMD",
    f"source '{DATA_DIR}/.venv/bin/activate' && train_wake_word --data-dir '{DATA_DIR}'"
)

# Session defaults used when the client does not supply its own values.
TAKES_PER_SPEAKER_DEFAULT = _env_int("REC_TAKES_PER_SPEAKER", "10")
SPEAKERS_TOTAL_DEFAULT = _env_int("REC_SPEAKERS_TOTAL", "1")

# How many lines to show in WebUI (tail)
TRAIN_LOG_TAIL_LINES = _env_int("REC_TRAIN_LOG_TAIL_LINES", "400")
# Byte cap applied before line-based tailing (fast); keep this non-zero.
TRAIN_LOG_MAX_BYTES = _env_int("REC_TRAIN_LOG_MAX_BYTES", str(512 * 1024))  # 512KB

app = FastAPI(title="microWakeWord Personal Recorder")

# Serve static UI; the directory is created up front so the mount never fails.
STATIC_DIR.mkdir(parents=True, exist_ok=True)
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
||||
|
||||
|
||||
def safe_name(raw: str) -> str:
    """Turn a free-form phrase into a lowercase identifier of [a-z0-9_].

    Whitespace runs become single underscores, every other character outside
    a-z, 0-9 and "_" is dropped, and leading/trailing underscores are trimmed.
    Falls back to "wakeword" when nothing usable remains (including None/"").
    """
    cleaned = (raw or "").strip().lower()
    # Order matters: collapse whitespace first so words stay separated.
    for pattern, replacement in (
        (r"\s+", "_"),
        (r"[^a-z0-9_]+", ""),
        (r"^_+|_+$", ""),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned if cleaned else "wakeword"
|
||||
|
||||
|
||||
# -------------------- In-memory session state --------------------
|
||||
# Single process-wide session record; guard every access with STATE_LOCK.
STATE: Dict[str, Any] = dict(
    # Phrase as typed by the user and its sanitized identifier form.
    raw_phrase=None,
    safe_word=None,
    # Recording plan for the current session.
    speakers_total=SPEAKERS_TOTAL_DEFAULT,
    takes_per_speaker=TAKES_PER_SPEAKER_DEFAULT,
    # Upload progress: count plus the saved file names.
    takes_received=0,
    takes=[],
    # Status of the most recent / current training run.
    training=dict(
        running=False,
        exit_code=None,
        log_lines=[],  # legacy in-memory tail (still maintained)
        log_path=None,  # path to recorder_training.log
        safe_word=None,
        # byte offset for efficient log tailing
        log_offset=0,
    ),
)

STATE_LOCK = threading.Lock()
|
||||
|
||||
|
||||
def _reset_personal_samples_dir():
    """Make sure PERSONAL_DIR exists and remove every *.wav directly inside it.

    Deletion is best-effort: a file that cannot be removed is skipped silently.
    """
    PERSONAL_DIR.mkdir(parents=True, exist_ok=True)
    for wav_file in PERSONAL_DIR.glob("*.wav"):
        try:
            wav_file.unlink()
        except Exception:
            # Best-effort cleanup; leave the file in place and move on.
            continue
|
||||
|
||||
|
||||
def _append_train_log(line: str):
    """Append one line to the in-memory training log, keeping the last 250.

    Trailing newlines are stripped; None is stored as an empty line.
    """
    text = (line or "").rstrip("\n")
    with STATE_LOCK:
        tail: List[str] = STATE["training"]["log_lines"]
        tail.append(text)
        overflow = len(tail) - 250
        if overflow > 0:
            # Trim in place so other references to the list see the same tail.
            del tail[:overflow]
|
||||
|
||||
|
||||
def _title_from_phrase(raw_phrase: str) -> str:
|
||||
# Keep it human-friendly for the optional <wake_word_title> argument
|
||||
s = re.sub(r"[^a-zA-Z0-9 ]+", " ", raw_phrase or "").strip()
|
||||
s = re.sub(r"\s+", " ", s)
|
||||
return s.title() if s else ""
|
||||
|
||||
|
||||
def _run_streamed(
    cmd: List[str],
    cwd: Path,
    log_path: Path,
    header: Optional[str] = None,
    env: Optional[Dict[str, str]] = None,
) -> int:
    """
    Run a command, streaming its combined stdout/stderr line by line to both:
      - the log file at `log_path` (appended, flushed per line)
      - STATE["training"]["log_lines"] via _append_train_log (UI, best-effort)

    Args:
        cmd: argv list handed to subprocess.Popen.
        cwd: working directory for the child process.
        log_path: log file to append to.
        header: optional banner line written before the command line.
        env: optional environment for the child (None inherits the parent's).

    Returns:
        The child's exit code.

    Raises:
        OSError: if the log file cannot be opened or the command cannot start.
        Any exception raised while streaming is re-raised after the child has
        been killed and reaped.
    """
    cmd_line = "→ " + " ".join(cmd)

    if header:
        _append_train_log(header)
    _append_train_log(cmd_line)

    with open(log_path, "a", encoding="utf-8") as lf:
        lf.write("\n" + ("=" * 80) + "\n")
        if header:
            lf.write(header + "\n")
        lf.write(cmd_line + "\n")
        lf.flush()

        # The context manager closes the stdout pipe and waits for the child,
        # so neither the file descriptor nor the process is leaked.
        with subprocess.Popen(
            cmd,
            cwd=str(cwd),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            # Tools may emit bytes that are not valid in the locale encoding;
            # replace them instead of aborting the whole stream.
            errors="replace",
            bufsize=1,
            env=env,
        ) as proc:
            assert proc.stdout is not None
            try:
                for line in proc.stdout:
                    lf.write(line)
                    lf.flush()
                    _append_train_log(line)
            except BaseException:
                # Without this the context manager would block waiting for a
                # child that nobody is reading from any more.
                proc.kill()
                raise

        return proc.returncode
|
||||
|
||||
|
||||
def _ensure_training_venv(log_path: Path) -> None:
    """
    Ensure DATA_DIR/.venv exists, running cli/setup_python_venv when it does not.

    The presence of `.venv/bin/activate` is the only readiness check.

    Raises:
        RuntimeError: if the setup script is missing, exits non-zero, or
            finishes without producing the activate script.
    """
    activate = DATA_DIR / ".venv" / "bin" / "activate"
    setup = CLI_DIR / "setup_python_venv"

    if activate.exists():
        _append_train_log("✅ Training venv found (skipping setup_python_venv)")
        return

    if not setup.exists():
        raise RuntimeError(f"Missing setup_python_venv at: {setup}")

    # Run through a login shell, from inside the data directory.
    shell_line = f"cd '{DATA_DIR}' && '{setup}' --data-dir='{DATA_DIR}'"
    exit_code = _run_streamed(
        ["bash", "-lc", shell_line],
        cwd=DATA_DIR,
        log_path=log_path,
        header="===== Ensuring Python venv (/data/.venv) =====",
    )

    if exit_code != 0:
        raise RuntimeError(f"setup_python_venv failed (exit_code={exit_code})")

    if not activate.exists():
        raise RuntimeError(f"setup_python_venv finished, but {activate} is still missing")
|
||||
|
||||
|
||||
def _ensure_training_datasets(log_path: Path) -> None:
    """
    Run cli/setup_training_datasets before every training run.

    The script is always invoked; skipping already-completed work is left to
    the underlying scripts. Cleanup flags come from the DATASET_CLEANUP_*
    module settings.

    Raises:
        RuntimeError: if the script is missing or exits non-zero.
    """
    setup = CLI_DIR / "setup_training_datasets"
    if not setup.exists():
        raise RuntimeError(f"Missing setup_training_datasets at: {setup}")

    # The CLI expects the literal words "true" / "false".
    cleanup_arch = str(bool(DATASET_CLEANUP_ARCHIVES)).lower()
    cleanup_inter = str(bool(DATASET_CLEANUP_INTERMEDIATE)).lower()

    shell_line = " ".join(
        [
            f"cd '{DATA_DIR}' &&",
            f"'{setup}'",
            f"--cleanup-archives='{cleanup_arch}'",
            f"--cleanup-intermediate-files='{cleanup_inter}'",
            f"--data-dir='{DATA_DIR}'",
        ]
    )

    exit_code = _run_streamed(
        ["bash", "-lc", shell_line],
        cwd=DATA_DIR,
        log_path=log_path,
        header="===== Ensuring training datasets (setup_training_datasets) =====",
    )

    if exit_code != 0:
        raise RuntimeError(f"setup_training_datasets failed (exit_code={exit_code})")
|
||||
|
||||
|
||||
def _read_log_tail_by_bytes(log_path: Path, max_bytes: int) -> str:
|
||||
"""
|
||||
Read up to the last max_bytes from a file (UTF-8 best effort).
|
||||
"""
|
||||
if not log_path.exists():
|
||||
return ""
|
||||
|
||||
try:
|
||||
size = log_path.stat().st_size
|
||||
start = max(0, size - max_bytes)
|
||||
with open(log_path, "rb") as f:
|
||||
f.seek(start)
|
||||
data = f.read()
|
||||
# If we started in the middle of a line, it's ok; UI will show partial.
|
||||
return data.decode("utf-8", errors="replace")
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def _read_log_tail_by_lines(log_path: Path, max_lines: int) -> str:
    """
    Return the last `max_lines` lines of a file joined with "\\n".

    Only the final TRAIN_LOG_MAX_BYTES bytes of the file are examined, so the
    first returned line may be partial. Missing file or any error yields "".
    """
    if not log_path.exists():
        return ""
    try:
        raw = _read_log_tail_by_bytes(log_path, TRAIN_LOG_MAX_BYTES)
        if not raw:
            return ""
        # A negative-start slice already returns the whole list when it holds
        # max_lines entries or fewer.
        return "\n".join(raw.splitlines()[-max_lines:])
    except Exception:
        return ""
|
||||
|
||||
|
||||
def _read_log_since_offset(log_path: Path, offset: int, max_bytes: int = 256 * 1024) -> Tuple[str, int]:
|
||||
"""
|
||||
Read log file incrementally starting from `offset`.
|
||||
Returns (new_text, new_offset). Caps bytes read per call.
|
||||
"""
|
||||
if not log_path.exists():
|
||||
return ("", offset)
|
||||
|
||||
try:
|
||||
size = log_path.stat().st_size
|
||||
# If file rotated/truncated, reset offset
|
||||
if offset > size:
|
||||
offset = 0
|
||||
|
||||
with open(log_path, "rb") as f:
|
||||
f.seek(offset)
|
||||
data = f.read(max_bytes)
|
||||
|
||||
new_offset = offset + len(data)
|
||||
text = data.decode("utf-8", errors="replace")
|
||||
return (text, new_offset)
|
||||
except Exception:
|
||||
return ("", offset)
|
||||
|
||||
|
||||
def _run_training_background(safe_word: str, allow_no_personal: bool):
    """
    Worker-thread body for a full training run.

    Steps: mark the run as started in STATE, write a banner to the log, make
    sure the venv and datasets exist, then run TRAIN_CMD with the wake word
    (and title, when one can be derived) while streaming output to the log
    file and the in-memory tail.

    The outcome is stored in STATE["training"]["exit_code"]: the process exit
    code, or 999 if any step raised. "running" is always cleared at the end.
    """
    with STATE_LOCK:
        raw_phrase = STATE.get("raw_phrase") or ""

    wake_word_title = _title_from_phrase(raw_phrase)

    log_path = Path(str(DATA_DIR / "recorder_training.log"))
    with STATE_LOCK:
        STATE["training"].update(
            running=True,
            exit_code=None,
            log_lines=[],
            safe_word=safe_word,
            log_path=str(log_path),
            log_offset=0,
        )

    # fresh header at the start of a run
    for banner_line in (
        "================================================================================",
        "===== Recorder Training Run =====",
        "================================================================================",
    ):
        _append_train_log(banner_line)

    # Make sure the log file exists and carries a separator for this run;
    # failure here is tolerated because the later stages reopen the file.
    try:
        with open(log_path, "a", encoding="utf-8") as lf:
            lf.write("\n" + ("=" * 80) + "\n")
            lf.write("===== Recorder Training Run =====\n")
            lf.write(("=" * 80) + "\n")
            lf.flush()
    except Exception:
        pass

    try:
        # 1) Ensure venv (auto-installs)
        _ensure_training_venv(log_path)

        # 2) Ensure datasets (auto-installs / skips if already present)
        _ensure_training_datasets(log_path)

        # 3) Run training; the title argument is only passed when non-empty.
        quoted_args = [f"'{safe_word}'"]
        if wake_word_title:
            quoted_args.append(f"'{wake_word_title}'")
        cmd_str = " ".join([TRAIN_CMD, *quoted_args])

        env = dict(
            os.environ,
            MWW_ALLOW_NO_PERSONAL="true" if allow_no_personal else "false",
        )

        _append_train_log("===== Training (train_wake_word) =====")
        _append_train_log(f"→ Running: {cmd_str}")

        with open(log_path, "a", encoding="utf-8") as lf:
            proc = subprocess.Popen(
                ["bash", "-lc", cmd_str],
                cwd=str(DATA_DIR),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env,
            )
            assert proc.stdout is not None
            for out_line in proc.stdout:
                lf.write(out_line)
                lf.flush()
                _append_train_log(out_line)

            rc = proc.wait()

        _append_train_log(f"✓ Training finished (exit_code={rc})")
        with STATE_LOCK:
            STATE["training"]["exit_code"] = rc

    except Exception as e:
        _append_train_log(f"✗ Training crashed: {e!r}")
        with STATE_LOCK:
            # Sentinel exit code meaning "the pipeline itself raised".
            STATE["training"]["exit_code"] = 999

    finally:
        with STATE_LOCK:
            STATE["training"]["running"] = False
|
||||
|
||||
|
||||
# -------------------- Routes --------------------
|
||||
@app.get("/", response_class=HTMLResponse)
def index():
    """Serve static/index.html, or a 500 page explaining that it is missing."""
    page = STATIC_DIR / "index.html"
    if page.exists():
        return HTMLResponse(page.read_text(encoding="utf-8"))
    return HTMLResponse(
        "<h3>Missing UI</h3><p>Create <code>static/index.html</code>.</p>",
        status_code=500,
    )
|
||||
|
||||
|
||||
@app.post("/api/start_session")
def start_session(payload: Dict[str, Any]):
    """
    Begin a new recording session.

    Expected payload keys:
        phrase (str, required): the wake phrase as typed by the user.
        speakers_total (optional): clamped to 1..10.
        takes_per_speaker (optional): clamped to 1..50.

    Resets the take counters in STATE and deletes existing *.wav files from
    PERSONAL_DIR. Training state is left untouched.

    Returns the normalized session description, or HTTP 400 when the phrase is
    missing / not a string or a count is not integer-like.
    """
    phrase = payload.get("phrase")
    # A non-string phrase used to raise AttributeError (HTTP 500).
    raw = phrase.strip() if isinstance(phrase, str) else ""
    if not raw:
        return JSONResponse({"ok": False, "error": "phrase is required"}, status_code=400)

    safe = safe_name(raw)

    try:
        speakers_total = int(payload.get("speakers_total") or SPEAKERS_TOTAL_DEFAULT)
        takes_per_speaker = int(payload.get("takes_per_speaker") or TAKES_PER_SPEAKER_DEFAULT)
    except (TypeError, ValueError, OverflowError):
        # Malformed client input is a client error, not a server crash.
        return JSONResponse(
            {"ok": False, "error": "speakers_total and takes_per_speaker must be integers"},
            status_code=400,
        )

    speakers_total = max(1, min(10, speakers_total))
    takes_per_speaker = max(1, min(50, takes_per_speaker))

    with STATE_LOCK:
        STATE["raw_phrase"] = raw
        STATE["safe_word"] = safe
        STATE["speakers_total"] = speakers_total
        STATE["takes_per_speaker"] = takes_per_speaker
        STATE["takes_received"] = 0
        STATE["takes"] = []
        # do not interrupt training if running

    # NOTE(review): this removes the personal samples even while a training
    # run is in progress — confirm that the trainer no longer needs them then.
    _reset_personal_samples_dir()

    return {
        "ok": True,
        "raw_phrase": raw,
        "safe_word": safe,
        "speakers_total": speakers_total,
        "takes_per_speaker": takes_per_speaker,
        "takes_total": speakers_total * takes_per_speaker,
        "personal_dir": str(PERSONAL_DIR),
        "data_dir": str(DATA_DIR),
    }
|
||||
|
||||
|
||||
@app.get("/api/session")
def get_session():
    """
    Return a snapshot of the current session and training state.

    Every mutable container is copied while STATE_LOCK is held, so the
    response can be serialized after the lock is released without observing
    concurrent changes made by the training thread.
    """
    with STATE_LOCK:
        training = dict(STATE["training"])
        # dict() is a shallow copy: without this the live log_lines list would
        # escape the lock while _append_train_log keeps mutating it.
        training["log_lines"] = list(training.get("log_lines") or [])

        return {
            "ok": True,
            "raw_phrase": STATE["raw_phrase"],
            "safe_word": STATE["safe_word"],
            "speakers_total": STATE["speakers_total"],
            "takes_per_speaker": STATE["takes_per_speaker"],
            "takes_received": STATE["takes_received"],
            "takes": list(STATE["takes"]),
            "training": training,
        }
|
||||
|
||||
|
||||
@app.post("/api/upload_take")
async def upload_take(
    speaker_index: int = Form(...),
    take_index: int = Form(...),
    file: UploadFile = File(...),
):
    """
    Store one uploaded recording as speakerNN_takeNN.wav in PERSONAL_DIR.

    Both indices are 1-based and must lie within the active session's limits.
    Re-uploading the same speaker/take overwrites the file without counting it
    twice. Uploads shorter than 44 bytes are rejected as empty/invalid.

    Returns {"ok", "saved_as", "takes_received"} or an HTTP 400 error.
    """
    with STATE_LOCK:
        safe_word = STATE["safe_word"]
        speakers_total = int(STATE["speakers_total"])
        takes_per_speaker = int(STATE["takes_per_speaker"])

    if not safe_word:
        return JSONResponse({"ok": False, "error": "No active session. Call /api/start_session first."}, status_code=400)

    if speaker_index < 1 or speaker_index > speakers_total:
        return JSONResponse({"ok": False, "error": f"speaker_index must be 1..{speakers_total}"}, status_code=400)

    if take_index < 1 or take_index > takes_per_speaker:
        return JSONResponse({"ok": False, "error": f"take_index must be 1..{takes_per_speaker}"}, status_code=400)

    PERSONAL_DIR.mkdir(parents=True, exist_ok=True)

    out_name = f"speaker{speaker_index:02d}_take{take_index:02d}.wav"
    out_path = PERSONAL_DIR / out_name

    data = await file.read()
    if not data or len(data) < 44:
        return JSONResponse({"ok": False, "error": "Empty/invalid file"}, status_code=400)

    out_path.write_bytes(data)

    with STATE_LOCK:
        if out_name not in STATE["takes"]:
            STATE["takes"].append(out_name)
        STATE["takes_received"] = len(STATE["takes"])
        # Capture the count while still holding the lock so the response
        # reflects this upload rather than a concurrent one.
        takes_received = STATE["takes_received"]

    return {"ok": True, "saved_as": out_name, "takes_received": takes_received}
|
||||
|
||||
|
||||
@app.post("/api/train")
def train_now(payload: Dict[str, Any] = None):
    """
    Validate the session and start training in a background thread.

    Payload (optional):
        allow_no_personal (bool): permit training with zero recorded takes.

    Rejections (HTTP 400), checked in this order:
        - training already running
        - no active session
        - zero takes and allow_no_personal not set  (code NO_PERSONAL_SAMPLES)
        - some takes, but fewer than min_required   (code NOT_ENOUGH_TAKES)

    The "running" flag is claimed inside the same critical section as the
    checks, so concurrent requests cannot both start a run.
    """
    payload = payload or {}
    allow_no_personal = bool(payload.get("allow_no_personal", False))

    rejection: Optional[Dict[str, Any]] = None

    with STATE_LOCK:
        safe_word = STATE["safe_word"]
        takes_received = int(STATE["takes_received"])
        speakers_total = int(STATE["speakers_total"])
        takes_per_speaker = int(STATE["takes_per_speaker"])
        training_running = bool(STATE["training"]["running"])

        takes_total = speakers_total * takes_per_speaker
        # At most three takes are required, fewer when the plan is smaller.
        min_required = max(1, min(3, takes_total))

        if training_running:
            rejection = {"ok": False, "error": "Training already running"}
        elif not safe_word:
            rejection = {"ok": False, "error": "No active session"}
        elif takes_received == 0 and not allow_no_personal:
            rejection = {
                "ok": False,
                "error": f"No personal voice samples recorded (0/{takes_total}).",
                "code": "NO_PERSONAL_SAMPLES",
                "message": "You can train without personal voices, or record samples first.",
                "takes_total": takes_total,
            }
        elif 0 < takes_received < min_required:
            rejection = {
                "ok": False,
                "error": f"Not enough takes yet ({takes_received}/{takes_total}).",
                "code": "NOT_ENOUGH_TAKES",
                "min_required": min_required,
                "takes_total": takes_total,
            }
        else:
            # Claim the run before releasing the lock; the worker sets the
            # flag again itself and clears it when it finishes.
            STATE["training"]["running"] = True

    if rejection is not None:
        return JSONResponse(rejection, status_code=400)

    try:
        t = threading.Thread(target=_run_training_background, args=(safe_word, allow_no_personal), daemon=True)
        t.start()
    except Exception:
        # The worker never ran, so nothing else will clear the claim.
        with STATE_LOCK:
            STATE["training"]["running"] = False
        raise

    return {
        "ok": True,
        "started": True,
        "safe_word": safe_word,
        "personal_samples_used": takes_received >= min_required,
        "allow_no_personal": allow_no_personal,
    }
|
||||
|
||||
|
||||
@app.get("/api/train_status")
def train_status(
    offset: int = Query(0, ge=0),
    max_bytes: int = Query(65536, ge=1024, le=262144),
    last_size: int = Query(0, ge=0),
    last_mtime: float = Query(0.0, ge=0.0),
):
    """
    Stream training output from the log file on disk.

    The client passes back the `next_offset`, `log_size` and `log_mtime` it
    received last time. If the file shrank or its mtime moved backwards, the
    file is treated as reset and reading restarts from offset 0.

    Returns {"ok": True, "training": {...}} where the training dict is a
    snapshot of STATE["training"] extended with:
        log_text     newly read text (UTF-8, undecodable bytes replaced)
        next_offset  offset to send with the next poll
        log_size     current file size in bytes
        log_mtime    current file modification time
    """
    with STATE_LOCK:
        tr = dict(STATE["training"])
        # Copy the list too: the shallow dict copy alone would hand the live
        # log_lines list to the serializer while the trainer mutates it.
        tr["log_lines"] = list(tr.get("log_lines") or [])
        log_path_str = tr.get("log_path")

    log_text = ""
    next_offset = offset
    log_size = 0
    log_mtime = 0.0

    if log_path_str:
        p = Path(log_path_str)
        if p.exists():
            try:
                st = p.stat()
                log_size = int(st.st_size)
                log_mtime = float(st.st_mtime)

                # Detect overwrite/truncate/reset:
                # - file shrank
                # - file mtime moved "backwards" (rare)
                # If anything indicates a reset, restart from beginning.
                if (log_size < last_size) or (last_mtime and log_mtime < last_mtime):
                    offset = 0

                # Clamp offset to current file size
                if offset > log_size:
                    offset = log_size

                # Read incrementally from the file
                with p.open("rb") as f:
                    f.seek(offset)
                    chunk = f.read(max_bytes)

                log_text = chunk.decode("utf-8", errors="replace")
                next_offset = offset + len(chunk)

            except Exception as e:
                # Surface the problem in the log pane instead of failing the poll.
                log_text = f"\n[log read error: {e!r}]\n"
                next_offset = offset

    tr["log_text"] = log_text
    tr["next_offset"] = next_offset
    tr["log_size"] = log_size
    tr["log_mtime"] = log_mtime

    return {"ok": True, "training": tr}
|
||||
|
||||
|
||||
@app.post("/api/reset_recordings")
def reset_recordings():
    """Delete the recorded *.wav files and zero the session's take counters."""
    _reset_personal_samples_dir()
    with STATE_LOCK:
        STATE.update(takes_received=0, takes=[])
    return {"ok": True}
|
||||
@@ -1,28 +1,10 @@
|
||||
# --- Core training (Microwakeword) ---
|
||||
# --- Packages needed by our scripts ---
|
||||
|
||||
numpy==1.26.4
|
||||
scipy==1.12.0
|
||||
librosa==0.10.2.post1
|
||||
soundfile==0.12.1
|
||||
soxr==0.5.0.post1
|
||||
audiomentations==0.38.0
|
||||
webrtcvad==2.0.10
|
||||
tqdm==4.67.1
|
||||
scikit-learn==1.6.0
|
||||
numba==0.60.0
|
||||
joblib==1.4.2
|
||||
pandas==2.2.3
|
||||
pymicro_features @ git+https://github.com/puddly/pymicro-features@e1d3f88183e12bb8af2df9e399ea157af7393762
|
||||
audio-metadata @ git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f
|
||||
bitstruct==8.19.0
|
||||
|
||||
# --- Piper sample generation ---
|
||||
piper-tts>=1.2.0
|
||||
piper-phonemize-cross==1.2.1
|
||||
|
||||
# --- Notebook / tooling ---
|
||||
ipykernel==6.29.5
|
||||
jupyterlab==4.3.4
|
||||
ipywidgets==8.1.5
|
||||
matplotlib-inline==0.1.7
|
||||
rich==13.9.4
|
||||
numba==0.63.1
|
||||
PyYAML==6.0.3
|
||||
|
||||
64
run_recorder.sh
Normal file
64
run_recorder.sh
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env bash
#
# Launch the recorder web server (recorder_server.py) under uvicorn.
#
# A dedicated virtualenv at ${DATA_DIR}/.recorder-venv is created on first
# run and pinned dependencies are installed once; later runs reuse it.
#
# Environment (all optional):
#   DATA_DIR                  data directory                (default: /data)
#   REC_HOST                  bind address                  (default: 0.0.0.0)
#   REC_PORT                  listen port                   (default: 8888)
#   REC_FASTAPI_VERSION       fastapi version to pin        (default: 0.115.6)
#   REC_UVICORN_VERSION       uvicorn version to pin        (default: 0.30.6)
#   REC_PY_MULTIPART_VERSION  python-multipart version      (default: 0.0.9)
set -euo pipefail

ROOTDIR="$(dirname "$(realpath "$0")")"

# Training convention
DATA_DIR="${DATA_DIR:-/data}"
HOST="${REC_HOST:-0.0.0.0}"
# NOTE(review): the dockerfile in this change EXPOSEs 8789 while the default
# here is 8888 — confirm which port is intended.
PORT="${REC_PORT:-8888}"

# Keep recorder deps separate from training venv
VENV_DIR="${DATA_DIR}/.recorder-venv"
PY="${VENV_DIR}/bin/python"
# Array instead of a string so a DATA_DIR containing whitespace is not
# word-split when the command is expanded.
PIP=("${PY}" -m pip)
# Marker file: its presence means the pinned deps were already installed.
PIN_FILE="${VENV_DIR}/.pinned_installed"

FASTAPI_VERSION="${REC_FASTAPI_VERSION:-0.115.6}"
UVICORN_VERSION="${REC_UVICORN_VERSION:-0.30.6}"
PY_MULTIPART_VERSION="${REC_PY_MULTIPART_VERSION:-0.0.9}"

echo "microWakeWord Recorder (Docker)"
echo "-> ROOTDIR: ${ROOTDIR}"
echo "-> DATA_DIR: ${DATA_DIR}"
echo "-> URL: http://localhost:${PORT}/"

mkdir -p "${DATA_DIR}"

# -----------------------------
# Recorder venv (separate)
# -----------------------------
if [[ ! -x "${PY}" ]]; then
  echo "Creating recorder venv: ${VENV_DIR}"
  python3 -m venv "${VENV_DIR}"
fi

# shellcheck disable=SC1091
source "${VENV_DIR}/bin/activate"

if [[ ! -f "${PIN_FILE}" ]]; then
  echo "Installing pinned recorder deps"
  "${PIP[@]}" install -U pip setuptools wheel
  "${PIP[@]}" install \
    "fastapi==${FASTAPI_VERSION}" \
    "uvicorn[standard]==${UVICORN_VERSION}" \
    "python-multipart==${PY_MULTIPART_VERSION}"
  # Only reached when both installs succeeded (set -e).
  touch "${PIN_FILE}"
else
  echo "Reusing existing recorder venv (no upgrades)"
fi

# -----------------------------
# Recorder server env
# -----------------------------
export DATA_DIR="${DATA_DIR}"
export STATIC_DIR="${ROOTDIR}/static"
export PERSONAL_DIR="${DATA_DIR}/personal_samples"

# IMPORTANT: leave training venv creation to /api/train inside recorder_server.py
# but still set TRAIN_CMD so the server knows how to invoke training once ready
export TRAIN_CMD="source '${DATA_DIR}/.venv/bin/activate' && train_wake_word --data-dir='${DATA_DIR}'"

echo "Launching uvicorn on ${HOST}:${PORT}"
cd "${ROOTDIR}"
# exec replaces this shell so uvicorn receives signals directly.
exec "${VENV_DIR}/bin/uvicorn" recorder_server:app --host "${HOST}" --port "${PORT}"
|
||||
23
startup.sh
23
startup.sh
@@ -1,23 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
: "${NB_UID:=0}"
|
||||
: "${NB_GID:=0}"
|
||||
umask 002
|
||||
|
||||
NOTEBOOK_SRC="/root/microWakeWord_training_notebook.ipynb"
|
||||
NOTEBOOK_DST="/data/microWakeWord_training_notebook.ipynb"
|
||||
|
||||
mkdir -p /data /data/generated_samples /data/personal_samples
|
||||
|
||||
if [[ ! -f "$NOTEBOOK_DST" ]]; then
|
||||
echo "No training notebook found in /data; copying default…"
|
||||
cp -n "$NOTEBOOK_SRC" "$NOTEBOOK_DST"
|
||||
fi
|
||||
|
||||
# Try to align ownership for convenience (ignore errors if not permitted)
|
||||
if [[ "$NB_UID" != "0" || "$NB_GID" != "0" ]]; then
|
||||
chown -R "$NB_UID:$NB_GID" /data || true
|
||||
fi
|
||||
|
||||
exec "$@"
|
||||
782
static/index.html
Normal file
782
static/index.html
Normal file
@@ -0,0 +1,782 @@
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>microWakeWord Recorder</title>
|
||||
<style>
|
||||
:root{
|
||||
--bg: #070709;
|
||||
--panel: rgba(18, 18, 22, 0.78);
|
||||
--panel2: rgba(24, 24, 30, 0.86);
|
||||
--text: #e9e9ee;
|
||||
--muted: #a2a2ad;
|
||||
--line: rgba(255,255,255,0.10);
|
||||
--orange: #ff8a2a;
|
||||
--orange2:#ffb066;
|
||||
--ok:#38d39f;
|
||||
--warn:#ffb020;
|
||||
--err:#ff4a4a;
|
||||
--shadow: 0 18px 50px rgba(0,0,0,0.45);
|
||||
--radius: 16px;
|
||||
}
|
||||
|
||||
html, body { height: 100%; }
|
||||
body {
|
||||
margin: 0;
|
||||
color: var(--text);
|
||||
font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;
|
||||
background:
|
||||
radial-gradient(900px 500px at 12% 6%, rgba(255, 138, 42, 0.12), transparent 55%),
|
||||
radial-gradient(700px 420px at 80% 14%, rgba(255, 176, 102, 0.09), transparent 60%),
|
||||
radial-gradient(800px 600px at 50% 100%, rgba(255, 138, 42, 0.06), transparent 55%),
|
||||
linear-gradient(180deg, #050506 0%, #09090d 100%);
|
||||
}
|
||||
|
||||
.wrap { max-width: 940px; margin: 0 auto; padding: 26px 18px 42px; }
|
||||
|
||||
h2 { margin: 0 0 8px; font-size: 22px; letter-spacing: 0.2px; }
|
||||
p { margin: 0 0 14px; color: var(--muted); line-height: 1.45; }
|
||||
|
||||
.topbar {
|
||||
display:flex; align-items:center; justify-content:space-between;
|
||||
gap: 12px; margin-bottom: 14px;
|
||||
}
|
||||
|
||||
.brand { display:flex; align-items:center; gap:10px; }
|
||||
.logo {
|
||||
width: 38px; height: 38px; border-radius: 12px;
|
||||
background:
|
||||
radial-gradient(circle at 30% 30%, rgba(255,176,102,0.55), rgba(255,138,42,0.25) 45%, rgba(0,0,0,0) 72%),
|
||||
linear-gradient(180deg, rgba(255,138,42,0.22), rgba(255,138,42,0.06));
|
||||
border: 1px solid rgba(255,138,42,0.30);
|
||||
box-shadow: 0 10px 28px rgba(255,138,42,0.08);
|
||||
}
|
||||
|
||||
.row { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; }
|
||||
|
||||
.card {
|
||||
border: 1px solid var(--line);
|
||||
background: linear-gradient(180deg, var(--panel), var(--panel2));
|
||||
border-radius: var(--radius);
|
||||
padding: 16px;
|
||||
margin-top: 14px;
|
||||
box-shadow: var(--shadow);
|
||||
backdrop-filter: blur(8px);
|
||||
}
|
||||
|
||||
.muted { color: var(--muted); }
|
||||
|
||||
input[type="text"], input[type="number"]{
|
||||
padding: 11px 12px;
|
||||
font-size: 15px;
|
||||
border-radius: 12px;
|
||||
border: 1px solid rgba(255,255,255,0.12);
|
||||
background: rgba(0,0,0,0.35);
|
||||
color: var(--text);
|
||||
outline: none;
|
||||
}
|
||||
input[type="text"] { width: 420px; max-width: 100%; }
|
||||
input[type="number"] { width: 120px; }
|
||||
input::placeholder { color: rgba(233,233,238,0.35); }
|
||||
|
||||
button {
|
||||
padding: 10px 14px;
|
||||
font-size: 13px;
|
||||
cursor: pointer;
|
||||
border-radius: 12px;
|
||||
border: 1px solid rgba(255,255,255,0.14);
|
||||
background: rgba(255,255,255,0.06);
|
||||
color: var(--text);
|
||||
transition: transform 0.04s ease, border-color .15s ease, background .15s ease;
|
||||
}
|
||||
button:hover { border-color: rgba(255,138,42,0.35); background: rgba(255,255,255,0.08); }
|
||||
button:active { transform: translateY(1px); }
|
||||
button:disabled { opacity: 0.45; cursor: not-allowed; }
|
||||
|
||||
.primary {
|
||||
border-color: rgba(255,138,42,0.40);
|
||||
background: linear-gradient(180deg, rgba(255,138,42,0.24), rgba(255,138,42,0.12));
|
||||
}
|
||||
.primary:hover { border-color: rgba(255,138,42,0.65); }
|
||||
|
||||
.pill {
|
||||
display:inline-block;
|
||||
padding: 4px 10px;
|
||||
border-radius: 999px;
|
||||
background: rgba(255,255,255,0.07);
|
||||
border: 1px solid rgba(255,255,255,0.10);
|
||||
color: var(--muted);
|
||||
font-size: 12px;
|
||||
}
|
||||
.pill.ok { color: var(--ok); border-color: rgba(56,211,159,0.25); background: rgba(56,211,159,0.08); }
|
||||
.pill.warn { color: var(--warn); border-color: rgba(255,176,32,0.25); background: rgba(255,176,32,0.08); }
|
||||
.pill.err { color: var(--err); border-color: rgba(255,74,74,0.25); background: rgba(255,74,74,0.08); }
|
||||
|
||||
details { margin-top: 10px; }
|
||||
summary { cursor: pointer; color: var(--orange2); }
|
||||
summary:hover { color: var(--orange); }
|
||||
|
||||
label { display:flex; gap:10px; align-items:center; }
|
||||
input[type="range"] { width: 240px; }
|
||||
|
||||
.meter {
|
||||
height: 10px;
|
||||
background: rgba(255,255,255,0.08);
|
||||
border-radius: 999px;
|
||||
overflow: hidden;
|
||||
width: 280px;
|
||||
border: 1px solid rgba(255,255,255,0.10);
|
||||
}
|
||||
.meter > div {
|
||||
height: 10px;
|
||||
width: 0%;
|
||||
background: linear-gradient(90deg, rgba(255,138,42,0.55), rgba(255,176,102,0.85));
|
||||
}
|
||||
|
||||
pre {
|
||||
background: rgba(0,0,0,0.55);
|
||||
color: #e6e6ea;
|
||||
padding: 12px;
|
||||
border-radius: 14px;
|
||||
overflow: auto;
|
||||
max-height: 300px;
|
||||
border: 1px solid rgba(255,255,255,0.10);
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.big { font-size: 16px; }
|
||||
|
||||
.divider {
|
||||
height: 1px;
|
||||
width: 100%;
|
||||
background: rgba(255,255,255,0.10);
|
||||
margin: 12px 0;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="wrap">
|
||||
<div class="topbar">
|
||||
<div class="brand">
|
||||
<div class="logo"></div>
|
||||
<div>
|
||||
<h2>🎙️ microWakeWord Personal Recorder</h2>
|
||||
<p class="muted">Enter a wake word, test TTS pronunciation, then record takes. Recording starts when you speak and stops after silence.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="row">
|
||||
<input id="phrase" type="text" placeholder='e.g. "tater totterson"' />
|
||||
<button id="startSessionBtn" class="primary">Start session</button>
|
||||
<button id="ttsBtn" disabled>🔊 Test TTS</button>
|
||||
<span id="sessionPill" class="pill">No session</span>
|
||||
</div>
|
||||
|
||||
<div class="row" style="margin-top:10px;">
|
||||
<label class="muted">Speakers
|
||||
<input id="speakersTotal" type="number" min="1" max="10" value="1" />
|
||||
</label>
|
||||
<label class="muted">Takes / speaker
|
||||
<input id="takesPerSpeaker" type="number" min="1" max="50" value="10" />
|
||||
</label>
|
||||
<span id="speakerPill" class="pill">Speaker: -</span>
|
||||
</div>
|
||||
|
||||
<details>
|
||||
<summary>Advanced (if it’s too sensitive / not sensitive enough)</summary>
|
||||
<div style="margin-top:10px;">
|
||||
<label>
|
||||
Start sensitivity
|
||||
<input id="startThresh" type="range" min="0.005" max="0.08" step="0.001" value="0.02" />
|
||||
<span id="startThreshVal" class="muted"></span>
|
||||
</label>
|
||||
<label>
|
||||
Silence stop (ms)
|
||||
<input id="silenceMs" type="range" min="300" max="2000" step="50" value="900" />
|
||||
<span id="silenceMsVal" class="muted"></span>
|
||||
</label>
|
||||
<label>
|
||||
Min take length (ms)
|
||||
<input id="minTakeMs" type="range" min="300" max="2000" step="50" value="650" />
|
||||
<span id="minTakeMsVal" class="muted"></span>
|
||||
</label>
|
||||
</div>
|
||||
</details>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="row">
|
||||
<button id="beginBtn" disabled class="primary">🎬 Begin recording</button>
|
||||
<button id="resetBtn" disabled>🧹 Reset recordings</button>
|
||||
<button id="trainBtn" disabled>🧠 Start training</button>
|
||||
<span id="status" class="pill">Idle</span>
|
||||
</div>
|
||||
|
||||
<div style="margin-top:12px;" class="row">
|
||||
<div class="meter"><div id="meterFill"></div></div>
|
||||
<span class="muted" id="meterText">Mic level</span>
|
||||
</div>
|
||||
|
||||
<div class="divider"></div>
|
||||
|
||||
<p class="big">
|
||||
Speaker: <b id="speakerNum">-</b> / <b id="speakerTotal">-</b>
|
||||
<span id="speakerState" class="pill">Waiting</span>
|
||||
</p>
|
||||
|
||||
<p class="big">
|
||||
Take: <b id="takeNum">0</b> / <b id="takeTotal">10</b>
|
||||
<span id="takeState" class="pill">Not recording</span>
|
||||
</p>
|
||||
|
||||
<div id="takesList" class="muted"></div>
|
||||
|
||||
<h4 style="margin-top: 18px; margin-bottom: 10px;">Training log</h4>
|
||||
<pre id="trainLog">(no training started)</pre>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Shorthand for document.getElementById: returns the element with the given id.
const $ = (id) => document.getElementById(id);
|
||||
|
||||
// Restyle a status "pill": the element's class list is replaced by "pill"
// plus an optional modifier class (e.g. "ok", "warn", "err"), and its text
// is replaced by `text`.
function setPill(el, text, cls) {
  const modifier = cls ? cls : "";
  el.textContent = text;
  el.className = "pill " + modifier;
}
|
||||
|
||||
// Thin wrapper around fetch() used for every backend call.
//
// Resolves with the parsed JSON body when the response's content-type is
// application/json, otherwise with the body text.
//
// Rejects with an Error on any non-2xx response. The message is taken from,
// in order: the body's `error` field, its `message` field, its `detail`
// field (the shape FastAPI uses for HTTPException / validation errors), the
// serialized body, or "HTTP <status>" when the body is empty. The Error also
// carries `details` (the decoded error body) and `status` (the HTTP status).
async function api(path, opts) {
  const res = await fetch(path, opts);
  const ct = res.headers.get("content-type") || "";

  // Read the body as text first: a malformed JSON payload on an error
  // response must not throw a SyntaxError that hides the real HTTP failure.
  const raw = await res.text();
  let data = raw;
  if (ct.includes("application/json")) {
    try {
      data = JSON.parse(raw);
    } catch (parseErr) {
      // A successful response that promised JSON but did not deliver it is
      // still an error for the caller; an error response falls back to text.
      if (res.ok) throw parseErr;
    }
  }

  if (!res.ok) {
    const err = (typeof data === "string") ? { error: data } : (data || {});

    // `detail` may be a string or a structured list/object.
    let detail = "";
    if (typeof err.detail === "string") {
      detail = err.detail;
    } else if (err.detail !== undefined && err.detail !== null) {
      detail = JSON.stringify(err.detail);
    }

    const fallback = raw ? JSON.stringify(err) : `HTTP ${res.status}`;
    const msg = err.error || err.message || detail || fallback;
    const e = new Error(msg);
    e.details = err;
    e.status = res.status;
    throw e;
  }
  return data;
}
|
||||
|
||||
// -------------------- log auto-scroll (sticky to bottom) --------------------
|
||||
// True when the scroll position of `el` is within `px` pixels of its bottom.
function isNearBottom(el, px = 40) {
  const distanceFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight;
  return distanceFromBottom <= px;
}
|
||||
|
||||
// Append `chunk` to the log element. The view follows the new text only if
// it was already scrolled (nearly) to the bottom before the append, so a
// reader who scrolled up is not yanked back down. Empty chunks are ignored.
function appendLogChunkAutoScroll(el, chunk) {
  if (chunk) {
    const wasAtBottom = isNearBottom(el);
    el.textContent += chunk;
    if (wasAtBottom) {
      el.scrollTop = el.scrollHeight;
    }
  }
}
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// Response of /api/start_session; null until a session has been started.
let session = null,
    // While true, meterLoop() feeds each RMS reading to recorderTick().
    isRunning = false;

// Web Audio objects created by ensureMic() and torn down by stopMicNow().
let stream = null,
    audioCtx = null,
    analyser = null,
    source = null;

// Per-take capture state.
let capturing = false,     // a take is currently being recorded
    startedAt = 0,         // performance.now() at capture start
    silenceStart = null,   // performance.now() when the current quiet stretch began
    floatChunks = [],      // Float32Array frames collected for the current take
    frameSize = 2048;      // buffer size passed to createScriptProcessor()

// Progress through speakers.
let currentSpeaker = 1,
    speakersTotal = 1;

// Progress through takes of the current speaker.
let currentTake = 0,
    takesPerSpeaker = 10;

// --- incremental log streaming state ---
// Polls /api/train_status?offset=<N> and appends training.log_text (reads /data/recorder_training.log)
let trainOffset = 0,
    trainingPollRunning = false,
    trainingPollAbort = false;
|
||||
|
||||
// Current value of the "Start sensitivity" slider, as a float.
function startThreshold() {
  const raw = $("startThresh").value;
  return parseFloat(raw);
}

// Current value of the "Silence stop (ms)" slider, as a base-10 integer.
function silenceStopMs() {
  const raw = $("silenceMs").value;
  return parseInt(raw, 10);
}

// Current value of the "Min take length (ms)" slider, as a base-10 integer.
function minTakeMs() {
  const raw = $("minTakeMs").value;
  return parseInt(raw, 10);
}
|
||||
|
||||
// Mirror the three "Advanced" slider values into the labels next to them.
function updateAdvancedLabels() {
  const thresholdText = startThreshold().toFixed(3);
  const silenceText = `${silenceStopMs()}ms`;
  const minTakeText = `${minTakeMs()}ms`;

  $("startThreshVal").textContent = thresholdText;
  $("silenceMsVal").textContent = silenceText;
  $("minTakeMsVal").textContent = minTakeText;
}

// Keep the labels live while a slider is dragged, and fill them in once now.
for (const sliderId of ["startThresh", "silenceMs", "minTakeMs"]) {
  $(sliderId).addEventListener("input", updateAdvancedLabels);
}
updateAdvancedLabels();
|
||||
|
||||
// Push the speaker/take counters from module state into the page.
function refreshUI() {
  const counters = [
    ["speakerNum", currentSpeaker],
    ["speakerTotal", speakersTotal],
    ["takeNum", currentTake],
    ["takeTotal", takesPerSpeaker],
  ];
  for (const [elementId, value] of counters) {
    $(elementId).textContent = String(value);
  }
  setPill($("speakerPill"), `Speaker ${currentSpeaker}/${speakersTotal}`);
}
|
||||
|
||||
// -------------------- mic lifecycle --------------------
|
||||
// Open the microphone and build the analyser graph if not already open.
//
// Throws when getUserMedia is unavailable, and rejects with whatever
// getUserMedia rejects with (e.g. permission denied). When a stream already
// exists this is a no-op. On success it also schedules meterLoop().
async function ensureMic() {
  const devices = navigator.mediaDevices;
  if (!(devices && devices.getUserMedia)) {
    throw new Error("Microphone not available here. Use https:// (or http://localhost) to record.");
  }
  if (stream) {
    return;
  }

  stream = await devices.getUserMedia({ audio: true, video: false });

  const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
  audioCtx = new AudioContextCtor();

  analyser = audioCtx.createAnalyser();
  analyser.fftSize = 2048;

  source = audioCtx.createMediaStreamSource(stream);
  source.connect(analyser);

  requestAnimationFrame(meterLoop);
}
|
||||
|
||||
// Stop recording and release the microphone: clears the running/capturing
// flags, detaches any active script processor, stops every track of the
// stream, closes the AudioContext and resets the level meter. Each teardown
// step is best-effort; errors from the audio APIs are deliberately ignored.
async function stopMicNow() {
  capturing = false;
  isRunning = false;

  const activeProc = window.__mw_proc;
  if (activeProc) {
    try { activeProc.disconnect(); } catch {}
    try { if (source) source.disconnect(activeProc); } catch {}
    window.__mw_proc = null;
  }

  if (stream) {
    try {
      for (const track of stream.getTracks()) track.stop();
    } catch {}
    stream = null;
  }

  if (audioCtx) {
    try { await audioCtx.close(); } catch {}
    audioCtx = null;
  }

  source = null;
  analyser = null;

  $("meterText").textContent = "Mic stopped";
  $("meterFill").style.width = "0%";
}
|
||||
|
||||
// Per-frame microphone level meter.
//
// Reads the analyser's time-domain samples, computes their RMS, updates the
// meter bar and label, and (while isRunning) passes the RMS to
// recorderTick(). Reschedules itself with requestAnimationFrame for as long
// as an analyser exists.
//
// When the analyser is gone (stopMicNow() ran) the loop ends instead of
// spinning. ensureMic() schedules a fresh loop every time the microphone is
// re-opened, so continuing to reschedule here would leave one extra loop
// alive per mic restart, each calling recorderTick() on every frame.
function meterLoop() {
  if (!analyser) {
    return;
  }

  const data = new Uint8Array(analyser.fftSize);
  analyser.getByteTimeDomainData(data);

  // Byte samples are centred on 128; rescale to roughly [-1, 1) before squaring.
  let sumSq = 0;
  for (let i = 0; i < data.length; i++) {
    const v = (data[i] - 128) / 128;
    sumSq += v * v;
  }
  const rms = Math.sqrt(sumSq / data.length);

  // Display gain of 600 for the bar width, clamped to 0..100 percent.
  const pct = Math.min(100, Math.max(0, rms * 600));
  $("meterFill").style.width = pct + "%";
  $("meterText").textContent = `Mic level (rms=${rms.toFixed(3)})`;

  if (isRunning) recorderTick(rms);

  requestAnimationFrame(meterLoop);
}
|
||||
|
||||
// -------------------- recording state machine --------------------
|
||||
// Voice-activated recording state machine, driven once per meter frame.
//
// Idle: start a capture as soon as `rms` reaches the start threshold.
// Capturing: a level below 65% of the start threshold counts as silence.
// Once silence has lasted silenceStopMs(), the take is finished and uploaded
// if it is at least minTakeMs() long; otherwise the silence timer restarts.
// Any louder frame cancels the running silence timer.
function recorderTick(rms) {
  const now = performance.now();
  const threshold = startThreshold();

  if (!capturing) {
    if (rms >= threshold) startCapture();
    return;
  }

  const isQuiet = rms < threshold * 0.65;
  if (!isQuiet) {
    silenceStart = null;
    return;
  }

  if (silenceStart === null) silenceStart = now;

  const quietLongEnough = (now - silenceStart) >= silenceStopMs();
  if (!quietLongEnough) return;

  const takeLongEnough = (now - startedAt) >= minTakeMs();
  if (takeLongEnough) {
    stopCaptureAndUpload();
  } else {
    silenceStart = now;
  }
}
|
||||
|
||||
// Begin recording a take: reset the per-take state, then attach a script
// processor to the mic source that copies every incoming frame into
// floatChunks while `capturing` is true. The processor is published on
// window.__mw_proc so the stop paths can disconnect it.
async function startCapture() {
  floatChunks = [];
  silenceStart = null;
  startedAt = performance.now();
  capturing = true;

  setPill($("takeState"), "Recording…", "warn");

  // Copy the frame: the buffer handed to the callback is not kept as-is.
  const collectFrame = (ev) => {
    if (capturing) {
      const frame = ev.inputBuffer.getChannelData(0);
      floatChunks.push(frame.slice(0));
    }
  };

  const node = audioCtx.createScriptProcessor(frameSize, 1, 1);
  source.connect(node);
  node.connect(audioCtx.destination);
  node.onaudioprocess = collectFrame;

  window.__mw_proc = node;
}
|
||||
|
||||
// Finish the current take: detach the script processor, merge the captured
// frames, encode them as a 16 kHz mono WAV and upload it to /api/upload_take.
//
// After a successful upload it advances the workflow:
//   - more takes left for this speaker: keep listening;
//   - speaker finished, more speakers left: pause, release the mic and wait
//     for "Begin recording";
//   - everything finished: release the mic and auto-start training.
//
// If encoding or the upload fails, the take is NOT counted: the take counter
// is rolled back, recording is paused and the error is shown to the user.
async function stopCaptureAndUpload() {
  capturing = false;
  setPill($("takeState"), "Processing…");

  const proc = window.__mw_proc;
  if (proc) {
    try { proc.disconnect(); } catch {}
    try { source.disconnect(proc); } catch {}
    window.__mw_proc = null;
  }

  // Show the number of the take being processed right away.
  currentTake += 1;
  refreshUI();

  // Set once the server has accepted the take; decides whether the counter
  // has to be rolled back in the error path.
  let saved = false;

  try {
    // Merge all captured frames into one contiguous buffer. This runs
    // synchronously, before the first await, so a capture that starts while
    // this take is still uploading cannot disturb it.
    let totalLen = 0;
    for (const c of floatChunks) totalLen += c.length;
    const merged = new Float32Array(totalLen);
    let off = 0;
    for (const c of floatChunks) { merged.set(c, off); off += c.length; }

    // Encoding sits inside the try so a failure here is reported like an
    // upload failure instead of leaving the UI stuck on "Processing…".
    const wavBlob = await floatToWav16kMono(merged, audioCtx.sampleRate);

    setPill($("status"), `Uploading speaker ${currentSpeaker} take ${currentTake}…`, "warn");

    const fd = new FormData();
    fd.append("speaker_index", String(currentSpeaker));
    fd.append("take_index", String(currentTake));
    fd.append("file", wavBlob, `take_${String(currentTake).padStart(2,"0")}.wav`);

    await api("/api/upload_take", { method:"POST", body: fd });
    saved = true;

    $("takesList").textContent = `Saved ${currentTake}/${takesPerSpeaker} takes for speaker ${currentSpeaker}/${speakersTotal}`;
    setPill($("status"), `Saved speaker ${currentSpeaker} take ${currentTake}/${takesPerSpeaker}`, "ok");

    if (currentTake >= takesPerSpeaker) {
      if (currentSpeaker >= speakersTotal) {
        setPill($("takeState"), "Done", "ok");
        setPill($("speakerState"), "All speakers done ✅", "ok");
        setPill($("status"), "All takes recorded ✅", "ok");

        await stopMicNow();
        await autoStartTraining();
        return;
      }

      currentSpeaker += 1;
      currentTake = 0;
      refreshUI();

      setPill($("speakerState"), `Speaker ${currentSpeaker - 1} complete ✅`, "ok");
      setPill($("takeState"), "Paused", "warn");
      setPill($("status"), `Ready for speaker ${currentSpeaker}. Click Begin recording.`, "warn");

      isRunning = false;
      $("beginBtn").disabled = false;

      await stopMicNow();
      return;
    }

    setPill($("speakerState"), `Speaker ${currentSpeaker}/${speakersTotal}`);
    setPill($("takeState"), "Listening…", "ok");

  } catch (e) {
    console.error(e);

    // The take never reached the server: give its number back so the
    // on-screen progress matches what is actually stored.
    if (!saved) {
      currentTake = Math.max(0, currentTake - 1);
      refreshUI();
    }

    setPill($("status"), "Upload failed", "err");
    setPill($("takeState"), "Error", "err");
    isRunning = false;
    $("beginBtn").disabled = false;
    alert("Upload failed: " + e.message);
  }
}
|
||||
|
||||
// -------------------- WAV encoding helpers --------------------
|
||||
// Resample mono float samples from `srcRate` to 16 kHz by rendering them
// through an OfflineAudioContext, then wrap the result as a 16-bit PCM WAV.
// Resolves with a Blob of type "audio/wav".
async function floatToWav16kMono(float32, srcRate) {
  const TARGET_RATE = 16000;

  const inputBuffer = audioCtx.createBuffer(1, float32.length, srcRate);
  inputBuffer.copyToChannel(float32, 0);

  // Output length scales with the rate ratio; never less than one frame.
  const scaledLength = Math.round(float32.length * TARGET_RATE / srcRate);
  const renderCtx = new OfflineAudioContext(1, Math.max(1, scaledLength), TARGET_RATE);

  const player = renderCtx.createBufferSource();
  player.buffer = inputBuffer;
  player.connect(renderCtx.destination);
  player.start(0);

  const resampled = await renderCtx.startRendering();
  const wavBytes = encodeWavPCM16(resampled.getChannelData(0), TARGET_RATE);
  return new Blob([wavBytes], { type: "audio/wav" });
}
|
||||
|
||||
// Encode mono float samples as a little-endian 16-bit PCM WAV file.
// Returns an ArrayBuffer holding the 44-byte RIFF/WAVE header followed by the
// samples. Each sample is clamped to [-1, 1] before scaling.
function encodeWavPCM16(float32, sampleRate) {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;

  const sampleCount = float32.length;
  const dataBytes = sampleCount * BYTES_PER_SAMPLE;
  const out = new ArrayBuffer(HEADER_BYTES + dataBytes);
  const dv = new DataView(out);

  const putAscii = (at, text) => {
    Array.from(text).forEach((ch, i) => dv.setUint8(at + i, ch.charCodeAt(0)));
  };

  // RIFF container header.
  putAscii(0, "RIFF");
  dv.setUint32(4, 36 + dataBytes, true);
  putAscii(8, "WAVE");

  // "fmt " chunk.
  putAscii(12, "fmt ");
  dv.setUint32(16, 16, true);                             // chunk size
  dv.setUint16(20, 1, true);                              // format tag 1 = PCM
  dv.setUint16(22, 1, true);                              // channel count
  dv.setUint32(24, sampleRate, true);                     // sample rate
  dv.setUint32(28, sampleRate * BYTES_PER_SAMPLE, true);  // byte rate
  dv.setUint16(32, BYTES_PER_SAMPLE, true);               // block align
  dv.setUint16(34, 16, true);                             // bits per sample

  // "data" chunk.
  putAscii(36, "data");
  dv.setUint32(40, dataBytes, true);

  // Negative samples scale by 0x8000, non-negative by 0x7fff, so both ends
  // of the int16 range are reachable without overflow.
  for (let i = 0, at = HEADER_BYTES; i < sampleCount; i += 1, at += BYTES_PER_SAMPLE) {
    const clamped = Math.max(-1, Math.min(1, float32[i]));
    const scale = clamped < 0 ? 0x8000 : 0x7fff;
    dv.setInt16(at, clamped * scale, true);
  }
  return out;
}
|
||||
|
||||
// -------------------- training (manual + auto) --------------------
|
||||
// Start a training run on the server.
//
// Fetches the session first; when no takes have been received the user must
// confirm training without personal voices (declining returns silently).
// Then locks the Train/Begin/Reset buttons, resets the log view and POSTs to
// /api/train. Log polling starts only after that request succeeded.
//
// `auto` only changes the status text shown while preparing.
// On failure the buttons are unlocked again and the error is re-thrown so
// the caller can report it.
async function startTrainingWithPrompt(auto=false) {
  const sess = await api("/api/session", { method: "GET" });
  const received = sess.takes_received || 0;
  const expected = (sess.speakers_total || 1) * (sess.takes_per_speaker || 10);

  // Training without personal samples is only ever requested after an
  // explicit confirmation.
  const allowNoPersonal = (received === 0);
  if (allowNoPersonal) {
    const proceed = confirm(
      `No personal voice samples recorded (0/${expected}).\n\nTrain anyway WITHOUT personal voices?`
    );
    if (!proceed) return;
  }

  // lock UI immediately
  for (const buttonId of ["trainBtn", "beginBtn", "resetBtn"]) {
    $(buttonId).disabled = true;
  }

  const preparingText = auto ? "Auto-starting training…" : "Preparing training environment…";
  setPill($("status"), preparingText, "warn");

  // reset streaming log state (we show recorder_training.log from the start of this run)
  trainOffset = 0;
  trainingPollAbort = false;
  $("trainLog").textContent = "(preparing…)\n";

  const request = {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ allow_no_personal: allowNoPersonal })
  };

  try {
    // Kick off training first
    await api("/api/train", request);

    // Only start polling AFTER training was successfully kicked off
    if (!trainingPollRunning) {
      trainingPollRunning = true;
      pollTrainingIncremental();
    }

    setPill($("status"), "Training running…", "warn");
  } catch (e) {
    for (const buttonId of ["trainBtn", "resetBtn", "beginBtn"]) {
      $(buttonId).disabled = false;
    }
    trainingPollAbort = true;
    trainingPollRunning = false;
    throw e;
  }
}
|
||||
|
||||
// Start training automatically once all takes are recorded. Never rejects:
// a failure is logged, shown in the status pill and reported with an alert.
async function autoStartTraining() {
  const reportFailure = (e) => {
    console.error(e);
    setPill($("status"), "Auto-train failed", "err");
    alert("Auto-start training failed: " + e.message);
  };

  await startTrainingWithPrompt(true).catch(reportFailure);
}
|
||||
|
||||
// "Start training" button: manual training start; failures are shown to the
// user first as an alert, then in the status pill.
$("trainBtn").addEventListener("click", async function onTrainClick() {
  const reportFailure = (e) => {
    alert("Train failed: " + e.message);
    setPill($("status"), "Train failed", "err");
  };

  await startTrainingWithPrompt(false).catch(reportFailure);
});
|
||||
|
||||
// Polls /api/train_status?offset=<trainOffset>
|
||||
// Expects JSON: { ok: true, training: { running, exit_code, log_text, next_offset } }
|
||||
// Stream the training log into the page.
//
// Every 1.5 s requests /api/train_status?offset=<trainOffset>, appends the
// returned log_text and advances trainOffset to next_offset. The loop ends
// when trainingPollAbort is set, or when the server reports the run is no
// longer running and has an exit code; in the latter case the buttons are
// unlocked and the final status is shown. trainingPollRunning is cleared on
// both exits. Errors from a single poll are ignored and retried.
async function pollTrainingIncremental() {
  const logEl = $("trainLog");
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  while (!trainingPollAbort) {
    try {
      const st = await api(`/api/train_status?offset=${trainOffset}`, { method:"GET" });
      const tr = st.training || {};

      const chunk = tr.log_text || "";

      if (chunk) {
        // If we got real output, replace the "(preparing…)" placeholder
        if (logEl.textContent.startsWith("(preparing…)")) {
          logEl.textContent = "";
        }
        appendLogChunkAutoScroll(logEl, chunk);
      }

      // Keep the old offset when the server did not send a numeric one.
      if (typeof tr.next_offset === "number") {
        trainOffset = tr.next_offset;
      }

      // Stop polling only when training has ended and exit_code is set
      const finished = !tr.running && tr.exit_code != null;
      if (finished) {
        for (const buttonId of ["trainBtn", "resetBtn", "beginBtn"]) {
          $(buttonId).disabled = false;
        }

        if (tr.exit_code === 0) {
          setPill($("status"), "Training finished ✅", "ok");
        } else {
          setPill($("status"), `Training ended (exit=${tr.exit_code})`, "err");
        }

        trainingPollRunning = false;
        return;
      }
    } catch (e) {
      // ignore transient polling errors
    }

    await pause(1500);
  }

  trainingPollRunning = false;
}
|
||||
|
||||
// -------------------- session + UI wiring --------------------
|
||||
// "Test TTS" button: speak the entered phrase with the browser's speech
// synthesis, cutting off anything that is still being spoken.
$("ttsBtn").addEventListener("click", function onTestTts() {
  const text = ($("phrase").value || "").trim();
  if (text) {
    const utterance = new SpeechSynthesisUtterance(text);
    speechSynthesis.cancel();
    speechSynthesis.speak(utterance);
  }
});
|
||||
|
||||
// "Start session" button: create a new recording session on the server and
// reset the page to its start-of-session state.
//
// The speaker / take counts are read from the number inputs, clamped to the
// ranges those inputs declare (1-10 and 1-50) and written back, so a typed
// 0 or out-of-range value cannot produce a session that can never complete.
// They are committed to module state only after the server accepted the
// session, so a failed request leaves the previous session's counters alone.
$("startSessionBtn").addEventListener("click", async () => {
  const phrase = ($("phrase").value || "").trim();
  if (!phrase) { alert("Enter a wake word phrase first."); return; }

  // Parse an integer input, fall back when it is empty/invalid, clamp it to
  // [lo, hi] and show the value that will actually be used.
  const readCount = (id, fallback, lo, hi) => {
    const parsed = parseInt($(id).value, 10);
    const value = Number.isFinite(parsed) ? Math.min(hi, Math.max(lo, parsed)) : fallback;
    $(id).value = String(value);
    return value;
  };
  const wantedSpeakers = readCount("speakersTotal", 1, 1, 10);
  const wantedTakes = readCount("takesPerSpeaker", 10, 1, 50);

  try {
    setPill($("sessionPill"), "Starting…", "warn");
    const data = await api("/api/start_session", {
      method: "POST",
      headers: {"Content-Type":"application/json"},
      body: JSON.stringify({ phrase, speakers_total: wantedSpeakers, takes_per_speaker: wantedTakes })
    });

    session = data;

    speakersTotal = wantedSpeakers;
    takesPerSpeaker = wantedTakes;
    currentSpeaker = 1;
    currentTake = 0;

    $("takesList").textContent = "";
    $("trainLog").textContent = "(no training started)";

    trainOffset = 0;

    // If a previous training poll loop is running, ask it to stop. The flag
    // must stay set until the loop has seen it (the loop sleeps between
    // polls), and trainingPollRunning is left for the loop to clear when it
    // exits; clearing either here would let a second loop start next to the
    // old one. startTrainingWithPrompt() resets the abort flag for a new run.
    if (trainingPollRunning) {
      trainingPollAbort = true;
    }

    refreshUI();

    await stopMicNow();

    setPill($("sessionPill"), `Session: ${data.safe_word}`, "ok");
    $("beginBtn").disabled = false;
    $("resetBtn").disabled = false;
    $("trainBtn").disabled = false;
    $("ttsBtn").disabled = false;

    setPill($("status"), "Ready", "ok");
    setPill($("speakerState"), "Waiting");
    setPill($("takeState"), "Not recording");
  } catch (e) {
    console.error(e);
    setPill($("sessionPill"), "Session failed", "err");
    alert("Start session failed: " + e.message);
  }
});
|
||||
|
||||
// "Reset recordings" button: ask the server to discard the recordings, then
// rewind the local speaker/take counters. A failure is reported with an
// alert and leaves the local state untouched.
$("resetBtn").addEventListener("click", async function onResetClick() {
  try {
    await api("/api/reset_recordings", {method:"POST"});
  } catch (e) {
    alert("Reset failed: " + e.message);
    return;
  }

  currentTake = 0;
  currentSpeaker = 1;
  $("takesList").textContent = "";
  refreshUI();
  setPill($("status"), "Recordings reset", "ok");
});
|
||||
|
||||
// "Begin recording" button: requires a session, opens the microphone and
// switches the recorder into listening mode for the current speaker.
$("beginBtn").addEventListener("click", async function onBeginClick() {
  if (!session) {
    alert("Start a session first.");
    return;
  }

  let micError = null;
  let micFailed = false;
  try {
    await ensureMic();
  } catch (e) {
    micFailed = true;
    micError = e;
  }
  if (micFailed) {
    alert("Mic permission failed: " + micError.message);
    return;
  }

  $("takesList").textContent = "";
  refreshUI();

  // From here on meterLoop() hands every level reading to recorderTick().
  isRunning = true;
  $("beginBtn").disabled = true;

  setPill($("speakerState"), `Speaker ${currentSpeaker}/${speakersTotal}`);
  setPill($("status"), "Listening… say the wake word now", "ok");
  setPill($("takeState"), "Listening…", "ok");
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
31
cli/train_wake_word → train_wake_word
Executable file → Normal file
31
cli/train_wake_word → train_wake_word
Executable file → Normal file
@@ -3,9 +3,10 @@ set -e
|
||||
|
||||
PROGPATH=$(realpath "$0")
|
||||
PROGDIR=$(dirname "${PROGPATH}")
|
||||
CLIDIR="${PROGDIR}/cli"
|
||||
|
||||
KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir )
|
||||
source "${PROGDIR}/shell.functions"
|
||||
source "${CLIDIR}/shell.functions"
|
||||
WAKE_WORD=${POSITIONAL_ARGS[0]}
|
||||
|
||||
if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
|
||||
@@ -62,7 +63,7 @@ fi
|
||||
|
||||
printf "%-80s\n" "=" | tr ' ' "="
|
||||
echo "===== Running '${WAKE_WORD}(${WAKE_WORD_TITLE})' generation, augmentation and training ====="
|
||||
"${PROGDIR}/cudainfo"
|
||||
"${CLIDIR}/cudainfo"
|
||||
echo
|
||||
START_TS=$EPOCHSECONDS
|
||||
|
||||
@@ -75,17 +76,13 @@ export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
|
||||
export GLOG_minloglevel=2
|
||||
export GRPC_VERBOSITY=ERROR
|
||||
|
||||
|
||||
"${PROGDIR}/wake_word_sample_generator" \
|
||||
"${CLIDIR}/wake_word_sample_generator" \
|
||||
--samples=${SAMPLES} \
|
||||
--batch-size=${BATCH_SIZE} \
|
||||
--data-dir="${DATA_DIR}" "${WAKE_WORD}"
|
||||
|
||||
POST_GEN_TS=$EPOCHSECONDS
|
||||
|
||||
ww="${WAKE_WORD// /_}"
|
||||
ww="${ww//./}"
|
||||
|
||||
AUGMENT=false
|
||||
GENERATED_DIR="${DATA_DIR}/work/wake_word_samples"
|
||||
AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented"
|
||||
@@ -96,7 +93,7 @@ AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented"
|
||||
if ${AUGMENT} ; then
|
||||
rm -rf "${AUGMENTED_DIR}" || :
|
||||
mkdir -p "${AUGMENTED_DIR}" || :
|
||||
"${PROGDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" || { rm -rf "${AUGMENTED_DIR}" ; exit 1 ; }
|
||||
"${CLIDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" || { rm -rf "${AUGMENTED_DIR}" ; exit 1 ; }
|
||||
else
|
||||
echo "Augmentation not required"
|
||||
echo
|
||||
@@ -104,18 +101,26 @@ fi
|
||||
|
||||
POST_AUGMENT_TS=$EPOCHSECONDS
|
||||
|
||||
"${PROGDIR}/wake_word_sample_trainer" --samples=${SAMPLES} --training-steps=${TRAINING_STEPS} --data-dir="${DATA_DIR}" \
|
||||
"${WAKE_WORD}" "${WAKE_WORD_TITLE}"
|
||||
"${CLIDIR}/wake_word_sample_trainer" \
|
||||
--samples=${SAMPLES} \
|
||||
--training-steps=${TRAINING_STEPS} \
|
||||
--data-dir="${DATA_DIR}" \
|
||||
"${WAKE_WORD}" "${WAKE_WORD_TITLE}"
|
||||
|
||||
if ${CLEANUP_WORK_DIR} ; then
|
||||
rm -rf "${DATA_DIR}/work/trained_models" "${DATA_DIR}/work/wake_word_samples" \
|
||||
"${DATA_DIR}/work/wake_word_samples_augmented" "${DATA_DIR}/work/last_wake_word" || :
|
||||
rm -rf \
|
||||
"${DATA_DIR}/work/trained_models" \
|
||||
"${DATA_DIR}/work/wake_word_samples" \
|
||||
"${DATA_DIR}/work/wake_word_samples_augmented" \
|
||||
"${DATA_DIR}/work/personal_augmented_features" \
|
||||
"${DATA_DIR}/work/last_wake_word" || :
|
||||
fi
|
||||
|
||||
END_TS=$EPOCHSECONDS
|
||||
|
||||
python -c $'print(f"{\'=\' * 80}")'
|
||||
printf "%44s\n\n" "Training Summary"
|
||||
"${PROGDIR}/system_summary"
|
||||
"${CLIDIR}/system_summary"
|
||||
echo
|
||||
print_elapsed_time --no-separators "${START_TS}" "${POST_GEN_TS}" "Generate ${SAMPLES} samples, ${BATCH_SIZE}/batch"
|
||||
print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augment ${SAMPLES} samples"
|
||||
Reference in New Issue
Block a user