mirror of
https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git
synced 2026-06-12 20:10:19 -06:00
cli + web recorder ui
This commit is contained in:
201
LICENSE
201
LICENSE
@@ -1,201 +0,0 @@
|
|||||||
Apache License
|
|
||||||
Version 2.0, January 2004
|
|
||||||
http://www.apache.org/licenses/
|
|
||||||
|
|
||||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
||||||
|
|
||||||
1. Definitions.
|
|
||||||
|
|
||||||
"License" shall mean the terms and conditions for use, reproduction,
|
|
||||||
and distribution as defined by Sections 1 through 9 of this document.
|
|
||||||
|
|
||||||
"Licensor" shall mean the copyright owner or entity authorized by
|
|
||||||
the copyright owner that is granting the License.
|
|
||||||
|
|
||||||
"Legal Entity" shall mean the union of the acting entity and all
|
|
||||||
other entities that control, are controlled by, or are under common
|
|
||||||
control with that entity. For the purposes of this definition,
|
|
||||||
"control" means (i) the power, direct or indirect, to cause the
|
|
||||||
direction or management of such entity, whether by contract or
|
|
||||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
||||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
||||||
|
|
||||||
"You" (or "Your") shall mean an individual or Legal Entity
|
|
||||||
exercising permissions granted by this License.
|
|
||||||
|
|
||||||
"Source" form shall mean the preferred form for making modifications,
|
|
||||||
including but not limited to software source code, documentation
|
|
||||||
source, and configuration files.
|
|
||||||
|
|
||||||
"Object" form shall mean any form resulting from mechanical
|
|
||||||
transformation or translation of a Source form, including but
|
|
||||||
not limited to compiled object code, generated documentation,
|
|
||||||
and conversions to other media types.
|
|
||||||
|
|
||||||
"Work" shall mean the work of authorship, whether in Source or
|
|
||||||
Object form, made available under the License, as indicated by a
|
|
||||||
copyright notice that is included in or attached to the work
|
|
||||||
(an example is provided in the Appendix below).
|
|
||||||
|
|
||||||
"Derivative Works" shall mean any work, whether in Source or Object
|
|
||||||
form, that is based on (or derived from) the Work and for which the
|
|
||||||
editorial revisions, annotations, elaborations, or other modifications
|
|
||||||
represent, as a whole, an original work of authorship. For the purposes
|
|
||||||
of this License, Derivative Works shall not include works that remain
|
|
||||||
separable from, or merely link (or bind by name) to the interfaces of,
|
|
||||||
the Work and Derivative Works thereof.
|
|
||||||
|
|
||||||
"Contribution" shall mean any work of authorship, including
|
|
||||||
the original version of the Work and any modifications or additions
|
|
||||||
to that Work or Derivative Works thereof, that is intentionally
|
|
||||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
||||||
or by an individual or Legal Entity authorized to submit on behalf of
|
|
||||||
the copyright owner. For the purposes of this definition, "submitted"
|
|
||||||
means any form of electronic, verbal, or written communication sent
|
|
||||||
to the Licensor or its representatives, including but not limited to
|
|
||||||
communication on electronic mailing lists, source code control systems,
|
|
||||||
and issue tracking systems that are managed by, or on behalf of, the
|
|
||||||
Licensor for the purpose of discussing and improving the Work, but
|
|
||||||
excluding communication that is conspicuously marked or otherwise
|
|
||||||
designated in writing by the copyright owner as "Not a Contribution."
|
|
||||||
|
|
||||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
||||||
on behalf of whom a Contribution has been received by Licensor and
|
|
||||||
subsequently incorporated within the Work.
|
|
||||||
|
|
||||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
||||||
this License, each Contributor hereby grants to You a perpetual,
|
|
||||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
||||||
copyright license to reproduce, prepare Derivative Works of,
|
|
||||||
publicly display, publicly perform, sublicense, and distribute the
|
|
||||||
Work and such Derivative Works in Source or Object form.
|
|
||||||
|
|
||||||
3. Grant of Patent License. Subject to the terms and conditions of
|
|
||||||
this License, each Contributor hereby grants to You a perpetual,
|
|
||||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
||||||
(except as stated in this section) patent license to make, have made,
|
|
||||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
||||||
where such license applies only to those patent claims licensable
|
|
||||||
by such Contributor that are necessarily infringed by their
|
|
||||||
Contribution(s) alone or by combination of their Contribution(s)
|
|
||||||
with the Work to which such Contribution(s) was submitted. If You
|
|
||||||
institute patent litigation against any entity (including a
|
|
||||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
||||||
or a Contribution incorporated within the Work constitutes direct
|
|
||||||
or contributory patent infringement, then any patent licenses
|
|
||||||
granted to You under this License for that Work shall terminate
|
|
||||||
as of the date such litigation is filed.
|
|
||||||
|
|
||||||
4. Redistribution. You may reproduce and distribute copies of the
|
|
||||||
Work or Derivative Works thereof in any medium, with or without
|
|
||||||
modifications, and in Source or Object form, provided that You
|
|
||||||
meet the following conditions:
|
|
||||||
|
|
||||||
(a) You must give any other recipients of the Work or
|
|
||||||
Derivative Works a copy of this License; and
|
|
||||||
|
|
||||||
(b) You must cause any modified files to carry prominent notices
|
|
||||||
stating that You changed the files; and
|
|
||||||
|
|
||||||
(c) You must retain, in the Source form of any Derivative Works
|
|
||||||
that You distribute, all copyright, patent, trademark, and
|
|
||||||
attribution notices from the Source form of the Work,
|
|
||||||
excluding those notices that do not pertain to any part of
|
|
||||||
the Derivative Works; and
|
|
||||||
|
|
||||||
(d) If the Work includes a "NOTICE" text file as part of its
|
|
||||||
distribution, then any Derivative Works that You distribute must
|
|
||||||
include a readable copy of the attribution notices contained
|
|
||||||
within such NOTICE file, excluding those notices that do not
|
|
||||||
pertain to any part of the Derivative Works, in at least one
|
|
||||||
of the following places: within a NOTICE text file distributed
|
|
||||||
as part of the Derivative Works; within the Source form or
|
|
||||||
documentation, if provided along with the Derivative Works; or,
|
|
||||||
within a display generated by the Derivative Works, if and
|
|
||||||
wherever such third-party notices normally appear. The contents
|
|
||||||
of the NOTICE file are for informational purposes only and
|
|
||||||
do not modify the License. You may add Your own attribution
|
|
||||||
notices within Derivative Works that You distribute, alongside
|
|
||||||
or as an addendum to the NOTICE text from the Work, provided
|
|
||||||
that such additional attribution notices cannot be construed
|
|
||||||
as modifying the License.
|
|
||||||
|
|
||||||
You may add Your own copyright statement to Your modifications and
|
|
||||||
may provide additional or different license terms and conditions
|
|
||||||
for use, reproduction, or distribution of Your modifications, or
|
|
||||||
for any such Derivative Works as a whole, provided Your use,
|
|
||||||
reproduction, and distribution of the Work otherwise complies with
|
|
||||||
the conditions stated in this License.
|
|
||||||
|
|
||||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
||||||
any Contribution intentionally submitted for inclusion in the Work
|
|
||||||
by You to the Licensor shall be under the terms and conditions of
|
|
||||||
this License, without any additional terms or conditions.
|
|
||||||
Notwithstanding the above, nothing herein shall supersede or modify
|
|
||||||
the terms of any separate license agreement you may have executed
|
|
||||||
with Licensor regarding such Contributions.
|
|
||||||
|
|
||||||
6. Trademarks. This License does not grant permission to use the trade
|
|
||||||
names, trademarks, service marks, or product names of the Licensor,
|
|
||||||
except as required for reasonable and customary use in describing the
|
|
||||||
origin of the Work and reproducing the content of the NOTICE file.
|
|
||||||
|
|
||||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
||||||
agreed to in writing, Licensor provides the Work (and each
|
|
||||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
||||||
implied, including, without limitation, any warranties or conditions
|
|
||||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
||||||
appropriateness of using or redistributing the Work and assume any
|
|
||||||
risks associated with Your exercise of permissions under this License.
|
|
||||||
|
|
||||||
8. Limitation of Liability. In no event and under no legal theory,
|
|
||||||
whether in tort (including negligence), contract, or otherwise,
|
|
||||||
unless required by applicable law (such as deliberate and grossly
|
|
||||||
negligent acts) or agreed to in writing, shall any Contributor be
|
|
||||||
liable to You for damages, including any direct, indirect, special,
|
|
||||||
incidental, or consequential damages of any character arising as a
|
|
||||||
result of this License or out of the use or inability to use the
|
|
||||||
Work (including but not limited to damages for loss of goodwill,
|
|
||||||
work stoppage, computer failure or malfunction, or any and all
|
|
||||||
other commercial damages or losses), even if such Contributor
|
|
||||||
has been advised of the possibility of such damages.
|
|
||||||
|
|
||||||
9. Accepting Warranty or Additional Liability. While redistributing
|
|
||||||
the Work or Derivative Works thereof, You may choose to offer,
|
|
||||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
||||||
or other liability obligations and/or rights consistent with this
|
|
||||||
License. However, in accepting such obligations, You may act only
|
|
||||||
on Your own behalf and on Your sole responsibility, not on behalf
|
|
||||||
of any other Contributor, and only if You agree to indemnify,
|
|
||||||
defend, and hold each Contributor harmless for any liability
|
|
||||||
incurred by, or claims asserted against, such Contributor by reason
|
|
||||||
of your accepting any such warranty or additional liability.
|
|
||||||
|
|
||||||
END OF TERMS AND CONDITIONS
|
|
||||||
|
|
||||||
APPENDIX: How to apply the Apache License to your work.
|
|
||||||
|
|
||||||
To apply the Apache License to your work, attach the following
|
|
||||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
|
||||||
replaced with your own identifying information. (Don't include
|
|
||||||
the brackets!) The text should be enclosed in the appropriate
|
|
||||||
comment syntax for the file format. We also recommend that a
|
|
||||||
file or class name and description of purpose be included on the
|
|
||||||
same "printed page" as the copyright notice for easier
|
|
||||||
identification within third-party archives.
|
|
||||||
|
|
||||||
Copyright [yyyy] [name of copyright owner]
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
540
README.md
540
README.md
@@ -1,123 +1,507 @@
|
|||||||
<div align="center">
|
# Run training from the command line
|
||||||
<img src="https://raw.githubusercontent.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker/refs/heads/main/mmw.png" alt="MicroWakeWord Trainer Logo" width="100" />
|
|
||||||
<h1>microWakeWord Trainer Docker</h1>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
# 🥔 MicroWakeWord Trainer – Tater Approved
|
## Overview
|
||||||
|
|
||||||
**✅ Tater Totterson tested & working on an NVIDIA RTX 3070 Laptop GPU (8 GB VRAM).**
|
With these scripts and Dockerfile, you can train new wake words from the
|
||||||
Easily train microWakeWord detection models with this pre-built Docker image and JupyterLab notebook.
|
command line without using a Jupyter notebook.
|
||||||
|
|
||||||
---
|
Differences between this Docker image and the Jupyter notebook image:
|
||||||
|
|
||||||
## 🚀 Quick Start
|
* The Python training environment isn't included in the image. Instead, a
|
||||||
|
"virtual environment" (venv) is created in the `/data` directory which you
|
||||||
|
will have mounted to a host directory. This cuts about 7gb from the image
|
||||||
|
and allows the virtualenv to persist across container instances.
|
||||||
|
|
||||||
Follow these steps to get up and running:
|
* The logic from the Jupyter notebook is contained in individual Python
|
||||||
|
and shell scripts
|
||||||
|
|
||||||
### 1️⃣ Pull the Pre-Built Docker Image
|
* No ports need to be exposed since the Jupyter notebook server isn't being
|
||||||
|
run.
|
||||||
|
|
||||||
```bash
|
## TL;DR
|
||||||
docker pull ghcr.io/tatertotterson/microwakeword:latest
|
|
||||||
|
For the impatient among you...
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ mkdir /some/work/directory # On a device with more than 150GB free space
|
||||||
|
$ docker build -t microwakeword-cli:latest .
|
||||||
|
$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
|
||||||
|
root@mww-cli:/# cd /data
|
||||||
|
root@mww-cli:/data# setup_python_venv
|
||||||
|
##### You have about 4 minutes to drink coffee
|
||||||
|
|
||||||
|
root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
|
||||||
|
##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
|
||||||
|
|
||||||
|
root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
|
||||||
|
##### You have about 30-45 minutes for a nap depending on available system resources.
|
||||||
|
##### You'll be informed of where to find your trained model.
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
Load the trained model on your device and give it a try but don't be surprised
|
||||||
|
if you get a lot of missed or false activations. Read on to find out why.
|
||||||
|
|
||||||
### 2️⃣ Run the Docker Container
|
## Get Started
|
||||||
|
|
||||||
```bash
|
Good, you stuck around! Now read the rest of the document before doing
|
||||||
docker run --rm -it \
|
anything.
|
||||||
--gpus all \
|
|
||||||
-p 8888:8888 \
|
### Using a GPU
|
||||||
-v $(pwd):/data \
|
|
||||||
ghcr.io/tatertotterson/microwakeword:latest
|
Having an Nvidia GPU available can cut the training time by up to half. The
|
||||||
|
open-source nouveau driver shipped with Linux kernels doesn't support CUDA
|
||||||
|
however, so if you have an Nvidia GPU and want to use it for training, you'll
|
||||||
|
need to install the official Nvidia driver from
|
||||||
|
https://www.nvidia.com/en-in/drivers/unix/
|
||||||
|
|
||||||
|
### Build the image
|
||||||
|
|
||||||
|
You can use either Docker or Podman as your container management tool.
|
||||||
|
`docker` is used in the examples but if you have podman, just substitute
|
||||||
|
the command.
|
||||||
|
|
||||||
|
Start by navigating to the directory that contains this README file and
|
||||||
|
the accompanying Dockerfile. Then...
|
||||||
|
|
||||||
|
|
||||||
|
```shell
|
||||||
|
docker build -t microwakeword-cli:latest .
|
||||||
```
|
```
|
||||||
|
|
||||||
**What these flags do:**
|
This should be fairly quick and result in an image that's about 320mb in size
|
||||||
- `--gpus all` → Enables GPU acceleration
|
as it's basically a standard Ubuntu 24.04 image with a few added tools.
|
||||||
- `-p 8888:8888` → Exposes JupyterLab on port 8888
|
|
||||||
- `-v $(pwd):/data` → Saves your work in the current folder
|
|
||||||
|
|
||||||
---
|
So why isn't a pre-built image available for download? Because it'll probably
|
||||||
|
take longer to download a pre-built image than for you to create it locally.
|
||||||
|
GitHub's container registry is notoriously erratic when it comes to download
|
||||||
|
throughput.
|
||||||
|
|
||||||
### 3️⃣ Open JupyterLab
|
### Create a host work directory
|
||||||
|
|
||||||
Visit [http://localhost:8888](http://localhost:8888) in your browser — the notebook UI will open.
|
This directory will contain the Python virtual environment plus all of the
|
||||||
|
downloaded and generated data needed for training and the final trained
|
||||||
|
models. A full environment will need about 150gb of free space but read
|
||||||
|
further to see how to reduce this.
|
||||||
|
|
||||||
---
|
Your `<host_data_dir>` will be mounted inside the container as `/data`.
|
||||||
|
|
||||||
### 4️⃣ Set Your Wake Word
|
The training container will start a Bash shell so if you have Bash
|
||||||
|
aliases or Bashy things you like, create a `.bashrc` file in your
|
||||||
|
`<host_data_dir>` and put them in there. It'll automatically be included
|
||||||
|
any time you enter the container.
|
||||||
|
|
||||||
At the **top of the notebook**, find this line:
|
### Create and start the container
|
||||||
|
|
||||||
```bash
|
There are lots of options that control container creation. The simplest example
|
||||||
TARGET_WORD = "hey_tater" # Change this to your desired wake word
|
will create the container and give you an interactive shell. When you exit the
|
||||||
|
shell, the container will be stopped and removed leaving your `<host_data_dir>`
|
||||||
|
intact.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ docker run -it --rm --gpus=all -v <host_data_dir>:/data microwakeword-cli:latest
|
||||||
```
|
```
|
||||||
|
|
||||||
Change `"hey_tater"` to your desired wake word (phonetic spellings often work best).
|
Options:
|
||||||
|
|
||||||
---
|
* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
|
||||||
|
* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
|
||||||
|
around and give it a name for training more than one wake word. You
|
||||||
|
can stop and remove it when you're ready.
|
||||||
|
* Add a `-d` option to start the container in the background and use `docker
|
||||||
|
attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
|
||||||
|
|
||||||
### 5️⃣ Run the Notebook
|
When the container starts, you'll see:
|
||||||
|
|
||||||
Run all cells in the notebook. This process will:
|
```text
|
||||||
- Generate wake word samples
|
=======================================================
|
||||||
- Train a detection model
|
WARNING: A python virtual environment wasn't found
|
||||||
- Output a quantized `.tflite` model ready for on-device use
|
at /data/.venv. You'll need to run setup_python_venv
|
||||||
|
before you'll be able to use this container for
|
||||||
|
training.
|
||||||
|
=======================================================
|
||||||
|
root@mww-cli:/#
|
||||||
|
```
|
||||||
|
|
||||||
---
|
Don't worry about the python WARNING right now. You'll be creating the
|
||||||
|
virtualenv in the next step.
|
||||||
|
|
||||||
### 6️⃣ Retrieve the Trained Model & JSON
|
If you've forgotten to create and/or mount your host data directory, you'll
|
||||||
|
see an additional warning:
|
||||||
|
|
||||||
When training finishes, download links for both the `.tflite` model and its `.json` manifest will be displayed in the last cell.
|
```text
|
||||||
|
=======================================================
|
||||||
|
WARNING: The /data directory is NOT mounted.
|
||||||
|
Running the training process without /data mounted
|
||||||
|
could add over 140Gb of python packages and training
|
||||||
|
files to this container's storage which is probably
|
||||||
|
NOT what you want.
|
||||||
|
|
||||||
---
|
You should remove this container and re-create it with
|
||||||
|
a 'docker run' option like '-v <host_work_dir>:/data'
|
||||||
|
making sure the host directory is on a device that has
|
||||||
|
enough free space.
|
||||||
|
=======================================================
|
||||||
|
```
|
||||||
|
|
||||||
## 🔄 Resetting to a Clean State
|
You can certainly continue but it's a "really bad idea"™ because your
|
||||||
|
container storage could grow from a few hundred mb to over 140gb.
|
||||||
|
|
||||||
If you need to start fresh:
|
At this point, you're in a Bash shell.
|
||||||
|
|
||||||
1. Delete the `data` folder that was mapped to your Docker container.
|
### Create the Python virtual environment
|
||||||
2. Restart the container using the steps above.
|
|
||||||
3. A fresh copy of the notebook will be placed into the `data` directory.
|
|
||||||
|
|
||||||
---
|
The Python virtual environment will contain all the software needed to train.
|
||||||
|
It gets created as `/data/.venv` and will take up about 11gb of disk space.
|
||||||
|
|
||||||
## 🎤 Optional: Personal Voice Samples
|
The scripts that do all the work will be in the container's PATH so to set up
|
||||||
|
the virtual environment and install all of the packages, just run:
|
||||||
|
|
||||||
In addition to synthetic TTS samples, the trainer can optionally use your own real voice recordings to significantly improve accuracy for your voice and environment.
|
```text
|
||||||
|
setup_python_venv [ --verbose ]
|
||||||
|
|
||||||
### How it works
|
Options:
|
||||||
- If a folder named personal_samples/ exists and contains .wav files, the trainer will:
|
|
||||||
- Automatically extract features from those recordings
|
|
||||||
- Include them during training alongside the synthetic TTS data
|
|
||||||
- Up-weight your personal samples during training for better real-world performance
|
|
||||||
|
|
||||||
No extra flags or configuration are required — it is detected automatically.
|
--verbose: Print the detailed "pip install" output.
|
||||||
|
|
||||||
### How to use it
|
```
|
||||||
1. Create a folder in the repo root:
|
|
||||||
mkdir personal_samples
|
|
||||||
|
|
||||||
2. Record yourself saying the wake word naturally and save the files as .wav:
|
When the installation is finished, a test of the major components will be
|
||||||
personal_samples/
|
run.
|
||||||
hey_tater_01.wav
|
|
||||||
hey_tater_02.wav
|
Once the process is done, you should change to the `/data` directory and
|
||||||
hey_tater_03.wav
|
activate the virtual environment with:
|
||||||
...
|
|
||||||
|
```shell
|
||||||
|
root@mww-cli:/# cd /data
|
||||||
|
root@mww-cli:/data# source .venv/bin/activate
|
||||||
|
(.venv) root@mww-cli:/data#
|
||||||
|
```
|
||||||
|
|
||||||
|
Technically, you don't need to do either of these since the scripts
|
||||||
|
are in the PATH and they know to use the `/data` directory for everything.
|
||||||
|
It's more of an "if you're interested" thing.
|
||||||
|
|
||||||
|
At this point, you have a container with all software installed.
|
||||||
|
|
||||||
|
## Get the reference data
|
||||||
|
|
||||||
|
The training process itself relies on a significant amount of audio reference
|
||||||
|
data that creates a simulated "audio environment" that your wake word will be
|
||||||
|
trained in. These "training datasets" include things like varying amounts of
|
||||||
|
reverberation, background music, background conversations, background noise,
|
||||||
|
etc. All said and done, it amounts to about 30gb of audio but with the
|
||||||
|
downloaded archives and extracted intermediate files, you'll need about 85gb
|
||||||
|
of free space. Thankfully, you only need to download the files once no
|
||||||
|
matter how many wake words you want to train and since it's stored in
|
||||||
|
`/data`, you can even remove the docker container and recreate it without
|
||||||
|
losing any of it. There are 4 datasets that are required.
|
||||||
|
|
||||||
|
This is a three stage process...
|
||||||
|
|
||||||
|
1. Download zipfiles or tarballs. (about 30gb)
|
||||||
|
2. Extract them. (about 50gb)
|
||||||
|
3. Convert them into the final form. (about 31gb)
|
||||||
|
|
||||||
|
NOTE: The sizes add up to more than the 85gb stated earlier because one
|
||||||
|
of the datasets doesn't need to be converted and is counted in both
|
||||||
|
steps 2 and 3. You really do only need 85gb.
|
||||||
|
|
||||||
|
To download the archives, unpack them, and convert the audio to what's needed
|
||||||
|
by the training process, run:
|
||||||
|
|
||||||
|
```text
|
||||||
|
setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--cleanup-archives: Automatically delete the tarballs or zipfiles after
|
||||||
|
they've been extracted.
|
||||||
|
|
||||||
|
--cleanup-intermediate-files: Automatically delete the intermediate files
|
||||||
|
after they've been converted.
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
On a 1gb/sec Internet connection, this will take about 25 minutes.
|
||||||
|
|
||||||
|
The script detects if the datasets have already been downloaded, extracted
|
||||||
|
and/or converted and skips those steps as appropriate so if you've run the
|
||||||
|
script without the cleanup options, you can just run it again with those
|
||||||
|
options to clean them up.
|
||||||
|
|
||||||
|
Now you're ready to train a wake word. Almost.
|
||||||
|
|
||||||
|
## Train a Wake Word
|
||||||
|
|
||||||
|
Training is done in 3 stages.
|
||||||
|
|
||||||
|
1. Generate thousands of samples of the wake word with various voices,
|
||||||
|
pitches, speeds, inflections, etc.
|
||||||
|
2. Augment the samples with the training datasets to add background noise, etc.
|
||||||
|
3. Run the Tensorflow training.
|
||||||
|
|
||||||
|
### Generate a sample for verification
|
||||||
|
|
||||||
|
Before you start the full process, you're going to want to generate a single
|
||||||
|
wake word sample and play it back to ensure it sounds right. The wake word
|
||||||
|
should be spelled phonetically to give the sample generator the best chance
|
||||||
|
of success.
|
||||||
|
|
||||||
|
```text
|
||||||
|
root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
|
||||||
|
===== Generating 1 sample of 'hey buster' =====
|
||||||
|
Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
|
||||||
|
Successfully loaded the model
|
||||||
|
Batch 1/0 complete
|
||||||
|
Done
|
||||||
|
Sample available at /data/work/test_sample/hey_buster.wav
|
||||||
|
Play it from your host.
|
||||||
|
```
|
||||||
|
|
||||||
|
You should then play that file from your host. The reason I used "hey buster"
|
||||||
|
as the wake word is to demonstrate why it's important to generate and listen
|
||||||
|
to a sample. If you try that exact input and play it back, you'll notice
|
||||||
|
that the generator didn't capture the "er" at the end very well. To get it to
|
||||||
|
do so, I had to add a period on the end as a "spacer".
|
||||||
|
"hey buster." worked much better.
|
||||||
|
|
||||||
|
When you're happy with the sample, you can run the full process.
|
||||||
|
|
||||||
|
### Run the full training process
|
||||||
|
|
||||||
|
```text
|
||||||
|
train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
|
||||||
|
[ --training-steps=<steps> ] [ --cleanup-work-dir ]
|
||||||
|
<wake_word> [ <wake_word_title> ]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--samples: The number of samples to generate for the wake word.
|
||||||
|
Default: 20000
|
||||||
|
|
||||||
|
--batch-size: How many samples should be generated at a time. The more
|
||||||
|
samples, the more memory is needed.
|
||||||
|
Default: 100
|
||||||
|
|
||||||
|
--training-steps: Number of training steps. More training steps means better
|
||||||
|
detection and false positive rates but also more time to train.
|
||||||
|
Default: 25000
|
||||||
|
|
||||||
|
--cleanup-work-dir: Delete the /data/work directory after successful training.
|
||||||
|
Default: false
|
||||||
|
|
||||||
|
<wake_word> The word to train spelled phonetically.
|
||||||
|
Required.
|
||||||
|
|
||||||
|
<wake_word_title> An optional pretty name to save to the json metadata file.
|
||||||
|
Default: The wake word with individual words capitalized
|
||||||
|
and punctuation removed.
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
By default, the training process creates 20,000 samples of your wake word and
|
||||||
|
runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results)
|
||||||
|
in the [Extra Credit](#extra-credit) section below for
|
||||||
|
why these are the defaults. Depending on resources available, this could take
|
||||||
|
between 30 and 60 minutes.
|
||||||
|
|
||||||
|
The resulting tflite model files and logs will be placed in the
|
||||||
|
`/data/output/<timestamp>-<wake_word>-<samples>-<training-steps>` directory
|
||||||
|
and will therefore be available from your host in the directory you mapped
|
||||||
|
`/data` to. File names will have non-filename-friendly characters in your
|
||||||
|
wake word changed to underscores to make things easier. You'll need both the
|
||||||
|
tflite and json files to load on your device. Exactly how you load them
|
||||||
|
depends on the device and is beyond the scope of this project.
|
||||||
|
|
||||||
|
The only real measure of success is how well the resulting model works
|
||||||
|
on a real device. If you encounter too many missed or false activations,
|
||||||
|
increasing the number of samples would probably improve the results more
|
||||||
|
than increasing the number of training steps. See
|
||||||
|
[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
|
||||||
|
|
||||||
|
The output from the last step is filtered somewhat by the script but is still quite
|
||||||
|
verbose. The full log will be available in the output directory as
|
||||||
|
`training.log` if you're interested. Interpreting the log is beyond the scope
|
||||||
|
of this project however.
|
||||||
|
|
||||||
|
You can train additional wake words or change the number of samples and
|
||||||
|
training steps by simply running `train_wake_word` again. No need to repeat
|
||||||
|
any of the earlier setup steps. If you change the wake word or the number of
|
||||||
|
wake word samples, the work directory will be deleted and all 3 steps re-run.
|
||||||
|
If you only change the number of training steps, the data from the first two
|
||||||
|
steps is still valid and only the 3rd step is run.
|
||||||
|
|
||||||
|
All of the intermediate data is stored in the `/data/work` directory which will
|
||||||
|
grow to about 17gb with 20,000 wake word samples. Once the tflite model is
|
||||||
|
successfully generated and you're happy with the results, you can delete the
|
||||||
|
`/data/work` directory.
|
||||||
|
|
||||||
|
### Training more than one wake word
|
||||||
|
|
||||||
|
Once you have a container running, you
|
||||||
|
can easily train multiple wake words from your host:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
for wp in "hey_alexa" "hey_jenkins" ; do
|
||||||
|
docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
### Training time examples
|
||||||
|
|
||||||
|
Training times depend on lots of things. These are examples only.
|
||||||
|
Your Mileage May Vary!!!
|
||||||
|
|
||||||
|
```text
|
||||||
|
===============================================================================
|
||||||
|
Training Summary
|
||||||
|
|
||||||
|
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||||
|
GPU: N/A
|
||||||
|
|
||||||
|
Generate 10000 samples, 100/batch Elapsed time: 0:06:17
|
||||||
|
Augment 10000 samples Elapsed time: 0:04:05
|
||||||
|
10000 training steps Elapsed time: 0:15:04
|
||||||
|
==================================================
|
||||||
|
Total Elapsed time: 0:25:26
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
Training Summary
|
||||||
|
|
||||||
|
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||||
|
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||||
|
|
||||||
|
Generate 10000 samples, 100/batch Elapsed time: 0:00:29
|
||||||
|
Augment 10000 samples Elapsed time: 0:03:40
|
||||||
|
10000 training steps Elapsed time: 0:08:00
|
||||||
|
======================================================
|
||||||
|
Total Elapsed time: 0:12:09
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
Training Summary
|
||||||
|
|
||||||
|
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||||
|
GPU: N/A
|
||||||
|
|
||||||
|
Generate 20000 samples, 100/batch Elapsed time: 0:10:38
|
||||||
|
Augment 20000 samples Elapsed time: 0:07:04
|
||||||
|
25000 training steps Elapsed time: 0:25:21
|
||||||
|
======================================================
|
||||||
|
Total Elapsed time: 0:43:03
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
Training Summary
|
||||||
|
|
||||||
|
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||||
|
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||||
|
|
||||||
|
Generate 20000 samples, 100/batch Elapsed time: 0:00:53
|
||||||
|
Augment 20000 samples Elapsed time: 0:07:05
|
||||||
|
25000 training steps Elapsed time: 0:19:13
|
||||||
|
======================================================
|
||||||
|
Total Elapsed time: 0:27:11
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
Training Summary
|
||||||
|
|
||||||
|
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||||
|
GPU: N/A
|
||||||
|
|
||||||
|
Generate 50000 samples, 100/batch Elapsed time: 0:30:47
|
||||||
|
Augment 50000 samples Elapsed time: 0:20:22
|
||||||
|
40000 training steps Elapsed time: 1:01:51
|
||||||
|
==================================================
|
||||||
|
Total Elapsed time: 1:53:00
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
Training Summary
|
||||||
|
|
||||||
|
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
||||||
|
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
||||||
|
|
||||||
|
Generate 50000 samples, 100/batch Elapsed time: 0:02:08
|
||||||
|
Augment 50000 samples Elapsed time: 0:19:13
|
||||||
|
40000 training steps Elapsed time: 0:42:23
|
||||||
|
======================================================
|
||||||
|
Total Elapsed time: 1:03:44
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
The sample generation process is really the only one that uses multiple CPUs so
|
||||||
|
having fewer CPU threads available will probably make little difference.
|
||||||
|
|
||||||
|
## Extra Credit
|
||||||
|
|
||||||
|
### Training defaults
|
||||||
|
|
||||||
|
If you plan on training multiple wake words, you can set your own default
|
||||||
|
training parameters by creating a `/data/.defaults.env` file with the
|
||||||
|
following contents:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# Variable names follow the command line parameters converted to upper case
|
||||||
|
# and with the dashes ('-') converted to underscores ('_').
|
||||||
|
export SAMPLES=10000
|
||||||
|
export TRAINING_STEPS=10000
|
||||||
|
|
||||||
|
# Don't use the GPU for any operations. Stick with the CPU only.
|
||||||
|
##export CUDA_VISIBLE_DEVICES=-1
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Examine your model with Tensorboard
|
||||||
|
|
||||||
|
Tensorboard is a web-based graphical model viewer. You can use it to get an
|
||||||
|
idea of how many training steps are needed before accuracy results stop
|
||||||
|
improving. To use it, you'll have to expose port 6006 by adding `-p
|
||||||
|
6006:6006` to your `docker run` command line. If you didn't, don't worry.
|
||||||
|
Remember, the /data directory is mapped to a directory on your host so you
|
||||||
|
can simply stop and delete the current container and recreate it with the new
|
||||||
|
`docker run` command. No need to re-run any of the setup or training steps.
|
||||||
|
|
||||||
|
To start Tensorboard, run:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
root@mww-cli:/# cd /data
|
||||||
|
root@mww-cli:/data# source .venv/bin/activate
|
||||||
|
(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
|
||||||
|
```
|
||||||
|
|
||||||
|
Now on your host, point your browser at `http://localhost:6006/`,
|
||||||
|
click "SCALARS" at the top and take a look at the various charts. You'll see
|
||||||
|
a "train" and "validation" item for each training run you've performed. It's
|
||||||
|
the "train" items you're interested in.
|
||||||
|
|
||||||
|
<a id="tensorboard-results"></a>
|
||||||
|
|
||||||
|
You have to be a Tensorflow expert to decipher most of the charts but
|
||||||
|
the "Accuracy" chart for this particular wake word and 50,000 samples would
|
||||||
|
seem to indicate that there's very little improvement after about 20,000
|
||||||
|
training steps.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
In contrast, with only 5,000 wake word samples, there's still improvement to be had after
|
||||||
|
20,000 training steps.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Given that it's faster to generate wake word samples than it is to train,
|
||||||
|
20,000 samples and 25,000 training steps seems like a good compromise. This
|
||||||
|
chart has a bit less smoothing to show a bit more detail and includes the
|
||||||
|
50,000 sample run as well. This run took only 27 minutes as opposed to the
|
||||||
|
63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
|
||||||
|
25,000 are the defaults for these scripts.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
3. Run the training script as normal:
|
|
||||||
|
|
||||||
If personal samples are found, you’ll see a message during training indicating they are being included.
|
|
||||||
|
|
||||||
### Recording tips
|
|
||||||
- 10–30 recordings is usually enough to see a noticeable improvement
|
|
||||||
- Vary distance, volume, and tone slightly
|
|
||||||
- Record in the same environment where the wake word will be used (room noise matters)
|
|
||||||
- Use 16-bit WAV files if possible (most recorders do this by default)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🙌 Credits
|
|
||||||
|
|
||||||
This project builds upon the excellent work of [kahrendt/microWakeWord](https://github.com/kahrendt/microWakeWord).
|
|
||||||
Huge thanks to the original authors for their contributions to the open-source community!
|
|
||||||
|
|||||||
BIN
cli/.DS_Store
vendored
Normal file
BIN
cli/.DS_Store
vendored
Normal file
Binary file not shown.
@@ -1,27 +0,0 @@
|
|||||||
# Since this is a pure python environment, we don't need to start
|
|
||||||
# with a huge CUDA image. A standard Ubuntu image will do.
|
|
||||||
FROM ubuntu:24.04
|
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive \
|
|
||||||
PYTHONUNBUFFERED=1 \
|
|
||||||
PIP_NO_CACHE_DIR=1 \
|
|
||||||
PIP_ROOT_USER_ACTION=ignore \
|
|
||||||
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
|
|
||||||
PATH="/root/mww-scripts:${PATH}"
|
|
||||||
|
|
||||||
# System deps
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
||||||
python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
|
|
||||||
git wget curl unzip ca-certificates nano less \
|
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
|
||||||
&& mkdir -p /data
|
|
||||||
|
|
||||||
COPY --chown=root:root --chmod=0755 .bashrc /root/
|
|
||||||
COPY --chown=root:root --chmod=0755 setup_* wake_word_sample* train_wake_word \
|
|
||||||
test_python cudainfo system_summary shell.functions requirements.txt /root/mww-scripts/
|
|
||||||
|
|
||||||
# Docker and Podman send the CMD a SIGTERM when you "stop" the container. Unfortunately, bash
|
|
||||||
# normally doesn't exit when it receives a SIGTERM so docker/podman has to wait for the "stop"
|
|
||||||
# to timeout then SIGKILL the container.
|
|
||||||
# This little scriptlet causes bash to exit immediately when it receives the SIGTERM.
|
|
||||||
CMD ["/usr/bin/bash", "-c", "exec /usr/bin/bash --rcfile <(echo '[ -f ~/.bashrc ] && source ~/.bashrc ; trap exit SIGTERM ;')" ]
|
|
||||||
507
cli/README.md
507
cli/README.md
@@ -1,507 +0,0 @@
|
|||||||
# Run training from the command line
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
With these scripts and Dockerfile, you can train new wake words from the
|
|
||||||
command line without using a Jupyter notebook.
|
|
||||||
|
|
||||||
Differences between this Docker image and the Jupyter notebook image:
|
|
||||||
|
|
||||||
* The Python training environment isn't included in the image. Instead, a
|
|
||||||
"virtual environment" (venv) is created in the `/data` directory which you
|
|
||||||
will have mounted to a host directory. This cuts about 7gb from the image
|
|
||||||
and allows the virtualenv to persist across container instances.
|
|
||||||
|
|
||||||
* The logic from the Jupyter notebook is contained in individual Python
|
|
||||||
and shell scripts
|
|
||||||
|
|
||||||
* No ports need to be exposed since the Jupyter notebook server isn't being
|
|
||||||
run.
|
|
||||||
|
|
||||||
## TL;DR
|
|
||||||
|
|
||||||
For the impatient among you...
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ mkdir /some/work/directory # On a device with more than 150GB free space
|
|
||||||
$ docker build -t microwakeword-cli:latest .
|
|
||||||
$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
|
|
||||||
root@mww-cli:/# cd /data
|
|
||||||
root@mww-cli:/data# setup_python_venv
|
|
||||||
##### You have about 4 minutes to drink coffee
|
|
||||||
|
|
||||||
root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
|
|
||||||
##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
|
|
||||||
|
|
||||||
root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
|
|
||||||
##### You have about 30-45 minutes for a nap depending on available system resources.
|
|
||||||
##### You'll be informed of where to find your trained model.
|
|
||||||
```
|
|
||||||
|
|
||||||
Load the trained model on your device and give it a try but don't be surprised
|
|
||||||
if you get a lot of missed or false activations. Read on to find out why.
|
|
||||||
|
|
||||||
## Get Started
|
|
||||||
|
|
||||||
Good, you stuck around! Now read the rest of the document before doing
|
|
||||||
anything.
|
|
||||||
|
|
||||||
### Using a GPU
|
|
||||||
|
|
||||||
Having an Nvidia GPU available can cut the training time by up to half. The
|
|
||||||
open-source nouveau driver shipped with Linux kernels doesn't support CUDA
|
|
||||||
however so if you have an Nvidia GPU and want to use it for training, you'll
|
|
||||||
need to install the official Nvidia driver from
|
|
||||||
https://www.nvidia.com/en-in/drivers/unix/
|
|
||||||
|
|
||||||
### Build the image
|
|
||||||
|
|
||||||
You can use either Docker or Podman as your container management tool.
|
|
||||||
`docker` is used in the examples but if you have podman, just substitute
|
|
||||||
the command.
|
|
||||||
|
|
||||||
Start by navigating to the directory that contains this README file and
|
|
||||||
the accompanying Dockerfile. Then...
|
|
||||||
|
|
||||||
|
|
||||||
```shell
|
|
||||||
docker build -t microwakeword-cli:latest .
|
|
||||||
```
|
|
||||||
|
|
||||||
This should be fairly quick and result in an image that's about 320mb in size
|
|
||||||
as it's basically a standard Ubuntu 24.04 image with a few added tools.
|
|
||||||
|
|
||||||
So why isn't a pre-built image available for download? Because it'll probably
|
|
||||||
take longer to download a pre-built image than for you to create it locally.
|
|
||||||
GitHub's container registry is notoriously erratic when it comes to download
|
|
||||||
throughput.
|
|
||||||
|
|
||||||
### Create a host work directory
|
|
||||||
|
|
||||||
This directory will contain the Python virtual environment plus all of the
|
|
||||||
downloaded and generated data needed for training and the final trained
|
|
||||||
models. A full environment will need about 150gb of free space but read
|
|
||||||
further to see how to reduce this.
|
|
||||||
|
|
||||||
Your `<host_data_dir>` will be mounted inside the container as `/data`.
|
|
||||||
|
|
||||||
The training container will start a Bash shell so if you have Bash
|
|
||||||
aliases or Bashy things you like, create a `.bashrc` file in your
|
|
||||||
`<host_data_dir>` and put them in there. It'll automatically be included
|
|
||||||
any time you enter the container.
|
|
||||||
|
|
||||||
### Create and start the container
|
|
||||||
|
|
||||||
There are lots of options that control container creation. The simplest example
|
|
||||||
will create the container and give you an interactive shell. When you exit the
|
|
||||||
shell, the container will be stopped and removed leaving your `<host_data_dir>`
|
|
||||||
intact.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ docker run -it --rm --gpus=all -v <host_data_dir>:/data microwakeword-cli:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
Options:
|
|
||||||
|
|
||||||
* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
|
|
||||||
* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
|
|
||||||
around and give it a name for training more than one wake word. You
|
|
||||||
can stop and remove it when you're ready.
|
|
||||||
* Add a `-d` option to start the container in the background and use `docker
|
|
||||||
attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
|
|
||||||
|
|
||||||
When the container starts, you'll see:
|
|
||||||
|
|
||||||
```text
|
|
||||||
=======================================================
|
|
||||||
WARNING: A python virtual environment wasn't found
|
|
||||||
at /data/.venv. You'll need to run setup_python_venv
|
|
||||||
before you'll be able to use this container for
|
|
||||||
training.
|
|
||||||
=======================================================
|
|
||||||
root@mww-cli:/#
|
|
||||||
```
|
|
||||||
|
|
||||||
Don't worry about the python WARNING right now. You'll be creating the
|
|
||||||
virtualenv in the next step.
|
|
||||||
|
|
||||||
If you've forgotten to create and/or mount your host data directory, you'll
|
|
||||||
see an additional warning:
|
|
||||||
|
|
||||||
```text
|
|
||||||
=======================================================
|
|
||||||
WARNING: The /data directory is NOT mounted.
|
|
||||||
Running the training process without /data mounted
|
|
||||||
could add over 140Gb of python packages and training
|
|
||||||
files to this container's storage which is probably
|
|
||||||
NOT what you want.
|
|
||||||
|
|
||||||
You should remove this container and re-create it with
|
|
||||||
a 'docker run' option like '-v <host_work_dir>:/data'
|
|
||||||
making sure the host directory is on a device that has
|
|
||||||
enough free space.
|
|
||||||
=======================================================
|
|
||||||
```
|
|
||||||
|
|
||||||
You can certainly continue but it's a "really bad idea"™ because your
|
|
||||||
container storage could grow from a few hundred mb to over 140gb.
|
|
||||||
|
|
||||||
At this point, you're in a Bash shell.
|
|
||||||
|
|
||||||
### Create the Python virtual environment
|
|
||||||
|
|
||||||
The Python virtual environment will contain all the software needed to train.
|
|
||||||
It gets created as `/data/.venv` and will take up about 11gb of disk space.
|
|
||||||
|
|
||||||
The scripts that do all the work will be in the container's PATH so to set up
|
|
||||||
the virtual environment and install all of the packages, just run:
|
|
||||||
|
|
||||||
```text
|
|
||||||
setup_python_venv [ --verbose ]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
|
|
||||||
--verbose: Print the detailed "pip install" output.
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
When the installation is finished, a test of the major components will be
|
|
||||||
run.
|
|
||||||
|
|
||||||
Once the process is done, you should change to the `/data` directory and
|
|
||||||
activate the virtual environment with:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
root@mww-cli:/# cd /data
|
|
||||||
root@mww-cli:/data# source .venv/bin/activate
|
|
||||||
(.venv) root@mww-cli:/data#
|
|
||||||
```
|
|
||||||
|
|
||||||
Technically, you don't need to do either of these since the scripts
|
|
||||||
are in the PATH and they know to use the `/data` directory for everything.
|
|
||||||
It's more of an "if you're interested" thing.
|
|
||||||
|
|
||||||
At this point, you have a container with all software installed.
|
|
||||||
|
|
||||||
## Get the reference data
|
|
||||||
|
|
||||||
The training process itself relies on a significant amount of audio reference
|
|
||||||
data that creates a simulated "audio environment" that your wake word will be
|
|
||||||
trained in. These "training datasets" include things like varying amounts of
|
|
||||||
reverberation, background music, background conversations, background noise,
|
|
||||||
etc. All said and done, it amounts to about 30gb of audio but with the
|
|
||||||
downloaded archives and extracted intermediate files, you'll need about 85gb
|
|
||||||
of free space. Thankfully, you only need to download the files once no
|
|
||||||
matter how many wake words you want to train and since it's stored in
|
|
||||||
`/data`, you can even remove the docker container and recreate it without
|
|
||||||
losing any of it. There are 4 datasets that are required.
|
|
||||||
|
|
||||||
This is a three stage process...
|
|
||||||
|
|
||||||
1. Download zipfiles or tarballs. (about 30gb)
|
|
||||||
2. Extract them. (about 50gb)
|
|
||||||
3. Convert them into the final form. (about 31gb)
|
|
||||||
|
|
||||||
NOTE: The sizes add up to more than the 85gb stated earlier because one
|
|
||||||
of the datasets doesn't need to be converted and is counted in both
|
|
||||||
steps 2 and 3. You really do only need 85gb.
|
|
||||||
|
|
||||||
To download the archives, unpack them, and convert the audio to what's needed
|
|
||||||
by the training process, run:
|
|
||||||
|
|
||||||
```text
|
|
||||||
setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
--cleanup-archives: Automatically delete the tarballs or zipfiles after
|
|
||||||
they've been extracted.
|
|
||||||
|
|
||||||
--cleanup-intermediate-files: Automatically delete the intermediate files
|
|
||||||
after they've been converted.
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
On a 1gb/sec Internet connection, this will take about 25 minutes.
|
|
||||||
|
|
||||||
The script detects if the datasets have already been downloaded, extracted
|
|
||||||
and/or converted and skips those steps as appropriate so if you've run the
|
|
||||||
script without the cleanup options, you can just run it again with those
|
|
||||||
options to clean them up.
|
|
||||||
|
|
||||||
Now you're ready to train a wake word. Almost.
|
|
||||||
|
|
||||||
## Train a Wake Word
|
|
||||||
|
|
||||||
Training is done in 3 stages.
|
|
||||||
|
|
||||||
1. Generate thousands of samples of the wake word with various voices,
|
|
||||||
pitches, speeds, inflections, etc.
|
|
||||||
2. Augment the samples with the training datasets to add background noise, etc.
|
|
||||||
3. Run the Tensorflow training.
|
|
||||||
|
|
||||||
### Generate a sample for verification
|
|
||||||
|
|
||||||
Before you start the full process, you're going to want to generate a single
|
|
||||||
wake word sample and play it back to ensure it sounds right. The wake word
|
|
||||||
should be spelled phonetically to give the sample generator the best chance
|
|
||||||
of success.
|
|
||||||
|
|
||||||
```text
|
|
||||||
root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
|
|
||||||
===== Generating 1 sample of 'hey buster' =====
|
|
||||||
Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
|
|
||||||
Successfully loaded the model
|
|
||||||
Batch 1/0 complete
|
|
||||||
Done
|
|
||||||
Sample available at /data/work/test_sample/hey_buster.wav
|
|
||||||
Play it from your host.
|
|
||||||
```
|
|
||||||
|
|
||||||
You should then play that file from your host. The reason I used "hey buster"
|
|
||||||
as the wake word is to demonstrate why it's important to generate and listen
|
|
||||||
to a sample. If you try that exact input and play it back, you'll notice
|
|
||||||
that the generator didn't capture the "er" at the end very well. To get it to
|
|
||||||
do so, I had to add a period on the end as a "spacer".
|
|
||||||
"hey buster." worked much better.
|
|
||||||
|
|
||||||
When you're happy with the sample, you can run the full process.
|
|
||||||
|
|
||||||
### Run the full training process
|
|
||||||
|
|
||||||
```text
|
|
||||||
train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
|
|
||||||
[ --training-steps=<steps> ] [ --cleanup-work-dir ]
|
|
||||||
<wake_word> [ <wake_word_title> ]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
--samples: The number of samples to generate for the wake word.
|
|
||||||
Default: 20000
|
|
||||||
|
|
||||||
--batch-size: How many samples should be generated at a time. The more
|
|
||||||
samples, the more memory is needed.
|
|
||||||
Default: 100
|
|
||||||
|
|
||||||
--training-steps: Number of training steps. More training steps means better
|
|
||||||
detection and false positive rates but also more time to train.
|
|
||||||
Default: 25000
|
|
||||||
|
|
||||||
--cleanup-work-dir: Delete the /data/work directory after successful training.
|
|
||||||
Default: false
|
|
||||||
|
|
||||||
<wake_word> The word to train spelled phonetically.
|
|
||||||
Required.
|
|
||||||
|
|
||||||
<wake_word_title> An optional pretty name to save to the json metadata file.
|
|
||||||
Default: The wake word with individual words capitalized
|
|
||||||
and punctuation removed.
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
By default, the training process creates 20,000 samples of your wake word and
|
|
||||||
runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results)
|
|
||||||
in the [Extra Credit](#extra-credit) section below for
|
|
||||||
why these are the defaults. Depending on resources available, this could take
|
|
||||||
between 30 and 60 minutes.
|
|
||||||
|
|
||||||
The resulting tflite model files and logs will be placed in the
|
|
||||||
`/data/output/<timestamp>-<wake_word>-<samples>-<training-steps>` directory
|
|
||||||
and will therefore be available from your host in the directory you mapped
|
|
||||||
`/data` to. File names will have non-filename-friendly characters in your
|
|
||||||
wake word changed to underscores to make things easier. You'll need both the
|
|
||||||
tflite and json files to load on your device. Exactly how you load them
|
|
||||||
depends on the device and is beyond the scope of this project.
|
|
||||||
|
|
||||||
The only real measure of success is how well the resulting model works
|
|
||||||
on a real device. If you encounter too many missed or false activations,
|
|
||||||
increasing the number of samples would probably improve the results more
|
|
||||||
than increasing the number of training steps. See
|
|
||||||
[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
|
|
||||||
|
|
||||||
The output from the last step is filtered somewhat by the script but is still quite
|
|
||||||
verbose. The full log will be available in the output directory as
|
|
||||||
`training.log` if you're interested. Interpreting the log is beyond the scope
|
|
||||||
of this project however.
|
|
||||||
|
|
||||||
You can train additional wake words or change the number of samples and
|
|
||||||
training steps by simply running `train_wake_word` again. No need to repeat
|
|
||||||
any of the earlier setup steps. If you change the wake word or the number of
|
|
||||||
wake word samples, the work directory will be deleted and all 3 steps re-run.
|
|
||||||
If you only change the number of training steps, the data from the first two
|
|
||||||
steps is still valid and only the 3rd step is run.
|
|
||||||
|
|
||||||
All of the intermediate data is stored in the `/data/work` directory which will
|
|
||||||
grow to about 17gb with 20,000 wake word samples. Once the tflite model is
|
|
||||||
successfully generated and you're happy with the results, you can delete the
|
|
||||||
`/data/work` directory.
|
|
||||||
|
|
||||||
### Training more than one wake word
|
|
||||||
|
|
||||||
Once you have a container running, you
|
|
||||||
can easily train multiple wake words from your host:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
for wp in "hey_alexa" "hey_jenkins" ; do
|
|
||||||
docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
|
|
||||||
done
|
|
||||||
```
|
|
||||||
|
|
||||||
### Training time examples
|
|
||||||
|
|
||||||
Training times depend on lots of things. These are examples only.
|
|
||||||
Your Mileage May Vary!!!
|
|
||||||
|
|
||||||
```text
|
|
||||||
===============================================================================
|
|
||||||
Training Summary
|
|
||||||
|
|
||||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
|
||||||
GPU: N/A
|
|
||||||
|
|
||||||
Generate 10000 samples, 100/batch Elapsed time: 0:06:17
|
|
||||||
Augment 10000 samples Elapsed time: 0:04:05
|
|
||||||
10000 training steps Elapsed time: 0:15:04
|
|
||||||
==================================================
|
|
||||||
Total Elapsed time: 0:25:26
|
|
||||||
================================================================================
|
|
||||||
|
|
||||||
================================================================================
|
|
||||||
Training Summary
|
|
||||||
|
|
||||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
|
||||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
|
||||||
|
|
||||||
Generate 10000 samples, 100/batch Elapsed time: 0:00:29
|
|
||||||
Augment 10000 samples Elapsed time: 0:03:40
|
|
||||||
10000 training steps Elapsed time: 0:08:00
|
|
||||||
======================================================
|
|
||||||
Total Elapsed time: 0:12:09
|
|
||||||
================================================================================
|
|
||||||
|
|
||||||
================================================================================
|
|
||||||
Training Summary
|
|
||||||
|
|
||||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
|
||||||
GPU: N/A
|
|
||||||
|
|
||||||
Generate 20000 samples, 100/batch Elapsed time: 0:10:38
|
|
||||||
Augment 20000 samples Elapsed time: 0:07:04
|
|
||||||
25000 training steps Elapsed time: 0:25:21
|
|
||||||
======================================================
|
|
||||||
Total Elapsed time: 0:43:03
|
|
||||||
================================================================================
|
|
||||||
|
|
||||||
================================================================================
|
|
||||||
Training Summary
|
|
||||||
|
|
||||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
|
||||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
|
||||||
|
|
||||||
Generate 20000 samples, 100/batch Elapsed time: 0:00:53
|
|
||||||
Augment 20000 samples Elapsed time: 0:07:05
|
|
||||||
25000 training steps Elapsed time: 0:19:13
|
|
||||||
======================================================
|
|
||||||
Total Elapsed time: 0:27:11
|
|
||||||
================================================================================
|
|
||||||
|
|
||||||
================================================================================
|
|
||||||
Training Summary
|
|
||||||
|
|
||||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
|
||||||
GPU: N/A
|
|
||||||
|
|
||||||
Generate 50000 samples, 100/batch Elapsed time: 0:30:47
|
|
||||||
Augment 50000 samples Elapsed time: 0:20:22
|
|
||||||
40000 training steps Elapsed time: 1:01:51
|
|
||||||
==================================================
|
|
||||||
Total Elapsed time: 1:53:00
|
|
||||||
================================================================================
|
|
||||||
|
|
||||||
================================================================================
|
|
||||||
Training Summary
|
|
||||||
|
|
||||||
CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
|
|
||||||
GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
|
|
||||||
|
|
||||||
Generate 50000 samples, 100/batch Elapsed time: 0:02:08
|
|
||||||
Augment 50000 samples Elapsed time: 0:19:13
|
|
||||||
40000 training steps Elapsed time: 0:42:23
|
|
||||||
======================================================
|
|
||||||
Total Elapsed time: 1:03:44
|
|
||||||
================================================================================
|
|
||||||
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
The sample generation process is really the only one that uses multiple CPUs so
|
|
||||||
having fewer CPU threads available will probably make little difference.
|
|
||||||
|
|
||||||
## Extra Credit
|
|
||||||
|
|
||||||
### Training defaults
|
|
||||||
|
|
||||||
If you plan on training multiple wake words, you can set your own default
|
|
||||||
training parameters by creating a `/data/.defaults.env` file with the
|
|
||||||
following contents:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
# Variable names follow the command line parameters converted to upper case
|
|
||||||
# and with the dashes ('-') converted to underscores ('_').
|
|
||||||
export SAMPLES=10000
|
|
||||||
export TRAINING_STEPS=10000
|
|
||||||
|
|
||||||
# Don't use the GPU for any operations. Stick with the CPU only.
|
|
||||||
##export CUDA_VISIBLE_DEVICES=-1
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
### Examine your model with Tensorboard
|
|
||||||
|
|
||||||
Tensorboard is a web-based graphical model viewer. You can use it to get an
|
|
||||||
idea of how many training steps are needed before accuracy results stop
|
|
||||||
improving. To use it, you'll have to expose port 6006 by adding `-p
|
|
||||||
6006:6006` to your `docker run` command line. If you didn't, don't worry.
|
|
||||||
Remember, the /data directory is mapped to a directory on your host so you
|
|
||||||
can simply stop and delete the current container and recreate it with the new
|
|
||||||
`docker run` command. No need to re-run any of the setup or training steps.
|
|
||||||
|
|
||||||
To start Tensorboard, run:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
root@mww-cli:/# cd /data
|
|
||||||
root@mww-cli:/data# source .venv/bin/activate
|
|
||||||
(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
|
|
||||||
```
|
|
||||||
|
|
||||||
Now on your host, point your browser at `http://localhost:6006/`,
|
|
||||||
click "SCALARS" at the top and take a look at the various charts. You'll see
|
|
||||||
a "train" and "validation" item for each training run you've performed. It's
|
|
||||||
the "train" items you're interested in.
|
|
||||||
|
|
||||||
<a id="tensorboard-results"></a>
|
|
||||||
|
|
||||||
You have to be a Tensorflow expert to decipher most of the charts but
|
|
||||||
the "Accuracy" chart for this particular wake word and 50,000 samples would
|
|
||||||
seem to indicate that there's very little improvement after about 20,000
|
|
||||||
training steps.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
In contrast, with only 5,000 wake word samples, there's still improvement to be had after
|
|
||||||
20,000 training steps.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
Given that it's faster to generate wake word samples than it is to train,
|
|
||||||
20,000 samples and 25,000 training steps seems like a good compromise. This
|
|
||||||
chart has a bit less smoothing to show a bit more detail and includes the
|
|
||||||
50,000 sample run as well. This run took only 27 minutes as opposed to the
|
|
||||||
63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
|
|
||||||
25,000 are the defaults for these scripts.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
# --- Packages needed by our scripts ---
|
|
||||||
|
|
||||||
numpy==1.26.4
|
|
||||||
scipy==1.12.0
|
|
||||||
librosa==0.10.2.post1
|
|
||||||
soundfile==0.12.1
|
|
||||||
tqdm==4.67.1
|
|
||||||
scikit-learn==1.6.0
|
|
||||||
numba==0.63.1
|
|
||||||
PyYAML==6.0.3
|
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
PROGDIR="$(dirname $(realpath $0))"
|
PROGDIR="$(dirname "$(realpath "$0")")"
|
||||||
|
ROOTDIR="$(dirname "${PROGDIR}")"
|
||||||
|
|
||||||
KNOWN_ARGS=( data-dir python gpu no-gpu )
|
KNOWN_ARGS=( data-dir python gpu no-gpu )
|
||||||
source "${PROGDIR}/shell.functions"
|
source "${PROGDIR}/shell.functions"
|
||||||
@@ -27,7 +28,7 @@ EOF
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath ${DATA_DIR})"
|
[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath "${DATA_DIR}")"
|
||||||
[ -d "${DATA_DIR}" ] || {
|
[ -d "${DATA_DIR}" ] || {
|
||||||
echo "Data directory '${DATA_DIR}' doesn't exist." >&2
|
echo "Data directory '${DATA_DIR}' doesn't exist." >&2
|
||||||
exit 1
|
exit 1
|
||||||
@@ -52,7 +53,8 @@ if [ -n "${PYTHON}" ] ; then
|
|||||||
PYTHONS=( "${PYTHON}" )
|
PYTHONS=( "${PYTHON}" )
|
||||||
unset PYTHON
|
unset PYTHON
|
||||||
else
|
else
|
||||||
PYTHONS=( python3.12 python3.10 )
|
# Add 3.11 as a common middle-ground (especially outside Ubuntu 24.04)
|
||||||
|
PYTHONS=( python3.12 python3.11 python3.10 )
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for p in "${PYTHONS[@]}" ; do
|
for p in "${PYTHONS[@]}" ; do
|
||||||
@@ -60,14 +62,14 @@ for p in "${PYTHONS[@]}" ; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
[ -n "${PYTHON}" ] || {
|
[ -n "${PYTHON}" ] || {
|
||||||
echo "A python 3.12 or 3.10 interpreter wasn't found. You 'll need to install one before proceeding." >&2
|
echo "A python 3.12/3.11/3.10 interpreter wasn't found. You'll need to install one before proceeding." >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
if [ -d "${VENV}" ] ; then
|
if [ -d "${VENV}" ] ; then
|
||||||
if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then
|
if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then
|
||||||
source "${VENV}/bin/activate" || {
|
source "${VENV}/bin/activate" || {
|
||||||
echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
|
echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -82,24 +84,28 @@ if [ -z "$VIRTUAL_ENV" ] ; then
|
|||||||
else
|
else
|
||||||
echo " ===== Updating virtualenv at '${VENV}' ====="
|
echo " ===== Updating virtualenv at '${VENV}' ====="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
${PYTHON} -m venv --upgrade-deps "${VENV}"
|
${PYTHON} -m venv --upgrade-deps "${VENV}"
|
||||||
source "${VENV}/bin/activate"
|
source "${VENV}/bin/activate"
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
declare -a progfiles=( $(find ${PROGDIR} -mindepth 1 -maxdepth 1 -executable -type f) )
|
# Symlink CLI scripts into .venv/bin
|
||||||
|
declare -a progfiles=( $(find "${PROGDIR}" -mindepth 1 -maxdepth 1 -executable -type f) )
|
||||||
progfiles+=( "${PROGDIR}/shell.functions" )
|
progfiles+=( "${PROGDIR}/shell.functions" )
|
||||||
|
|
||||||
|
# Also symlink the top-level entrypoint if present
|
||||||
|
[ -x "${ROOTDIR}/train_wake_word" ] && progfiles+=( "${ROOTDIR}/train_wake_word" )
|
||||||
|
|
||||||
for f in "${progfiles[@]}" ; do
|
for f in "${progfiles[@]}" ; do
|
||||||
ln -sfr "${f}" ".venv/bin/$(basename ${f})"
|
ln -sfr "${f}" ".venv/bin/$(basename "${f}")"
|
||||||
done
|
done
|
||||||
|
|
||||||
#
|
#
|
||||||
# Pip doesn't process packages from requirements.txt in
|
# Pip doesn't process packages from requirements.txt in order but order is
|
||||||
# order but order is important because tensorflow, torch,
|
# important because tensorflow, torch, onnxruntime and micro-wake-word all
|
||||||
# onnxruntime and micro-wake-word all depend on CUDA packages
|
# depend on CUDA packages at various versions. They need to be installed in
|
||||||
# at various versions. They need to be installed in this specific
|
# this specific order or they may not be able to use the GPU.
|
||||||
# order or they may not be able to use the GPU.
|
|
||||||
#
|
#
|
||||||
export PIP_PROGRESS_BAR=off
|
export PIP_PROGRESS_BAR=off
|
||||||
export PIP_NO_COLOR=1
|
export PIP_NO_COLOR=1
|
||||||
@@ -117,7 +123,8 @@ pip_install() {
|
|||||||
START_TS=$EPOCHSECONDS
|
START_TS=$EPOCHSECONDS
|
||||||
|
|
||||||
echo " ===== Installing common requirements ====="
|
echo " ===== Installing common requirements ====="
|
||||||
pip_install -r "${PROGDIR}/requirements.txt"
|
# requirements.txt lives in repo root now
|
||||||
|
pip_install -r "${ROOTDIR}/requirements.txt"
|
||||||
|
|
||||||
${GPU} && tfgpu='[and-cuda]' || tfgpu=""
|
${GPU} && tfgpu='[and-cuda]' || tfgpu=""
|
||||||
echo " ===== Installing Tensorflow${tfgpu} ====="
|
echo " ===== Installing Tensorflow${tfgpu} ====="
|
||||||
@@ -140,7 +147,7 @@ pip_install -e "${MWW}"
|
|||||||
|
|
||||||
echo " ===== Checking piper-sample-generator ====="
|
echo " ===== Checking piper-sample-generator ====="
|
||||||
PSG="${DATA_DIR}/tools/piper-sample-generator"
|
PSG="${DATA_DIR}/tools/piper-sample-generator"
|
||||||
if [ ! -d "${PSG}" ] || [ -n "$(git -C ${PSG} status --porcelain)" ] ; then
|
if [ ! -d "${PSG}" ] || [ -n "$(git -C "${PSG}" status --porcelain)" ] ; then
|
||||||
rm -rf "${PSG}" || :
|
rm -rf "${PSG}" || :
|
||||||
echo " Cloning piper-sample-generator to ${DATA_DIR}/tools"
|
echo " Cloning piper-sample-generator to ${DATA_DIR}/tools"
|
||||||
git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null
|
git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null
|
||||||
@@ -171,7 +178,7 @@ echo " ===== Installing keras ====="
|
|||||||
# keras 3.13 has "issues" so we need to back down to 3.12.
|
# keras 3.13 has "issues" so we need to back down to 3.12.
|
||||||
pip_install "keras==3.12.0"
|
pip_install "keras==3.12.0"
|
||||||
|
|
||||||
${PROGDIR}/test_python --data-dir="${DATA_DIR}"
|
"${PROGDIR}/test_python" --data-dir="${DATA_DIR}"
|
||||||
|
|
||||||
touch .mww-data-dir
|
touch .mww-data-dir
|
||||||
END_TS=$EPOCHSECONDS
|
END_TS=$EPOCHSECONDS
|
||||||
@@ -179,5 +186,3 @@ END_TS=$EPOCHSECONDS
|
|||||||
echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
|
echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
|
||||||
|
|
||||||
print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
|
print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
PROGPATH=$(realpath "$0")
|
PROGPATH="$(realpath "$0")"
|
||||||
PROGDIR=$(dirname "${PROGPATH}")
|
PROGDIR="$(dirname "${PROGPATH}")"
|
||||||
|
ROOTDIR="$(dirname "${PROGDIR}")" # repo root (train_wake_word, requirements.txt, etc.)
|
||||||
|
|
||||||
KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files )
|
KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files )
|
||||||
source "${PROGDIR}/shell.functions"
|
source "${PROGDIR}/shell.functions"
|
||||||
@@ -27,22 +28,38 @@ EOF
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Normalize + validate DATA_DIR (shell.functions typically sets a default,
|
||||||
|
# but this makes the script standalone-safe)
|
||||||
|
[ -n "${DATA_DIR:-}" ] && DATA_DIR="$(realpath "${DATA_DIR}")"
|
||||||
|
[ -d "${DATA_DIR}" ] || {
|
||||||
|
echo "Data directory '${DATA_DIR}' doesn't exist." >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
cd "${DATA_DIR}"
|
cd "${DATA_DIR}"
|
||||||
|
|
||||||
START_TS=$EPOCHSECONDS
|
START_TS=$EPOCHSECONDS
|
||||||
echo -e "\n===== Setting up Training Datasets =====\n"
|
echo -e "\n===== Setting up Training Datasets =====\n"
|
||||||
|
|
||||||
${PROGDIR}/setup_negative_datasets --cleanup-archives=${CLEANUP_ARCHIVES} \
|
"${PROGDIR}/setup_negative_datasets" \
|
||||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
--cleanup-archives="${CLEANUP_ARCHIVES}" \
|
||||||
|
--cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
|
||||||
|
--data-dir="${DATA_DIR}"
|
||||||
|
|
||||||
${PROGDIR}/setup_mit_audio --cleanup-archives=${CLEANUP_ARCHIVES} \
|
"${PROGDIR}/setup_mit_audio" \
|
||||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
--cleanup-archives="${CLEANUP_ARCHIVES}" \
|
||||||
|
--cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
|
||||||
|
--data-dir="${DATA_DIR}"
|
||||||
|
|
||||||
${PROGDIR}/setup_audioset --cleanup-archives=${CLEANUP_ARCHIVES} \
|
"${PROGDIR}/setup_audioset" \
|
||||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
--cleanup-archives="${CLEANUP_ARCHIVES}" \
|
||||||
|
--cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
|
||||||
|
--data-dir="${DATA_DIR}"
|
||||||
|
|
||||||
${PROGDIR}/setup_fma --cleanup-archives=${CLEANUP_ARCHIVES} \
|
"${PROGDIR}/setup_fma" \
|
||||||
--cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
|
--cleanup-archives="${CLEANUP_ARCHIVES}" \
|
||||||
|
--cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
|
||||||
|
--data-dir="${DATA_DIR}"
|
||||||
|
|
||||||
END_TS=$(date +%s.%N)
|
END_TS=$EPOCHSECONDS
|
||||||
print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup"
|
print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup"
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 20 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 32 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 43 KiB |
0
cli/wake_word_sample_augmenter
Executable file → Normal file
0
cli/wake_word_sample_augmenter
Executable file → Normal file
0
cli/wake_word_sample_trainer
Executable file → Normal file
0
cli/wake_word_sample_trainer
Executable file → Normal file
76
dockerfile
76
dockerfile
@@ -1,59 +1,37 @@
|
|||||||
# Standard Ubuntu base image. CUDA base images not needed.
|
# Base
|
||||||
FROM ubuntu:22.04
|
FROM ubuntu:24.04
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive \
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
PYTHONUNBUFFERED=1 \
|
|
||||||
PIP_NO_CACHE_DIR=1 \
|
|
||||||
PIP_ROOT_USER_ACTION=ignore \
|
|
||||||
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
|
|
||||||
XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/local/cuda" \
|
|
||||||
PATH="/usr/local/cuda/bin:${PATH}" \
|
|
||||||
LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
|
|
||||||
|
|
||||||
# System deps (+dev headers for building C/C++ extensions)
|
# System deps
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
python3.10 python3.10-venv python3.10-distutils python3.10-dev python3-pip \
|
python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
|
||||||
git wget curl unzip ca-certificates git-lfs \
|
git wget curl unzip ca-certificates nano less \
|
||||||
build-essential g++ cmake \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
libsndfile1 libsndfile1-dev libffi-dev \
|
&& mkdir -p /data
|
||||||
ffmpeg \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Use python3.10 everywhere
|
# Recorder port
|
||||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \
|
EXPOSE 8789
|
||||||
&& update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
|
|
||||||
|
|
||||||
# ---- No cuDNN repo meddling needed if using TF 2.17.x ----
|
# Script root
|
||||||
|
WORKDIR /root/mww-scripts
|
||||||
|
|
||||||
# Python deps
|
# Bash environment
|
||||||
# Order is important. onnxruntime, tensorflow and torch have
|
COPY --chown=root:root --chmod=0755 .bashrc /root/
|
||||||
# to be installed in the order below or their cuda dependencies
|
|
||||||
# will conflict.
|
|
||||||
COPY requirements.txt /tmp/requirements.txt
|
|
||||||
RUN pip install --upgrade pip \
|
|
||||||
&& pip install "numpy==1.26.4" "cython>=0.29.36" \
|
|
||||||
&& pip install -r /tmp/requirements.txt \
|
|
||||||
&& pip install "onnxruntime-gpu[cuda]>=1.16.0" \
|
|
||||||
&& pip install "tensorflow[and-cuda]==2.18.0" \
|
|
||||||
"tensorboard==2.18.0" \
|
|
||||||
"tensorboard-data-server==0.7.2" \
|
|
||||||
"tensorflow-io-gcs-filesystem==0.37.1" \
|
|
||||||
&& pip install \
|
|
||||||
torch==2.7.1 \
|
|
||||||
torchaudio==2.7.1 \
|
|
||||||
--index-url https://download.pytorch.org/whl/cu128
|
|
||||||
|
|
||||||
# Workspace + notebook fallback
|
# Root-level entrypoints
|
||||||
RUN mkdir -p /data
|
COPY --chown=root:root --chmod=0755 \
|
||||||
WORKDIR /data
|
train_wake_word \
|
||||||
COPY microWakeWord_training_notebook.ipynb /root/
|
run_recorder.sh \
|
||||||
|
recorder_server.py \
|
||||||
|
requirements.txt \
|
||||||
|
/root/mww-scripts/
|
||||||
|
|
||||||
# Startup script (copies default notebook if missing)
|
# CLI folder (THIS IS THE IMPORTANT CHANGE)
|
||||||
COPY startup.sh /usr/local/bin/startup.sh
|
COPY --chown=root:root cli/ /root/mww-scripts/cli/
|
||||||
RUN chmod +x /usr/local/bin/startup.sh
|
|
||||||
|
|
||||||
EXPOSE 8888
|
# Static UI for recorder
|
||||||
|
COPY --chown=root:root --chmod=0644 static/index.html /root/mww-scripts/static/index.html
|
||||||
|
|
||||||
CMD ["/bin/bash", "-lc", "/usr/local/bin/startup.sh && \
|
# recorder server
|
||||||
exec jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root \
|
CMD ["/bin/bash", "-lc", "/root/mww-scripts/run_recorder.sh"]
|
||||||
--ServerApp.token='' --ServerApp.password='' --ServerApp.root_dir=/data"]
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
593
recorder_server.py
Normal file
593
recorder_server.py
Normal file
@@ -0,0 +1,593 @@
|
|||||||
|
# recorder_server.py
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import threading
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, List, Optional, Tuple
|
||||||
|
|
||||||
|
from fastapi import FastAPI, UploadFile, File, Form, Query
|
||||||
|
from fastapi.responses import HTMLResponse, JSONResponse
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
|
||||||
|
ROOT_DIR = Path(__file__).resolve().parent
|
||||||
|
|
||||||
|
# In Docker CLI world, DATA_DIR should be /data
|
||||||
|
DATA_DIR = Path(os.environ.get("DATA_DIR", "/data")).resolve()
|
||||||
|
|
||||||
|
# UI files live next to this script by default
|
||||||
|
STATIC_DIR = Path(os.environ.get("STATIC_DIR", str(ROOT_DIR / "static"))).resolve()
|
||||||
|
|
||||||
|
# Personal samples MUST land in /data/personal_samples for your CLI pipeline
|
||||||
|
PERSONAL_DIR = Path(os.environ.get("PERSONAL_DIR", str(DATA_DIR / "personal_samples"))).resolve()
|
||||||
|
|
||||||
|
# CLI folder inside repo
|
||||||
|
CLI_DIR = Path(os.environ.get("CLI_DIR", str(ROOT_DIR / "cli"))).resolve()
|
||||||
|
|
||||||
|
# If you want cleanup defaults for auto dataset setup, set these env vars:
|
||||||
|
# REC_DATASET_CLEANUP_ARCHIVES=true/false
|
||||||
|
# REC_DATASET_CLEANUP_INTERMEDIATE_FILES=true/false
|
||||||
|
DATASET_CLEANUP_ARCHIVES = os.environ.get("REC_DATASET_CLEANUP_ARCHIVES", "false").lower() in ("1", "true", "yes", "y")
|
||||||
|
DATASET_CLEANUP_INTERMEDIATE = os.environ.get("REC_DATASET_CLEANUP_INTERMEDIATE_FILES", "false").lower() in ("1", "true", "yes", "y")
|
||||||
|
|
||||||
|
# We want "Start training" to trigger your CLI entrypoint, using the existing venv
|
||||||
|
# (train_wake_word should be in /data/.venv/bin via setup_python_venv)
|
||||||
|
TRAIN_CMD = os.environ.get(
|
||||||
|
"TRAIN_CMD",
|
||||||
|
f"source '{DATA_DIR}/.venv/bin/activate' && train_wake_word --data-dir '{DATA_DIR}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
TAKES_PER_SPEAKER_DEFAULT = int(os.environ.get("REC_TAKES_PER_SPEAKER", "10"))
|
||||||
|
SPEAKERS_TOTAL_DEFAULT = int(os.environ.get("REC_SPEAKERS_TOTAL", "1"))
|
||||||
|
|
||||||
|
# How many lines to show in WebUI (tail)
|
||||||
|
TRAIN_LOG_TAIL_LINES = int(os.environ.get("REC_TRAIN_LOG_TAIL_LINES", "400"))
|
||||||
|
# If you prefer bytes-based tailing (fast), keep this non-zero.
|
||||||
|
TRAIN_LOG_MAX_BYTES = int(os.environ.get("REC_TRAIN_LOG_MAX_BYTES", str(512 * 1024))) # 512KB
|
||||||
|
|
||||||
|
app = FastAPI(title="microWakeWord Personal Recorder")
|
||||||
|
|
||||||
|
# Serve static UI
|
||||||
|
STATIC_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
||||||
|
|
||||||
|
|
||||||
|
def safe_name(raw: str) -> str:
    """
    Turn a free-form phrase into an identifier made of ``[a-z0-9_]`` only.

    The phrase is trimmed and lower-cased, whitespace runs become a single
    underscore, every other disallowed character is dropped, and leading or
    trailing underscores are removed. Falls back to ``"wakeword"`` when
    nothing usable is left (including ``None`` / empty input).
    """
    cleaned = (raw or "").strip().lower()
    # Order matters: whitespace must become "_" before the filter removes it.
    for pattern, replacement in (
        (r"\s+", "_"),
        (r"[^a-z0-9_]+", ""),
        (r"^_+|_+$", ""),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned if cleaned else "wakeword"
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------- In-memory session state --------------------
|
||||||
|
STATE: Dict[str, Any] = {
|
||||||
|
"raw_phrase": None,
|
||||||
|
"safe_word": None,
|
||||||
|
|
||||||
|
"speakers_total": SPEAKERS_TOTAL_DEFAULT,
|
||||||
|
"takes_per_speaker": TAKES_PER_SPEAKER_DEFAULT,
|
||||||
|
|
||||||
|
"takes_received": 0,
|
||||||
|
"takes": [],
|
||||||
|
|
||||||
|
"training": {
|
||||||
|
"running": False,
|
||||||
|
"exit_code": None,
|
||||||
|
"log_lines": [], # legacy in-memory tail (still maintained)
|
||||||
|
"log_path": None, # path to recorder_training.log
|
||||||
|
"safe_word": None,
|
||||||
|
|
||||||
|
# NEW: byte offset for efficient log tailing
|
||||||
|
"log_offset": 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
STATE_LOCK = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def _reset_personal_samples_dir():
    """
    Make sure PERSONAL_DIR exists, then delete every ``*.wav`` file directly
    inside it. Deletion is best effort: a file that cannot be removed is
    skipped silently.
    """
    PERSONAL_DIR.mkdir(parents=True, exist_ok=True)
    stale_wavs = PERSONAL_DIR.glob("*.wav")
    for wav_path in stale_wavs:
        try:
            wav_path.unlink()
        except Exception:
            # Best effort only; leave the file in place and move on.
            continue
|
||||||
|
|
||||||
|
|
||||||
|
def _append_train_log(line: str):
    """
    Append one line (trailing newlines stripped) to the in-memory training
    log in STATE["training"]["log_lines"], keeping only the newest 250
    entries. The update happens under STATE_LOCK.
    """
    text = (line or "").rstrip("\n")
    with STATE_LOCK:
        lines: List[str] = STATE["training"]["log_lines"]
        lines.append(text)
        overflow = len(lines) - 250
        if overflow > 0:
            # Trim in place so the same list object stays referenced by STATE.
            del lines[:overflow]
|
||||||
|
|
||||||
|
|
||||||
|
def _title_from_phrase(raw_phrase: str) -> str:
|
||||||
|
# Keep it human-friendly for the optional <wake_word_title> argument
|
||||||
|
s = re.sub(r"[^a-zA-Z0-9 ]+", " ", raw_phrase or "").strip()
|
||||||
|
s = re.sub(r"\s+", " ", s)
|
||||||
|
return s.title() if s else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _run_streamed(
    cmd: List[str],
    cwd: Path,
    log_path: Path,
    header: Optional[str] = None,
    env: Optional[Dict[str, str]] = None,
) -> int:
    """
    Run a command, streaming its combined stdout/stderr to both:
      - the log file at `log_path` (appended, flushed per line)
      - STATE["training"]["log_lines"] via _append_train_log (for the UI)

    Args:
        cmd: argv list passed to subprocess.Popen.
        cwd: working directory for the child process.
        log_path: file that receives a separator, the optional header, the
            command line and every output line.
        header: optional banner line written before the command line.
        env: environment for the child; None inherits the current one.

    Returns:
        The process exit code.
    """
    if header:
        _append_train_log(header)

    cmd_line = "→ " + " ".join(cmd)
    _append_train_log(cmd_line)

    with open(log_path, "a", encoding="utf-8") as lf:
        lf.write("\n" + ("=" * 80) + "\n")
        if header:
            lf.write(header + "\n")
        lf.write(cmd_line + "\n")
        lf.flush()

        # The context manager closes the pipe and reaps the child on every
        # exit path. Decoding is explicit UTF-8 with errors="replace" so one
        # undecodable byte in the child's output cannot abort the whole run.
        with subprocess.Popen(
            cmd,
            cwd=str(cwd),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding="utf-8",
            errors="replace",
            bufsize=1,
            env=env,
        ) as proc:
            assert proc.stdout is not None
            try:
                for line in proc.stdout:
                    lf.write(line)
                    lf.flush()
                    _append_train_log(line)
            except BaseException:
                # Streaming failed (e.g. the log write raised): stop the child
                # so leaving the `with` block does not wait on it indefinitely.
                proc.kill()
                raise

        # Popen.__exit__ has already waited, so returncode is populated.
        return proc.returncode
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_training_venv(log_path: Path) -> None:
    """
    Ensure DATA_DIR/.venv exists by running cli/setup_python_venv if needed.

    The venv is considered present when DATA_DIR/.venv/bin/activate exists.
    Otherwise the setup script is run through `bash -lc` with its output
    streamed to `log_path` and the UI log.

    Raises:
        RuntimeError: if the setup script is missing, exits non-zero, or
            finishes without creating the activate script.
    """
    # Function-scope import: shlex is only needed to build the shell command.
    import shlex

    activate = DATA_DIR / ".venv" / "bin" / "activate"
    if activate.exists():
        _append_train_log("✅ Training venv found (skipping setup_python_venv)")
        return

    setup = CLI_DIR / "setup_python_venv"
    if not setup.exists():
        raise RuntimeError(f"Missing setup_python_venv at: {setup}")

    # DATA_DIR and CLI_DIR come from environment variables, so quote them
    # properly instead of wrapping them in literal single quotes (which
    # breaks as soon as a path itself contains a quote).
    data_dir_q = shlex.quote(str(DATA_DIR))
    setup_q = shlex.quote(str(setup))

    rc = _run_streamed(
        ["bash", "-lc", f"cd {data_dir_q} && {setup_q} --data-dir={data_dir_q}"],
        cwd=DATA_DIR,
        log_path=log_path,
        header="===== Ensuring Python venv (/data/.venv) =====",
    )

    if rc != 0:
        raise RuntimeError(f"setup_python_venv failed (exit_code={rc})")

    if not activate.exists():
        raise RuntimeError(f"setup_python_venv finished, but {activate} is still missing")
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_training_datasets(log_path: Path) -> None:
    """
    Always run cli/setup_training_datasets before training.

    The underlying scripts are expected to skip work that is already done.
    The cleanup flags are taken from DATASET_CLEANUP_ARCHIVES and
    DATASET_CLEANUP_INTERMEDIATE; output is streamed to `log_path` and the
    UI log.

    Raises:
        RuntimeError: if the setup script is missing or exits non-zero.
    """
    # Function-scope import: shlex is only needed to build the shell command.
    import shlex

    setup = CLI_DIR / "setup_training_datasets"
    if not setup.exists():
        raise RuntimeError(f"Missing setup_training_datasets at: {setup}")

    cleanup_arch = "true" if DATASET_CLEANUP_ARCHIVES else "false"
    cleanup_inter = "true" if DATASET_CLEANUP_INTERMEDIATE else "false"

    # Paths originate from environment variables; shlex.quote keeps the
    # command valid even when a path contains quotes or other shell syntax.
    data_dir_q = shlex.quote(str(DATA_DIR))
    script = " ".join(
        [
            f"cd {data_dir_q} &&",
            shlex.quote(str(setup)),
            f"--cleanup-archives={shlex.quote(cleanup_arch)}",
            f"--cleanup-intermediate-files={shlex.quote(cleanup_inter)}",
            f"--data-dir={data_dir_q}",
        ]
    )

    rc = _run_streamed(
        ["bash", "-lc", script],
        cwd=DATA_DIR,
        log_path=log_path,
        header="===== Ensuring training datasets (setup_training_datasets) =====",
    )

    if rc != 0:
        raise RuntimeError(f"setup_training_datasets failed (exit_code={rc})")
|
||||||
|
|
||||||
|
|
||||||
|
def _read_log_tail_by_bytes(log_path: Path, max_bytes: int) -> str:
|
||||||
|
"""
|
||||||
|
Read up to the last max_bytes from a file (UTF-8 best effort).
|
||||||
|
"""
|
||||||
|
if not log_path.exists():
|
||||||
|
return ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
size = log_path.stat().st_size
|
||||||
|
start = max(0, size - max_bytes)
|
||||||
|
with open(log_path, "rb") as f:
|
||||||
|
f.seek(start)
|
||||||
|
data = f.read()
|
||||||
|
# If we started in the middle of a line, it's ok; UI will show partial.
|
||||||
|
return data.decode("utf-8", errors="replace")
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _read_log_tail_by_lines(log_path: Path, max_lines: int) -> str:
    """
    Return the last `max_lines` lines of a file, joined with "\\n".

    The file is first tailed by bytes (bounded by TRAIN_LOG_MAX_BYTES) and
    the line tail is taken from that buffer, so on very large files fewer
    than `max_lines` lines may be returned. A TRAIN_LOG_MAX_BYTES of zero or
    less disables the byte bound and the whole file is considered.

    Returns "" when the file is missing/unreadable or `max_lines` <= 0.
    """
    # Guard explicitly: lines[-0:] would return *every* line, not none.
    if max_lines <= 0 or not log_path.exists():
        return ""

    try:
        byte_cap = TRAIN_LOG_MAX_BYTES
        if byte_cap <= 0:
            # A non-positive cap would make the byte tail empty; treat it as
            # "no cap" and read the whole file instead.
            byte_cap = log_path.stat().st_size
        raw = _read_log_tail_by_bytes(log_path, byte_cap)
    except OSError:
        return ""

    if not raw:
        return ""

    lines = raw.splitlines()
    # A negative slice start larger than the list simply yields all lines.
    return "\n".join(lines[-max_lines:])
|
||||||
|
|
||||||
|
|
||||||
|
def _read_log_since_offset(log_path: Path, offset: int, max_bytes: int = 256 * 1024) -> Tuple[str, int]:
|
||||||
|
"""
|
||||||
|
Read log file incrementally starting from `offset`.
|
||||||
|
Returns (new_text, new_offset). Caps bytes read per call.
|
||||||
|
"""
|
||||||
|
if not log_path.exists():
|
||||||
|
return ("", offset)
|
||||||
|
|
||||||
|
try:
|
||||||
|
size = log_path.stat().st_size
|
||||||
|
# If file rotated/truncated, reset offset
|
||||||
|
if offset > size:
|
||||||
|
offset = 0
|
||||||
|
|
||||||
|
with open(log_path, "rb") as f:
|
||||||
|
f.seek(offset)
|
||||||
|
data = f.read(max_bytes)
|
||||||
|
|
||||||
|
new_offset = offset + len(data)
|
||||||
|
text = data.decode("utf-8", errors="replace")
|
||||||
|
return (text, new_offset)
|
||||||
|
except Exception:
|
||||||
|
return ("", offset)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_training_background(safe_word: str, allow_no_personal: bool):
    """
    Run a full training pass and record its progress in STATE["training"].

    Steps: mark the run as started, write a run banner to the in-memory log
    and to DATA_DIR/recorder_training.log, ensure the venv, ensure the
    datasets, then run TRAIN_CMD through `bash -lc` with the output streamed
    to both logs.

    Args:
        safe_word: sanitized wake word passed as first argument to TRAIN_CMD.
        allow_no_personal: exported to the child as MWW_ALLOW_NO_PERSONAL
            ("true"/"false").

    The exit code is stored in STATE["training"]["exit_code"]; any exception
    is logged and reported as exit code 999. "running" is always cleared at
    the end.
    """
    with STATE_LOCK:
        raw_phrase = STATE.get("raw_phrase") or ""

    wake_word_title = _title_from_phrase(raw_phrase)
    log_path = DATA_DIR / "recorder_training.log"

    with STATE_LOCK:
        STATE["training"]["running"] = True
        STATE["training"]["exit_code"] = None
        STATE["training"]["log_lines"] = []
        STATE["training"]["safe_word"] = safe_word
        STATE["training"]["log_path"] = str(log_path)
        # NOTE(review): the log file is opened in append mode, so offset 0
        # points at the start of any earlier run's output - confirm this is
        # what the log-reading endpoint expects.
        STATE["training"]["log_offset"] = 0

    # fresh header at the start of a run
    _append_train_log("================================================================================")
    _append_train_log("===== Recorder Training Run =====")
    _append_train_log("================================================================================")

    # Ensure the log exists and carries a separator for this run (best effort)
    try:
        with open(log_path, "a", encoding="utf-8") as lf:
            lf.write("\n" + ("=" * 80) + "\n")
            lf.write("===== Recorder Training Run =====\n")
            lf.write(("=" * 80) + "\n")
            lf.flush()
    except Exception:
        pass

    try:
        # 1) Ensure venv (auto-installs)
        _ensure_training_venv(log_path)

        # 2) Ensure datasets (auto-installs / skips if already present)
        _ensure_training_datasets(log_path)

        # 3) Run training
        if wake_word_title:
            cmd_str = f"{TRAIN_CMD} '{safe_word}' '{wake_word_title}'"
        else:
            cmd_str = f"{TRAIN_CMD} '{safe_word}'"

        env = os.environ.copy()
        env["MWW_ALLOW_NO_PERSONAL"] = "true" if allow_no_personal else "false"

        _append_train_log("===== Training (train_wake_word) =====")
        _append_train_log(f"→ Running: {cmd_str}")

        with open(log_path, "a", encoding="utf-8") as lf:
            # The context manager closes the pipe and reaps the child on every
            # exit path. errors="replace" keeps one undecodable byte in the
            # trainer's output from aborting the run.
            with subprocess.Popen(
                ["bash", "-lc", cmd_str],
                cwd=str(DATA_DIR),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding="utf-8",
                errors="replace",
                bufsize=1,
                env=env,
            ) as proc:
                assert proc.stdout is not None
                try:
                    for line in proc.stdout:
                        lf.write(line)
                        lf.flush()
                        _append_train_log(line)
                except BaseException:
                    # Do not leave a trainer running that nobody is reading.
                    proc.kill()
                    raise

            # Popen.__exit__ has already waited, so returncode is populated.
            rc = proc.returncode

        _append_train_log(f"✓ Training finished (exit_code={rc})")
        with STATE_LOCK:
            STATE["training"]["exit_code"] = rc

    except Exception as e:
        _append_train_log(f"✗ Training crashed: {e!r}")
        with STATE_LOCK:
            STATE["training"]["exit_code"] = 999

    finally:
        with STATE_LOCK:
            STATE["training"]["running"] = False
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------- Routes --------------------
|
||||||
|
@app.get("/", response_class=HTMLResponse)
def index():
    """
    Serve the recorder UI from STATIC_DIR/index.html, or a short HTTP 500
    page explaining that the file is missing.
    """
    html_path = STATIC_DIR / "index.html"
    if html_path.exists():
        page = html_path.read_text(encoding="utf-8")
        return HTMLResponse(page)
    return HTMLResponse(
        "<h3>Missing UI</h3><p>Create <code>static/index.html</code>.</p>",
        status_code=500,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/start_session")
def start_session(payload: Dict[str, Any]):
    """
    Start a new recording session.

    Payload keys:
        phrase: the wake phrase (required, non-empty string).
        speakers_total: optional integer, clamped to 1..10.
        takes_per_speaker: optional integer, clamped to 1..50.

    Resets the session part of STATE (the "training" entry is left alone),
    clears the *.wav files in PERSONAL_DIR and returns the session settings.
    Responds with HTTP 400 when the phrase is missing or a count is not an
    integer.
    """
    phrase = payload.get("phrase")
    # A non-string phrase (number, list, ...) is rejected like a missing one
    # instead of failing on .strip().
    raw = phrase.strip() if isinstance(phrase, str) else ""
    if not raw:
        return JSONResponse({"ok": False, "error": "phrase is required"}, status_code=400)

    safe = safe_name(raw)

    try:
        speakers_total = int(payload.get("speakers_total") or SPEAKERS_TOTAL_DEFAULT)
        takes_per_speaker = int(payload.get("takes_per_speaker") or TAKES_PER_SPEAKER_DEFAULT)
    except (TypeError, ValueError, OverflowError):
        # Malformed client input is a 400, not an unhandled server error.
        return JSONResponse(
            {"ok": False, "error": "speakers_total and takes_per_speaker must be integers"},
            status_code=400,
        )

    speakers_total = max(1, min(10, speakers_total))
    takes_per_speaker = max(1, min(50, takes_per_speaker))

    with STATE_LOCK:
        STATE["raw_phrase"] = raw
        STATE["safe_word"] = safe
        STATE["speakers_total"] = speakers_total
        STATE["takes_per_speaker"] = takes_per_speaker
        STATE["takes_received"] = 0
        STATE["takes"] = []
        # do not interrupt training if running

    # NOTE(review): this deletes the recorded samples even while a training
    # run may be in progress - confirm that cannot disturb the trainer.
    _reset_personal_samples_dir()

    return {
        "ok": True,
        "raw_phrase": raw,
        "safe_word": safe,
        "speakers_total": speakers_total,
        "takes_per_speaker": takes_per_speaker,
        "takes_total": speakers_total * takes_per_speaker,
        "personal_dir": str(PERSONAL_DIR),
        "data_dir": str(DATA_DIR),
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/session")
def get_session():
    """Return a snapshot of the current session and training state.

    The ``takes`` list and the ``training`` mapping are copied while the lock
    is held, so the response does not alias the live containers.
    """
    scalar_keys = (
        "raw_phrase",
        "safe_word",
        "speakers_total",
        "takes_per_speaker",
        "takes_received",
    )
    with STATE_LOCK:
        snapshot = {"ok": True}
        for key in scalar_keys:
            snapshot[key] = STATE[key]
        snapshot["takes"] = list(STATE["takes"])
        snapshot["training"] = dict(STATE["training"])
        return snapshot
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/upload_take")
async def upload_take(
    speaker_index: int = Form(...),
    take_index: int = Form(...),
    file: UploadFile = File(...),
):
    """Store one uploaded take as ``speakerNN_takeNN.wav`` in ``PERSONAL_DIR``.

    Form fields:
        speaker_index: 1-based speaker number, at most the session's speakers_total.
        take_index: 1-based take number, at most the session's takes_per_speaker.
        file: the recorded audio; payloads shorter than 44 bytes are rejected.

    Returns the stored file name and the number of distinct takes received,
    or a 400 JSON error when there is no session or an argument is out of range.
    """
    with STATE_LOCK:
        safe_word = STATE["safe_word"]
        speakers_total = int(STATE["speakers_total"])
        takes_per_speaker = int(STATE["takes_per_speaker"])

    if not safe_word:
        return JSONResponse({"ok": False, "error": "No active session. Call /api/start_session first."}, status_code=400)

    if speaker_index < 1 or speaker_index > speakers_total:
        return JSONResponse({"ok": False, "error": f"speaker_index must be 1..{speakers_total}"}, status_code=400)

    if take_index < 1 or take_index > takes_per_speaker:
        return JSONResponse({"ok": False, "error": f"take_index must be 1..{takes_per_speaker}"}, status_code=400)

    PERSONAL_DIR.mkdir(parents=True, exist_ok=True)

    out_name = f"speaker{speaker_index:02d}_take{take_index:02d}.wav"
    out_path = PERSONAL_DIR / out_name

    data = await file.read()
    # Only the length is checked here; the audio content itself is not validated.
    if not data or len(data) < 44:
        return JSONResponse({"ok": False, "error": "Empty/invalid file"}, status_code=400)

    out_path.write_bytes(data)

    with STATE_LOCK:
        # Re-uploading the same speaker/take overwrites the file without
        # counting it twice.
        if out_name not in STATE["takes"]:
            STATE["takes"].append(out_name)
        STATE["takes_received"] = len(STATE["takes"])
        # Capture the count while the lock is held so the response cannot
        # observe a concurrent upload or reset.
        takes_received = STATE["takes_received"]

    return {"ok": True, "saved_as": out_name, "takes_received": takes_received}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/train")
def train_now(payload: Dict[str, Any] = None):
    """Check the session and start training on a background daemon thread.

    The optional body may carry ``allow_no_personal``; when truthy, training
    may start although no takes were uploaded. Requests are rejected with a
    400 JSON response when training is already running, when there is no
    session, when no takes exist and ``allow_no_personal`` is falsy
    (code ``NO_PERSONAL_SAMPLES``), or when some takes exist but fewer than
    the minimum (code ``NOT_ENOUGH_TAKES``).
    """
    options = payload if payload else {}
    allow_no_personal = bool(options.get("allow_no_personal", False))

    # Read everything needed in a single locked snapshot.
    with STATE_LOCK:
        safe_word = STATE["safe_word"]
        takes_received = int(STATE["takes_received"])
        speakers_total = int(STATE["speakers_total"])
        takes_per_speaker = int(STATE["takes_per_speaker"])
        training_running = bool(STATE["training"]["running"])

    takes_total = speakers_total * takes_per_speaker

    def _reject(body):
        return JSONResponse(body, status_code=400)

    # NOTE(review): the running flag is checked here but not set before the
    # thread starts; confirm _run_training_background guards against two
    # near-simultaneous requests.
    if training_running:
        return _reject({"ok": False, "error": "Training already running"})

    if not safe_word:
        return _reject({"ok": False, "error": "No active session"})

    # Three takes are required, or fewer when the whole session is smaller.
    min_required = max(1, min(3, takes_total))

    if takes_received == 0 and not allow_no_personal:
        return _reject(
            {
                "ok": False,
                "error": f"No personal voice samples recorded (0/{takes_total}).",
                "code": "NO_PERSONAL_SAMPLES",
                "message": "You can train without personal voices, or record samples first.",
                "takes_total": takes_total,
            }
        )

    if 0 < takes_received < min_required:
        return _reject(
            {
                "ok": False,
                "error": f"Not enough takes yet ({takes_received}/{takes_total}).",
                "code": "NOT_ENOUGH_TAKES",
                "min_required": min_required,
                "takes_total": takes_total,
            }
        )

    worker = threading.Thread(
        target=_run_training_background,
        args=(safe_word, allow_no_personal),
        daemon=True,
    )
    worker.start()

    return {
        "ok": True,
        "started": True,
        "safe_word": safe_word,
        "personal_samples_used": takes_received >= min_required,
        "allow_no_personal": allow_no_personal,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _trim_partial_utf8_tail(chunk: bytes) -> bytes:
    """Return ``chunk`` without a trailing, still-incomplete UTF-8 sequence."""
    # A UTF-8 character is at most 4 bytes long, so only the last 3 bytes can
    # belong to a character whose remaining bytes have not been read yet.
    for back in range(1, min(3, len(chunk)) + 1):
        byte = chunk[-back]
        if byte & 0xC0 == 0x80:
            continue  # continuation byte: keep looking for its lead byte
        if byte >= 0xF0:
            needed = 4
        elif byte >= 0xE0:
            needed = 3
        elif byte >= 0xC0:
            needed = 2
        else:
            needed = 1  # single-byte (ASCII) character
        return chunk[:-back] if needed > back else chunk
    return chunk


@app.get("/api/train_status")
def train_status(
    offset: int = Query(0, ge=0),
    max_bytes: int = Query(65536, ge=1024, le=262144),
    last_size: int = Query(0, ge=0),
    last_mtime: float = Query(0.0, ge=0.0),
):
    """
    Stream training output from the log file on disk.

    Robust to log overwrite/truncation:
    - UI passes offset + last_size + last_mtime
    - If the file shrank or its mtime moved backwards, reading restarts at 0

    The response carries a copy of the training state extended with:
    - log_text: decoded text read from ``offset`` (at most ``max_bytes`` bytes)
    - next_offset: byte offset to pass on the next poll
    - log_size / log_mtime: current file size and mtime (0 when unavailable)

    A multi-byte UTF-8 character cut off at the end of the chunk is not
    decoded; ``next_offset`` stops before it so the next poll returns it whole.
    """
    with STATE_LOCK:
        tr = dict(STATE["training"])
        log_path_str = tr.get("log_path")

    log_text = ""
    next_offset = offset
    log_size = 0
    log_mtime = 0.0

    if log_path_str:
        p = Path(log_path_str)
        if p.exists():
            try:
                st = p.stat()
                log_size = int(st.st_size)
                log_mtime = float(st.st_mtime)

                # Detect overwrite/truncate/reset:
                # - file shrank
                # - file mtime moved backwards
                # If anything indicates a reset, restart from the beginning.
                if (log_size < last_size) or (last_mtime and log_mtime < last_mtime):
                    offset = 0

                # Clamp offset to current file size
                if offset > log_size:
                    offset = log_size

                # Read incrementally from the file
                with p.open("rb") as f:
                    f.seek(offset)
                    chunk = f.read(max_bytes)

                # Hold back a character that is only partially in this chunk;
                # decoding it now would emit U+FFFD here and again next poll.
                chunk = _trim_partial_utf8_tail(chunk)

                log_text = chunk.decode("utf-8", errors="replace")
                next_offset = offset + len(chunk)

            except Exception as e:
                # Report the problem inside the log stream; keep the offset
                # so the next poll retries from the same position.
                log_text = f"\n[log read error: {e!r}]\n"
                next_offset = offset

    tr["log_text"] = log_text
    tr["next_offset"] = next_offset
    tr["log_size"] = log_size
    tr["log_mtime"] = log_mtime

    return {"ok": True, "training": tr}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/reset_recordings")
def reset_recordings():
    """Discard all uploaded takes: reset the samples directory and the counters."""
    _reset_personal_samples_dir()
    with STATE_LOCK:
        STATE["takes_received"], STATE["takes"] = 0, []
    return {"ok": True}
|
||||||
@@ -1,28 +1,10 @@
|
|||||||
# --- Core training (Microwakeword) ---
|
# --- Packages needed by our scripts ---
|
||||||
|
|
||||||
numpy==1.26.4
|
numpy==1.26.4
|
||||||
scipy==1.12.0
|
scipy==1.12.0
|
||||||
librosa==0.10.2.post1
|
librosa==0.10.2.post1
|
||||||
soundfile==0.12.1
|
soundfile==0.12.1
|
||||||
soxr==0.5.0.post1
|
|
||||||
audiomentations==0.38.0
|
|
||||||
webrtcvad==2.0.10
|
|
||||||
tqdm==4.67.1
|
tqdm==4.67.1
|
||||||
scikit-learn==1.6.0
|
scikit-learn==1.6.0
|
||||||
numba==0.60.0
|
numba==0.63.1
|
||||||
joblib==1.4.2
|
PyYAML==6.0.3
|
||||||
pandas==2.2.3
|
|
||||||
pymicro_features @ git+https://github.com/puddly/pymicro-features@e1d3f88183e12bb8af2df9e399ea157af7393762
|
|
||||||
audio-metadata @ git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f
|
|
||||||
bitstruct==8.19.0
|
|
||||||
|
|
||||||
# --- Piper sample generation ---
|
|
||||||
piper-tts>=1.2.0
|
|
||||||
piper-phonemize-cross==1.2.1
|
|
||||||
|
|
||||||
# --- Notebook / tooling ---
|
|
||||||
ipykernel==6.29.5
|
|
||||||
jupyterlab==4.3.4
|
|
||||||
ipywidgets==8.1.5
|
|
||||||
matplotlib-inline==0.1.7
|
|
||||||
rich==13.9.4
|
|
||||||
|
|||||||
64
run_recorder.sh
Normal file
64
run_recorder.sh
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Launch the microWakeWord recorder web UI with uvicorn.
#
# Environment (all optional):
#   DATA_DIR                   data root (default: /data)
#   REC_HOST, REC_PORT         bind address and port (default: 0.0.0.0:8888)
#   REC_FASTAPI_VERSION, REC_UVICORN_VERSION, REC_PY_MULTIPART_VERSION
#                              versions installed when the recorder venv is
#                              provisioned for the first time
set -euo pipefail

ROOTDIR="$(dirname "$(realpath "$0")")"

# Training convention
DATA_DIR="${DATA_DIR:-/data}"
HOST="${REC_HOST:-0.0.0.0}"
PORT="${REC_PORT:-8888}"

# Keep recorder deps separate from training venv
VENV_DIR="${DATA_DIR}/.recorder-venv"
PY="${VENV_DIR}/bin/python"
# An array keeps the interpreter path a single word even when DATA_DIR
# contains whitespace (a plain string would be word-split on expansion).
PIP=("${PY}" -m pip)
# Marker file: its presence means the pinned deps were already installed.
PIN_FILE="${VENV_DIR}/.pinned_installed"

FASTAPI_VERSION="${REC_FASTAPI_VERSION:-0.115.6}"
UVICORN_VERSION="${REC_UVICORN_VERSION:-0.30.6}"
PY_MULTIPART_VERSION="${REC_PY_MULTIPART_VERSION:-0.0.9}"

echo "microWakeWord Recorder (Docker)"
echo "-> ROOTDIR: ${ROOTDIR}"
echo "-> DATA_DIR: ${DATA_DIR}"
echo "-> URL: http://localhost:${PORT}/"

mkdir -p "${DATA_DIR}"

# -----------------------------
# Recorder venv (separate)
# -----------------------------
if [[ ! -x "${PY}" ]]; then
  echo "Creating recorder venv: ${VENV_DIR}"
  python3 -m venv "${VENV_DIR}"
fi

# shellcheck disable=SC1091
source "${VENV_DIR}/bin/activate"

if [[ ! -f "${PIN_FILE}" ]]; then
  echo "Installing pinned recorder deps"
  "${PIP[@]}" install -U pip setuptools wheel
  "${PIP[@]}" install \
    "fastapi==${FASTAPI_VERSION}" \
    "uvicorn[standard]==${UVICORN_VERSION}" \
    "python-multipart==${PY_MULTIPART_VERSION}"
  # Written only after both installs succeeded (set -e aborts earlier).
  touch "${PIN_FILE}"
else
  echo "Reusing existing recorder venv (no upgrades)"
fi

# -----------------------------
# Recorder server env
# -----------------------------
export DATA_DIR="${DATA_DIR}"
export STATIC_DIR="${ROOTDIR}/static"
export PERSONAL_DIR="${DATA_DIR}/personal_samples"

# IMPORTANT: leave training venv creation to /api/train inside recorder_server.py
# but still set TRAIN_CMD so the server knows how to invoke training once ready
export TRAIN_CMD="source '${DATA_DIR}/.venv/bin/activate' && train_wake_word --data-dir='${DATA_DIR}'"

echo "Launching uvicorn on ${HOST}:${PORT}"
cd "${ROOTDIR}"
exec "${VENV_DIR}/bin/uvicorn" recorder_server:app --host "${HOST}" --port "${PORT}"
|
||||||
23
startup.sh
23
startup.sh
@@ -1,23 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Container entrypoint: prepare /data, seed the default training notebook,
# optionally align ownership, then hand over to the given command.
set -euo pipefail

# Default both ids to 0 (root) when the caller did not provide them.
NB_UID="${NB_UID:-0}"
NB_GID="${NB_GID:-0}"
umask 002

NOTEBOOK_SRC="/root/microWakeWord_training_notebook.ipynb"
NOTEBOOK_DST="/data/microWakeWord_training_notebook.ipynb"

mkdir -p /data /data/generated_samples /data/personal_samples

# Seed the notebook only when the data volume does not have one yet.
if ! [[ -f "$NOTEBOOK_DST" ]]; then
  echo "No training notebook found in /data; copying default…"
  cp -n "$NOTEBOOK_SRC" "$NOTEBOOK_DST"
fi

# Try to align ownership for convenience (ignore errors if not permitted)
if ! [[ "$NB_UID" == "0" && "$NB_GID" == "0" ]]; then
  chown -R "$NB_UID:$NB_GID" /data || true
fi

exec "$@"
|
|
||||||
782
static/index.html
Normal file
782
static/index.html
Normal file
@@ -0,0 +1,782 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>microWakeWord Recorder</title>
|
||||||
|
<style>
|
||||||
|
:root{
|
||||||
|
--bg: #070709;
|
||||||
|
--panel: rgba(18, 18, 22, 0.78);
|
||||||
|
--panel2: rgba(24, 24, 30, 0.86);
|
||||||
|
--text: #e9e9ee;
|
||||||
|
--muted: #a2a2ad;
|
||||||
|
--line: rgba(255,255,255,0.10);
|
||||||
|
--orange: #ff8a2a;
|
||||||
|
--orange2:#ffb066;
|
||||||
|
--ok:#38d39f;
|
||||||
|
--warn:#ffb020;
|
||||||
|
--err:#ff4a4a;
|
||||||
|
--shadow: 0 18px 50px rgba(0,0,0,0.45);
|
||||||
|
--radius: 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
html, body { height: 100%; }
|
||||||
|
body {
|
||||||
|
margin: 0;
|
||||||
|
color: var(--text);
|
||||||
|
font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;
|
||||||
|
background:
|
||||||
|
radial-gradient(900px 500px at 12% 6%, rgba(255, 138, 42, 0.12), transparent 55%),
|
||||||
|
radial-gradient(700px 420px at 80% 14%, rgba(255, 176, 102, 0.09), transparent 60%),
|
||||||
|
radial-gradient(800px 600px at 50% 100%, rgba(255, 138, 42, 0.06), transparent 55%),
|
||||||
|
linear-gradient(180deg, #050506 0%, #09090d 100%);
|
||||||
|
}
|
||||||
|
|
||||||
|
.wrap { max-width: 940px; margin: 0 auto; padding: 26px 18px 42px; }
|
||||||
|
|
||||||
|
h2 { margin: 0 0 8px; font-size: 22px; letter-spacing: 0.2px; }
|
||||||
|
p { margin: 0 0 14px; color: var(--muted); line-height: 1.45; }
|
||||||
|
|
||||||
|
.topbar {
|
||||||
|
display:flex; align-items:center; justify-content:space-between;
|
||||||
|
gap: 12px; margin-bottom: 14px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.brand { display:flex; align-items:center; gap:10px; }
|
||||||
|
.logo {
|
||||||
|
width: 38px; height: 38px; border-radius: 12px;
|
||||||
|
background:
|
||||||
|
radial-gradient(circle at 30% 30%, rgba(255,176,102,0.55), rgba(255,138,42,0.25) 45%, rgba(0,0,0,0) 72%),
|
||||||
|
linear-gradient(180deg, rgba(255,138,42,0.22), rgba(255,138,42,0.06));
|
||||||
|
border: 1px solid rgba(255,138,42,0.30);
|
||||||
|
box-shadow: 0 10px 28px rgba(255,138,42,0.08);
|
||||||
|
}
|
||||||
|
|
||||||
|
.row { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; }
|
||||||
|
|
||||||
|
.card {
|
||||||
|
border: 1px solid var(--line);
|
||||||
|
background: linear-gradient(180deg, var(--panel), var(--panel2));
|
||||||
|
border-radius: var(--radius);
|
||||||
|
padding: 16px;
|
||||||
|
margin-top: 14px;
|
||||||
|
box-shadow: var(--shadow);
|
||||||
|
backdrop-filter: blur(8px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.muted { color: var(--muted); }
|
||||||
|
|
||||||
|
input[type="text"], input[type="number"]{
|
||||||
|
padding: 11px 12px;
|
||||||
|
font-size: 15px;
|
||||||
|
border-radius: 12px;
|
||||||
|
border: 1px solid rgba(255,255,255,0.12);
|
||||||
|
background: rgba(0,0,0,0.35);
|
||||||
|
color: var(--text);
|
||||||
|
outline: none;
|
||||||
|
}
|
||||||
|
input[type="text"] { width: 420px; max-width: 100%; }
|
||||||
|
input[type="number"] { width: 120px; }
|
||||||
|
input::placeholder { color: rgba(233,233,238,0.35); }
|
||||||
|
|
||||||
|
button {
|
||||||
|
padding: 10px 14px;
|
||||||
|
font-size: 13px;
|
||||||
|
cursor: pointer;
|
||||||
|
border-radius: 12px;
|
||||||
|
border: 1px solid rgba(255,255,255,0.14);
|
||||||
|
background: rgba(255,255,255,0.06);
|
||||||
|
color: var(--text);
|
||||||
|
transition: transform 0.04s ease, border-color .15s ease, background .15s ease;
|
||||||
|
}
|
||||||
|
button:hover { border-color: rgba(255,138,42,0.35); background: rgba(255,255,255,0.08); }
|
||||||
|
button:active { transform: translateY(1px); }
|
||||||
|
button:disabled { opacity: 0.45; cursor: not-allowed; }
|
||||||
|
|
||||||
|
.primary {
|
||||||
|
border-color: rgba(255,138,42,0.40);
|
||||||
|
background: linear-gradient(180deg, rgba(255,138,42,0.24), rgba(255,138,42,0.12));
|
||||||
|
}
|
||||||
|
.primary:hover { border-color: rgba(255,138,42,0.65); }
|
||||||
|
|
||||||
|
.pill {
|
||||||
|
display:inline-block;
|
||||||
|
padding: 4px 10px;
|
||||||
|
border-radius: 999px;
|
||||||
|
background: rgba(255,255,255,0.07);
|
||||||
|
border: 1px solid rgba(255,255,255,0.10);
|
||||||
|
color: var(--muted);
|
||||||
|
font-size: 12px;
|
||||||
|
}
|
||||||
|
.pill.ok { color: var(--ok); border-color: rgba(56,211,159,0.25); background: rgba(56,211,159,0.08); }
|
||||||
|
.pill.warn { color: var(--warn); border-color: rgba(255,176,32,0.25); background: rgba(255,176,32,0.08); }
|
||||||
|
.pill.err { color: var(--err); border-color: rgba(255,74,74,0.25); background: rgba(255,74,74,0.08); }
|
||||||
|
|
||||||
|
details { margin-top: 10px; }
|
||||||
|
summary { cursor: pointer; color: var(--orange2); }
|
||||||
|
summary:hover { color: var(--orange); }
|
||||||
|
|
||||||
|
label { display:flex; gap:10px; align-items:center; }
|
||||||
|
input[type="range"] { width: 240px; }
|
||||||
|
|
||||||
|
.meter {
|
||||||
|
height: 10px;
|
||||||
|
background: rgba(255,255,255,0.08);
|
||||||
|
border-radius: 999px;
|
||||||
|
overflow: hidden;
|
||||||
|
width: 280px;
|
||||||
|
border: 1px solid rgba(255,255,255,0.10);
|
||||||
|
}
|
||||||
|
.meter > div {
|
||||||
|
height: 10px;
|
||||||
|
width: 0%;
|
||||||
|
background: linear-gradient(90deg, rgba(255,138,42,0.55), rgba(255,176,102,0.85));
|
||||||
|
}
|
||||||
|
|
||||||
|
pre {
|
||||||
|
background: rgba(0,0,0,0.55);
|
||||||
|
color: #e6e6ea;
|
||||||
|
padding: 12px;
|
||||||
|
border-radius: 14px;
|
||||||
|
overflow: auto;
|
||||||
|
max-height: 300px;
|
||||||
|
border: 1px solid rgba(255,255,255,0.10);
|
||||||
|
white-space: pre-wrap;
|
||||||
|
word-break: break-word;
|
||||||
|
}
|
||||||
|
|
||||||
|
.big { font-size: 16px; }
|
||||||
|
|
||||||
|
.divider {
|
||||||
|
height: 1px;
|
||||||
|
width: 100%;
|
||||||
|
background: rgba(255,255,255,0.10);
|
||||||
|
margin: 12px 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<div class="wrap">
|
||||||
|
<div class="topbar">
|
||||||
|
<div class="brand">
|
||||||
|
<div class="logo"></div>
|
||||||
|
<div>
|
||||||
|
<h2>🎙️ microWakeWord Personal Recorder</h2>
|
||||||
|
<p class="muted">Enter a wake word, test TTS pronunciation, then record takes. Recording starts when you speak and stops after silence.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="row">
|
||||||
|
<input id="phrase" type="text" placeholder='e.g. "tater totterson"' />
|
||||||
|
<button id="startSessionBtn" class="primary">Start session</button>
|
||||||
|
<button id="ttsBtn" disabled>🔊 Test TTS</button>
|
||||||
|
<span id="sessionPill" class="pill">No session</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row" style="margin-top:10px;">
|
||||||
|
<label class="muted">Speakers
|
||||||
|
<input id="speakersTotal" type="number" min="1" max="10" value="1" />
|
||||||
|
</label>
|
||||||
|
<label class="muted">Takes / speaker
|
||||||
|
<input id="takesPerSpeaker" type="number" min="1" max="50" value="10" />
|
||||||
|
</label>
|
||||||
|
<span id="speakerPill" class="pill">Speaker: -</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Advanced (if it’s too sensitive / not sensitive enough)</summary>
|
||||||
|
<div style="margin-top:10px;">
|
||||||
|
<label>
|
||||||
|
Start sensitivity
|
||||||
|
<input id="startThresh" type="range" min="0.005" max="0.08" step="0.001" value="0.02" />
|
||||||
|
<span id="startThreshVal" class="muted"></span>
|
||||||
|
</label>
|
||||||
|
<label>
|
||||||
|
Silence stop (ms)
|
||||||
|
<input id="silenceMs" type="range" min="300" max="2000" step="50" value="900" />
|
||||||
|
<span id="silenceMsVal" class="muted"></span>
|
||||||
|
</label>
|
||||||
|
<label>
|
||||||
|
Min take length (ms)
|
||||||
|
<input id="minTakeMs" type="range" min="300" max="2000" step="50" value="650" />
|
||||||
|
<span id="minTakeMsVal" class="muted"></span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="row">
|
||||||
|
<button id="beginBtn" disabled class="primary">🎬 Begin recording</button>
|
||||||
|
<button id="resetBtn" disabled>🧹 Reset recordings</button>
|
||||||
|
<button id="trainBtn" disabled>🧠 Start training</button>
|
||||||
|
<span id="status" class="pill">Idle</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style="margin-top:12px;" class="row">
|
||||||
|
<div class="meter"><div id="meterFill"></div></div>
|
||||||
|
<span class="muted" id="meterText">Mic level</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="divider"></div>
|
||||||
|
|
||||||
|
<p class="big">
|
||||||
|
Speaker: <b id="speakerNum">-</b> / <b id="speakerTotal">-</b>
|
||||||
|
<span id="speakerState" class="pill">Waiting</span>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p class="big">
|
||||||
|
Take: <b id="takeNum">0</b> / <b id="takeTotal">10</b>
|
||||||
|
<span id="takeState" class="pill">Not recording</span>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<div id="takesList" class="muted"></div>
|
||||||
|
|
||||||
|
<h4 style="margin-top: 18px; margin-bottom: 10px;">Training log</h4>
|
||||||
|
<pre id="trainLog">(no training started)</pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Shorthand for document.getElementById; yields null when no element has that id.
const $ = (id) => document.getElementById(id);
|
||||||
|
|
||||||
|
// Show `text` in a status pill and replace its classes with "pill" plus the
// optional state modifier `cls` (the stylesheet defines "ok", "warn", "err").
function setPill(el, text, cls) {
  const modifier = cls ? cls : "";
  el.className = `pill ${modifier}`;
  el.textContent = text;
}
|
||||||
|
|
||||||
|
// fetch() wrapper: resolves with the parsed JSON body (or the raw text for
// non-JSON responses). On a non-2xx status it throws an Error whose message
// comes from the body's `error`/`message` field and whose `details` property
// holds the whole error payload.
async function api(path, opts) {
  const res = await fetch(path, opts);
  const contentType = res.headers.get("content-type") || "";

  let data;
  if (contentType.includes("application/json")) {
    data = await res.json();
  } else {
    data = await res.text();
  }

  if (res.ok) return data;

  // Normalise the failure payload to an object.
  const err = (typeof data === "string") ? { error: data } : (data || {});
  const failure = new Error(err.error || err.message || JSON.stringify(err));
  failure.details = err;
  throw failure;
}
|
||||||
|
|
||||||
|
// -------------------- log auto-scroll (sticky to bottom) --------------------
|
||||||
|
// True when the scroll position of `el` is within `px` pixels of the bottom.
function isNearBottom(el, px = 40) {
  const distanceFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight;
  return distanceFromBottom <= px;
}
|
||||||
|
|
||||||
|
// Append `chunk` to the log element. The view follows the new output only if
// it was already near the bottom before the append.
function appendLogChunkAutoScroll(el, chunk) {
  if (!chunk) return;

  // Must be measured before appending: the append changes scrollHeight.
  const wasAtBottom = isNearBottom(el);
  el.textContent += chunk;

  if (wasAtBottom) {
    el.scrollTop = el.scrollHeight;
  }
}
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Session handle; assigned outside this part of the script.
let session = null;
// While true, meterLoop forwards each measured level to recorderTick.
let isRunning = false;

// Microphone / Web Audio objects: created in ensureMic, released in stopMicNow.
let stream = null;
let audioCtx = null;
let analyser = null;
let source = null;

// Per-take capture state.
let capturing = false;      // true between startCapture and stopCaptureAndUpload
let startedAt = 0;          // performance.now() at capture start
let silenceStart = null;    // performance.now() when the current quiet stretch began, or null
let floatChunks = [];       // Float32Array frames collected for the current take
let frameSize = 2048;       // buffer size passed to createScriptProcessor

// 1-based index of the speaker being recorded, and the session's speaker count.
let currentSpeaker = 1;
let speakersTotal = 1;

// Takes completed for the current speaker, and takes expected per speaker.
let currentTake = 0;
let takesPerSpeaker = 10;

// --- incremental log streaming state ---
// Polls /api/train_status?offset=<N> and appends training.log_text (reads /data/recorder_training.log)
let trainOffset = 0;
let trainingPollRunning = false;
let trainingPollAbort = false;
|
||||||
|
|
||||||
|
// Current value of the "Start sensitivity" slider as a float.
function startThreshold() {
  const slider = $("startThresh");
  return Number.parseFloat(slider.value);
}
|
||||||
|
// Current value of the "Silence stop (ms)" slider as an integer.
function silenceStopMs() {
  const slider = $("silenceMs");
  return Number.parseInt(slider.value, 10);
}
|
||||||
|
// Current value of the "Min take length (ms)" slider as an integer.
function minTakeMs() {
  const slider = $("minTakeMs");
  return Number.parseInt(slider.value, 10);
}
|
||||||
|
|
||||||
|
// Mirror the three advanced sliders into the read-outs next to them.
function updateAdvancedLabels() {
  const thresholdLabel = $("startThreshVal");
  thresholdLabel.textContent = startThreshold().toFixed(3);

  const silenceLabel = $("silenceMsVal");
  silenceLabel.textContent = `${silenceStopMs()}ms`;

  const minTakeLabel = $("minTakeMsVal");
  minTakeLabel.textContent = `${minTakeMs()}ms`;
}
|
||||||
|
// Refresh the read-outs whenever one of the advanced sliders moves...
["startThresh","silenceMs","minTakeMs"].forEach(id => $(id).addEventListener("input", updateAdvancedLabels));
// ...and once now, so they show the sliders' initial values.
updateAdvancedLabels();
|
||||||
|
|
||||||
|
// Push the speaker/take counters into the page.
function refreshUI() {
  const counters = [
    ["speakerNum", currentSpeaker],
    ["speakerTotal", speakersTotal],
    ["takeNum", currentTake],
    ["takeTotal", takesPerSpeaker],
  ];
  for (const [id, value] of counters) {
    $(id).textContent = String(value);
  }
  setPill($("speakerPill"), `Speaker ${currentSpeaker}/${speakersTotal}`);
}
|
||||||
|
|
||||||
|
// -------------------- mic lifecycle --------------------
|
||||||
|
// Acquire the microphone and build the analyser graph, unless already active.
// Throws when the browser exposes no getUserMedia, when permission is denied,
// or when the audio graph cannot be built.
// The module-level stream/audioCtx/analyser/source are assigned only after
// every step succeeded; on failure the stream is released, so a later call
// starts from scratch instead of returning early with no analyser.
async function ensureMic() {
  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
    throw new Error("Microphone not available here. Use https:// (or http://localhost) to record.");
  }
  if (stream) return;

  const micStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });

  let ctx = null;
  try {
    ctx = new (window.AudioContext || window.webkitAudioContext)();
    const levelNode = ctx.createAnalyser();
    levelNode.fftSize = 2048;
    const micSource = ctx.createMediaStreamSource(micStream);
    micSource.connect(levelNode);

    // Publish the fully built graph in one go.
    stream = micStream;
    audioCtx = ctx;
    analyser = levelNode;
    source = micSource;
  } catch (err) {
    // Do not leave the microphone open behind a half-initialised state.
    try { micStream.getTracks().forEach(t => t.stop()); } catch {}
    if (ctx) {
      try { await ctx.close(); } catch {}
    }
    throw err;
  }

  requestAnimationFrame(meterLoop);
}
|
||||||
|
|
||||||
|
// Stop recording and release every audio resource: the capture node, the
// microphone tracks and the AudioContext. Teardown errors are ignored.
async function stopMicNow() {
  // Halt the state machine before tearing anything down.
  isRunning = false;
  capturing = false;

  const activeProc = window.__mw_proc;
  if (activeProc) {
    try { activeProc.disconnect(); } catch {}
    try { source && source.disconnect(activeProc); } catch {}
    window.__mw_proc = null;
  }

  if (stream) {
    try {
      for (const track of stream.getTracks()) track.stop();
    } catch {}
    stream = null;
  }

  if (audioCtx) {
    try { await audioCtx.close(); } catch {}
    audioCtx = null;
  }

  analyser = null;
  source = null;

  const fill = $("meterFill");
  fill.style.width = "0%";
  const caption = $("meterText");
  caption.textContent = "Mic stopped";
}
|
||||||
|
|
||||||
|
// Per-frame level meter: measures the RMS of the analyser's time-domain
// signal, updates the meter, and drives the recording state machine while
// isRunning is set.
//
// The loop ends once the analyser is gone; ensureMic() schedules a fresh loop
// for the next mic session. `frameTime` is the timestamp requestAnimationFrame
// passes to its callback and is used to drop a duplicate loop chain that
// fires in the same frame.
function meterLoop(frameTime) {
  if (!analyser) {
    // Mic released: stop instead of idling forever, so loops do not pile up
    // across mic sessions.
    return;
  }

  // Callbacks fired in the same frame receive the same timestamp; if this
  // frame was already handled, let the redundant chain die here.
  if (frameTime !== undefined && frameTime === meterLoop.lastFrameTime) return;
  meterLoop.lastFrameTime = frameTime;

  const data = new Uint8Array(analyser.fftSize);
  analyser.getByteTimeDomainData(data);

  // Samples are unsigned bytes centred on 128; rescale to about -1..1.
  let sumSq = 0;
  for (let i = 0; i < data.length; i++) {
    const v = (data[i] - 128) / 128;
    sumSq += v * v;
  }
  const rms = Math.sqrt(sumSq / data.length);
  const pct = Math.min(100, Math.max(0, rms * 600));
  $("meterFill").style.width = pct + "%";
  $("meterText").textContent = `Mic level (rms=${rms.toFixed(3)})`;

  if (isRunning) recorderTick(rms);

  requestAnimationFrame(meterLoop);
}
|
||||||
|
|
||||||
|
// -------------------- recording state machine --------------------
|
||||||
|
// One step of the voice-activated recorder, called with the current level.
//  - idle: start a capture once the level reaches the start threshold
//  - capturing: finish the take after the level has stayed below 65% of the
//    threshold for the silence timeout, unless the take is still shorter
//    than the minimum length, in which case the silence timer restarts
function recorderTick(rms) {
  const now = performance.now();
  const threshold = startThreshold();

  if (!capturing) {
    if (rms >= threshold) startCapture();
    return;
  }

  const isQuiet = rms < threshold * 0.65;
  if (!isQuiet) {
    // Sound resumed: forget the running silence measurement.
    silenceStart = null;
    return;
  }

  if (silenceStart === null) silenceStart = now;
  if (now - silenceStart < silenceStopMs()) return;

  if (now - startedAt >= minTakeMs()) {
    stopCaptureAndUpload();
  } else {
    silenceStart = now;
  }
}
|
||||||
|
|
||||||
|
// Begin collecting audio for a new take: reset the per-take state and attach
// a ScriptProcessor node that copies every input frame into floatChunks.
async function startCapture() {
  capturing = true;
  startedAt = performance.now();
  silenceStart = null;
  floatChunks = [];

  setPill($("takeState"), "Recording…", "warn");

  const recorderNode = audioCtx.createScriptProcessor(frameSize, 1, 1);
  recorderNode.onaudioprocess = (ev) => {
    if (!capturing) return;
    // Copy the frame: a new Float32Array is stored rather than the
    // channel-data array itself.
    const frame = ev.inputBuffer.getChannelData(0);
    floatChunks.push(new Float32Array(frame));
  };

  source.connect(recorderNode);
  recorderNode.connect(audioCtx.destination);

  // Kept on window so the stop/teardown code can disconnect the node.
  window.__mw_proc = recorderNode;
}
|
||||||
|
|
||||||
|
// Finish the current take: detach the capture node, encode the collected
// audio as a 16 kHz mono WAV, upload it, then advance the take/speaker
// counters (pausing between speakers, auto-starting training after the
// last one).
//
// On failure the take number is given back unless the upload had already
// succeeded, so a failed take does not leave a gap in the numbering.
// Encoding runs inside the try block so an encoding error is reported like
// an upload error instead of leaving the UI on "Processing…".
async function stopCaptureAndUpload() {
  capturing = false;
  setPill($("takeState"), "Processing…");

  const proc = window.__mw_proc;
  if (proc) {
    try { proc.disconnect(); } catch {}
    try { source.disconnect(proc); } catch {}
    window.__mw_proc = null;
  }

  currentTake += 1;
  refreshUI();

  // Set once the server accepted the take; decides whether to roll back.
  let saved = false;

  try {
    // Concatenate the captured frames into one contiguous buffer.
    let totalLen = 0;
    for (const c of floatChunks) totalLen += c.length;
    const merged = new Float32Array(totalLen);
    let off = 0;
    for (const c of floatChunks) { merged.set(c, off); off += c.length; }

    const wavBlob = await floatToWav16kMono(merged, audioCtx.sampleRate);

    setPill($("status"), `Uploading speaker ${currentSpeaker} take ${currentTake}…`, "warn");

    const fd = new FormData();
    fd.append("speaker_index", String(currentSpeaker));
    fd.append("take_index", String(currentTake));
    fd.append("file", wavBlob, `take_${String(currentTake).padStart(2,"0")}.wav`);

    await api("/api/upload_take", { method:"POST", body: fd });
    saved = true;

    $("takesList").textContent = `Saved ${currentTake}/${takesPerSpeaker} takes for speaker ${currentSpeaker}/${speakersTotal}`;
    setPill($("status"), `Saved speaker ${currentSpeaker} take ${currentTake}/${takesPerSpeaker}`, "ok");

    if (currentTake >= takesPerSpeaker) {
      if (currentSpeaker >= speakersTotal) {
        setPill($("takeState"), "Done", "ok");
        setPill($("speakerState"), "All speakers done ✅", "ok");
        setPill($("status"), "All takes recorded ✅", "ok");

        await stopMicNow();
        await autoStartTraining();
        return;
      }

      // Hand over to the next speaker and wait for an explicit restart.
      currentSpeaker += 1;
      currentTake = 0;
      refreshUI();

      setPill($("speakerState"), `Speaker ${currentSpeaker - 1} complete ✅`, "ok");
      setPill($("takeState"), "Paused", "warn");
      setPill($("status"), `Ready for speaker ${currentSpeaker}. Click Begin recording.`, "warn");

      isRunning = false;
      $("beginBtn").disabled = false;

      await stopMicNow();
      return;
    }

    setPill($("speakerState"), `Speaker ${currentSpeaker}/${speakersTotal}`);
    setPill($("takeState"), "Listening…", "ok");

  } catch (e) {
    console.error(e);
    if (!saved) {
      // The take never reached the server: give its number back.
      currentTake = Math.max(0, currentTake - 1);
      refreshUI();
    }
    setPill($("status"), "Upload failed", "err");
    setPill($("takeState"), "Error", "err");
    isRunning = false;
    $("beginBtn").disabled = false;
    alert("Upload failed: " + e.message);
  }
}
|
||||||
|
|
||||||
|
// -------------------- WAV encoding helpers --------------------
|
||||||
|
// Convert mono float samples recorded at `srcRate` into a 16 kHz, 16-bit PCM
// WAV Blob. The samples are rendered through an OfflineAudioContext created
// at the target rate.
async function floatToWav16kMono(float32, srcRate) {
  const targetRate = 16000;

  const inputBuffer = audioCtx.createBuffer(1, float32.length, srcRate);
  inputBuffer.copyToChannel(float32, 0);

  // Length of the output at the target rate (never zero).
  const frames = Math.max(1, Math.round(float32.length * targetRate / srcRate));
  const offline = new OfflineAudioContext(1, frames, targetRate);

  const player = offline.createBufferSource();
  player.buffer = inputBuffer;
  player.connect(offline.destination);
  player.start(0);

  const rendered = await offline.startRendering();
  const wavBytes = encodeWavPCM16(rendered.getChannelData(0), targetRate);
  return new Blob([wavBytes], { type: "audio/wav" });
}
|
||||||
|
|
||||||
|
/**
 * Serialise mono float samples as a 16-bit PCM WAV file image.
 *
 * Layout: a 44-byte RIFF/WAVE header followed by little-endian int16
 * samples. Samples are clamped to [-1, 1] before scaling; negative values
 * scale by 0x8000 and non-negative values by 0x7fff.
 *
 * @param {ArrayLike<number>} float32 - mono samples.
 * @param {number} sampleRate - sample rate written into the header, in Hz.
 * @returns {ArrayBuffer} the complete WAV file contents.
 */
function encodeWavPCM16(float32, sampleRate) {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;

  const sampleCount = float32.length;
  const dataBytes = sampleCount * BYTES_PER_SAMPLE;
  const buffer = new ArrayBuffer(HEADER_BYTES + dataBytes);
  const view = new DataView(buffer);

  // Write an ASCII tag one byte per character starting at `at`.
  const putAscii = (at, text) => {
    for (let k = 0; k < text.length; k += 1) {
      view.setUint8(at + k, text.charCodeAt(k));
    }
  };

  // RIFF container header.
  putAscii(0, "RIFF");
  view.setUint32(4, 36 + dataBytes, true);         // remaining file size
  putAscii(8, "WAVE");

  // "fmt " chunk.
  putAscii(12, "fmt ");
  view.setUint32(16, 16, true);                    // fmt chunk size
  view.setUint16(20, 1, true);                     // audio format 1 (PCM)
  view.setUint16(22, 1, true);                     // channel count
  view.setUint32(24, sampleRate, true);            // sample rate
  view.setUint32(28, sampleRate * 2, true);        // bytes per second
  view.setUint16(32, 2, true);                     // block align
  view.setUint16(34, 16, true);                    // bits per sample

  // "data" chunk.
  putAscii(36, "data");
  view.setUint32(40, dataBytes, true);

  for (let idx = 0; idx < sampleCount; idx += 1) {
    const clamped = Math.min(1, Math.max(-1, float32[idx]));
    const scale = clamped < 0 ? 0x8000 : 0x7fff;
    view.setInt16(HEADER_BYTES + idx * BYTES_PER_SAMPLE, clamped * scale, true);
  }

  return buffer;
}
|
||||||
|
|
||||||
|
// -------------------- training (manual + auto) --------------------
|
||||||
|
/**
 * Start a training run, asking for confirmation first when the session
 * reports zero recorded takes.
 *
 * Reads /api/session, optionally prompts, disables the train/begin/reset
 * buttons, resets the log-streaming state, POSTs /api/train and then starts
 * the status poll loop unless one is already flagged as running.
 *
 * @param {boolean} [auto=false] - selects the "auto-starting" status text.
 * @returns {Promise<void>} resolves without starting anything if the user
 *   declines the confirmation.
 * @throws re-throws any error from the train request (or the steps that
 *   follow it) after re-enabling the buttons and clearing the poll flags.
 */
async function startTrainingWithPrompt(auto=false) {
  const sessionInfo = await api("/api/session", { method: "GET" });
  const recorded = sessionInfo.takes_received || 0;
  const expected = (sessionInfo.speakers_total || 1) * (sessionInfo.takes_per_speaker || 10);

  // Training without any personal takes needs explicit consent.
  const allowNoPersonal = recorded === 0;
  if (allowNoPersonal) {
    const confirmed = confirm(
      `No personal voice samples recorded (0/${expected}).\n\nTrain anyway WITHOUT personal voices?`
    );
    if (!confirmed) return;
  }

  const setDisabled = (ids, flag) => {
    ids.forEach((id) => { $(id).disabled = flag; });
  };

  // Lock the UI right away.
  setDisabled(["trainBtn", "beginBtn", "resetBtn"], true);

  const statusText = auto ? "Auto-starting training…" : "Preparing training environment…";
  setPill($("status"), statusText, "warn");

  // Stream the training log from the beginning of this run.
  trainOffset = 0;
  trainingPollAbort = false;
  $("trainLog").textContent = "(preparing…)\n";

  try {
    await api("/api/train", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ allow_no_personal: allowNoPersonal })
    });

    // Polling begins only once the train request has succeeded.
    if (!trainingPollRunning) {
      trainingPollRunning = true;
      pollTrainingIncremental();
    }

    setPill($("status"), "Training running…", "warn");
  } catch (err) {
    // Undo the lock and stop any polling before reporting to the caller.
    setDisabled(["trainBtn", "resetBtn", "beginBtn"], false);
    trainingPollAbort = true;
    trainingPollRunning = false;
    throw err;
  }
}
|
||||||
|
|
||||||
|
/**
 * Start training automatically and report any failure to the user.
 *
 * Errors from startTrainingWithPrompt(true) are logged to the console,
 * shown in the status pill and surfaced through alert(); they are not
 * re-thrown.
 *
 * @returns {Promise<void>}
 */
async function autoStartTraining() {
  await startTrainingWithPrompt(true).catch((err) => {
    console.error(err);
    setPill($("status"), "Auto-train failed", "err");
    alert("Auto-start training failed: " + err.message);
  });
}
|
||||||
|
|
||||||
|
// Manual "Train" button: start training and report a failure via alert
// followed by the status pill.
$("trainBtn").addEventListener("click", async () => {
  await startTrainingWithPrompt(false).catch((err) => {
    alert("Train failed: " + err.message);
    setPill($("status"), "Train failed", "err");
  });
});
|
||||||
|
|
||||||
|
/**
 * Poll /api/train_status?offset=<trainOffset> until training ends or the
 * loop is asked to abort.
 *
 * Expected response JSON:
 *   { ok: true, training: { running, exit_code, log_text, next_offset } }
 *
 * Each non-empty log_text chunk is appended to the #trainLog element and
 * trainOffset advances to next_offset when that is a number. The loop ends
 * when trainingPollAbort is set, or when the server reports running as
 * falsy with exit_code set; either way trainingPollRunning is cleared.
 * Errors inside one iteration are ignored and the poll is retried.
 *
 * @returns {Promise<void>}
 */
async function pollTrainingIncremental() {
  const POLL_INTERVAL_MS = 1500;
  const PLACEHOLDER = "(preparing…)";
  const logEl = $("trainLog");
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  let finished = false;

  while (!trainingPollAbort) {
    try {
      const reply = await api(`/api/train_status?offset=${trainOffset}`, { method:"GET" });
      const info = reply.training || {};

      const text = info.log_text || "";
      const nextOffset = (typeof info.next_offset === "number") ? info.next_offset : trainOffset;

      if (text) {
        // First real output replaces the "(preparing…)" placeholder.
        if (logEl.textContent.startsWith(PLACEHOLDER)) {
          logEl.textContent = "";
        }
        appendLogChunkAutoScroll(logEl, text);
      }

      trainOffset = nextOffset;

      // Done only when the run has stopped AND an exit code is available.
      const hasExitCode = info.exit_code != null;
      if (!info.running && hasExitCode) {
        ["trainBtn", "resetBtn", "beginBtn"].forEach((id) => {
          $(id).disabled = false;
        });

        if (info.exit_code === 0) {
          setPill($("status"), "Training finished ✅", "ok");
        } else {
          setPill($("status"), `Training ended (exit=${info.exit_code})`, "err");
        }

        finished = true;
      }
    } catch (err) {
      // Transient polling errors are ignored; the next iteration retries.
    }

    if (finished) break;

    await pause(POLL_INTERVAL_MS);
  }

  trainingPollRunning = false;
}
|
||||||
|
|
||||||
|
// -------------------- session + UI wiring --------------------
|
||||||
|
// "Speak" button: read the trimmed phrase aloud with the browser's speech
// synthesis, cancelling any utterance already queued. Does nothing when the
// phrase is empty.
$("ttsBtn").addEventListener("click", () => {
  const text = ($("phrase").value || "").trim();
  if (text) {
    const utterance = new SpeechSynthesisUtterance(text);
    speechSynthesis.cancel();
    speechSynthesis.speak(utterance);
  }
});
|
||||||
|
|
||||||
|
// "Start session" button: validate the phrase, read the speaker/take counts,
// POST /api/start_session and reset all recording and training UI state.
$("startSessionBtn").addEventListener("click", async () => {
  const phrase = ($("phrase").value || "").trim();
  if (!phrase) {
    alert("Enter a wake word phrase first.");
    return;
  }

  // Empty inputs fall back to 1 speaker and 10 takes per speaker.
  const readCount = (id, fallback) => parseInt($(id).value || fallback, 10);
  speakersTotal = readCount("speakersTotal", "1");
  takesPerSpeaker = readCount("takesPerSpeaker", "10");

  try {
    setPill($("sessionPill"), "Starting…", "warn");

    const payload = {
      phrase,
      speakers_total: speakersTotal,
      takes_per_speaker: takesPerSpeaker
    };
    const data = await api("/api/start_session", {
      method: "POST",
      headers: {"Content-Type":"application/json"},
      body: JSON.stringify(payload)
    });

    session = data;

    // Recording progress starts over at speaker 1 with no takes.
    currentSpeaker = 1;
    currentTake = 0;

    $("takesList").textContent = "";
    $("trainLog").textContent = "(no training started)";

    trainOffset = 0;

    // Ask any poll loop left over from a previous run to stop.
    // NOTE(review): the abort flag is cleared again in `finally` below, so
    // whether a sleeping poll loop ever observes it depends on how long
    // stopMicNow() takes — confirm this is the intended behaviour.
    trainingPollAbort = true;
    trainingPollRunning = false;

    refreshUI();

    await stopMicNow();

    setPill($("sessionPill"), `Session: ${data.safe_word}`, "ok");
    ["beginBtn", "resetBtn", "trainBtn", "ttsBtn"].forEach((id) => {
      $(id).disabled = false;
    });

    setPill($("status"), "Ready", "ok");
    setPill($("speakerState"), "Waiting");
    setPill($("takeState"), "Not recording");
  } catch (err) {
    console.error(err);
    setPill($("sessionPill"), "Session failed", "err");
    alert("Start session failed: " + err.message);
  } finally {
    trainingPollAbort = false;
  }
});
|
||||||
|
|
||||||
|
// "Reset" button: POST /api/reset_recordings, then rewind the local
// progress counters and clear the takes list. A failure is reported via
// alert() and leaves local state untouched.
$("resetBtn").addEventListener("click", async () => {
  try {
    await api("/api/reset_recordings", {method:"POST"});
  } catch (err) {
    alert("Reset failed: " + err.message);
    return;
  }

  try {
    currentSpeaker = 1;
    currentTake = 0;
    $("takesList").textContent = "";
    refreshUI();
    setPill($("status"), "Recordings reset", "ok");
  } catch (err) {
    alert("Reset failed: " + err.message);
  }
});
|
||||||
|
|
||||||
|
// "Begin recording" button: requires an active session and a working
// microphone, then switches the UI into the listening state.
$("beginBtn").addEventListener("click", async () => {
  if (!session) {
    alert("Start a session first.");
    return;
  }

  try {
    await ensureMic();
  } catch (err) {
    alert("Mic permission failed: " + err.message);
    return;
  }

  // Clear the takes list and redraw before listening starts.
  $("takesList").textContent = "";
  refreshUI();

  isRunning = true;
  $("beginBtn").disabled = true;

  const speakerLabel = `Speaker ${currentSpeaker}/${speakersTotal}`;
  setPill($("speakerState"), speakerLabel);
  setPill($("status"), "Listening… say the wake word now", "ok");
  setPill($("takeState"), "Listening…", "ok");
});
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
31
cli/train_wake_word → train_wake_word
Executable file → Normal file
31
cli/train_wake_word → train_wake_word
Executable file → Normal file
@@ -3,9 +3,10 @@ set -e
|
|||||||
|
|
||||||
PROGPATH=$(realpath "$0")
|
PROGPATH=$(realpath "$0")
|
||||||
PROGDIR=$(dirname "${PROGPATH}")
|
PROGDIR=$(dirname "${PROGPATH}")
|
||||||
|
CLIDIR="${PROGDIR}/cli"
|
||||||
|
|
||||||
KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir )
|
KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir )
|
||||||
source "${PROGDIR}/shell.functions"
|
source "${CLIDIR}/shell.functions"
|
||||||
WAKE_WORD=${POSITIONAL_ARGS[0]}
|
WAKE_WORD=${POSITIONAL_ARGS[0]}
|
||||||
|
|
||||||
if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
|
if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then
|
||||||
@@ -62,7 +63,7 @@ fi
|
|||||||
|
|
||||||
printf "%-80s\n" "=" | tr ' ' "="
|
printf "%-80s\n" "=" | tr ' ' "="
|
||||||
echo "===== Running '${WAKE_WORD}(${WAKE_WORD_TITLE})' generation, augmentation and training ====="
|
echo "===== Running '${WAKE_WORD}(${WAKE_WORD_TITLE})' generation, augmentation and training ====="
|
||||||
"${PROGDIR}/cudainfo"
|
"${CLIDIR}/cudainfo"
|
||||||
echo
|
echo
|
||||||
START_TS=$EPOCHSECONDS
|
START_TS=$EPOCHSECONDS
|
||||||
|
|
||||||
@@ -75,17 +76,13 @@ export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512
|
|||||||
export GLOG_minloglevel=2
|
export GLOG_minloglevel=2
|
||||||
export GRPC_VERBOSITY=ERROR
|
export GRPC_VERBOSITY=ERROR
|
||||||
|
|
||||||
|
"${CLIDIR}/wake_word_sample_generator" \
|
||||||
"${PROGDIR}/wake_word_sample_generator" \
|
|
||||||
--samples=${SAMPLES} \
|
--samples=${SAMPLES} \
|
||||||
--batch-size=${BATCH_SIZE} \
|
--batch-size=${BATCH_SIZE} \
|
||||||
--data-dir="${DATA_DIR}" "${WAKE_WORD}"
|
--data-dir="${DATA_DIR}" "${WAKE_WORD}"
|
||||||
|
|
||||||
POST_GEN_TS=$EPOCHSECONDS
|
POST_GEN_TS=$EPOCHSECONDS
|
||||||
|
|
||||||
ww="${WAKE_WORD// /_}"
|
|
||||||
ww="${ww//./}"
|
|
||||||
|
|
||||||
AUGMENT=false
|
AUGMENT=false
|
||||||
GENERATED_DIR="${DATA_DIR}/work/wake_word_samples"
|
GENERATED_DIR="${DATA_DIR}/work/wake_word_samples"
|
||||||
AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented"
|
AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented"
|
||||||
@@ -96,7 +93,7 @@ AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented"
|
|||||||
if ${AUGMENT} ; then
|
if ${AUGMENT} ; then
|
||||||
rm -rf "${AUGMENTED_DIR}" || :
|
rm -rf "${AUGMENTED_DIR}" || :
|
||||||
mkdir -p "${AUGMENTED_DIR}" || :
|
mkdir -p "${AUGMENTED_DIR}" || :
|
||||||
"${PROGDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" || { rm -rf "${AUGMENTED_DIR}" ; exit 1 ; }
|
"${CLIDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" || { rm -rf "${AUGMENTED_DIR}" ; exit 1 ; }
|
||||||
else
|
else
|
||||||
echo "Augmentation not required"
|
echo "Augmentation not required"
|
||||||
echo
|
echo
|
||||||
@@ -104,18 +101,26 @@ fi
|
|||||||
|
|
||||||
POST_AUGMENT_TS=$EPOCHSECONDS
|
POST_AUGMENT_TS=$EPOCHSECONDS
|
||||||
|
|
||||||
"${PROGDIR}/wake_word_sample_trainer" --samples=${SAMPLES} --training-steps=${TRAINING_STEPS} --data-dir="${DATA_DIR}" \
|
"${CLIDIR}/wake_word_sample_trainer" \
|
||||||
"${WAKE_WORD}" "${WAKE_WORD_TITLE}"
|
--samples=${SAMPLES} \
|
||||||
|
--training-steps=${TRAINING_STEPS} \
|
||||||
|
--data-dir="${DATA_DIR}" \
|
||||||
|
"${WAKE_WORD}" "${WAKE_WORD_TITLE}"
|
||||||
|
|
||||||
if ${CLEANUP_WORK_DIR} ; then
|
if ${CLEANUP_WORK_DIR} ; then
|
||||||
rm -rf "${DATA_DIR}/work/trained_models" "${DATA_DIR}/work/wake_word_samples" \
|
rm -rf \
|
||||||
"${DATA_DIR}/work/wake_word_samples_augmented" "${DATA_DIR}/work/last_wake_word" || :
|
"${DATA_DIR}/work/trained_models" \
|
||||||
|
"${DATA_DIR}/work/wake_word_samples" \
|
||||||
|
"${DATA_DIR}/work/wake_word_samples_augmented" \
|
||||||
|
"${DATA_DIR}/work/personal_augmented_features" \
|
||||||
|
"${DATA_DIR}/work/last_wake_word" || :
|
||||||
fi
|
fi
|
||||||
|
|
||||||
END_TS=$EPOCHSECONDS
|
END_TS=$EPOCHSECONDS
|
||||||
|
|
||||||
python -c $'print(f"{\'=\' * 80}")'
|
python -c $'print(f"{\'=\' * 80}")'
|
||||||
printf "%44s\n\n" "Training Summary"
|
printf "%44s\n\n" "Training Summary"
|
||||||
"${PROGDIR}/system_summary"
|
"${CLIDIR}/system_summary"
|
||||||
echo
|
echo
|
||||||
print_elapsed_time --no-separators "${START_TS}" "${POST_GEN_TS}" "Generate ${SAMPLES} samples, ${BATCH_SIZE}/batch"
|
print_elapsed_time --no-separators "${START_TS}" "${POST_GEN_TS}" "Generate ${SAMPLES} samples, ${BATCH_SIZE}/batch"
|
||||||
print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augment ${SAMPLES} samples"
|
print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augment ${SAMPLES} samples"
|
||||||
Reference in New Issue
Block a user