diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..545370a
Binary files /dev/null and b/.DS_Store differ
diff --git a/cli/.bashrc b/.bashrc
similarity index 100%
rename from cli/.bashrc
rename to .bashrc
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 261eeb9..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
diff --git a/README.md b/README.md
index 75eefd8..359d73d 100644
--- a/README.md
+++ b/README.md
@@ -1,123 +1,507 @@
-
-
-
microWakeWord Trainer Docker
-
+# Run training from the command line
-# š„ MicroWakeWord Trainer ā Tater Approved
+## Overview
-**ā Tater Totterson tested & working on an NVIDIA RTX 3070 Laptop GPU (8 GB VRAM).**
-Easily train microWakeWord detection models with this pre-built Docker image and JupyterLab notebook.
+With these scripts and Dockerfile, you can train new wake words from the
+command line without using a Jupyter notebook.
----
+Differences between this Docker image and the Jupyter notebook image:
-## š Quick Start
+* The Python training environment isn't included in the image. Instead, a
+ "virtual environment" (venv) is created in the `/data` directory which you
+ will have mounted to a host directory. This cuts about 7gb from the image
+ and allows the virtualenv to persist across container instances.
-Follow these steps to get up and running:
+* The logic from the Jupyter notebook is contained in individual Python
+ and shell scripts
-### 1ļøā£ Pull the Pre-Built Docker Image
+* No ports need to be exposed since the Jupyter notebook server isn't being
+ run.
-```bash
-docker pull ghcr.io/tatertotterson/microwakeword:latest
+## TL;DR
+
+For the impatient among you...
+
+```shell
+$ mkdir /some/work/directory # On a device with more than 150GB free space
+$ docker build -t microwakeword-cli:latest .
+$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
+root@mww-cli:/# cd /data
+root@mww-cli:/data# setup_python_venv
+##### You have about 4 minutes to drink coffee
+
+root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
+##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
+
+root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
+##### You have about 30-45 minutes for a nap depending on available system resources.
+##### You'll be informed of where to find your trained model.
```
----
+Load the trained model on your device and give it a try but don't be surprised
+if you get a lot of missed or false activations. Read on to find out why.
-### 2ļøā£ Run the Docker Container
+## Get Started
-```bash
-docker run --rm -it \
- --gpus all \
- -p 8888:8888 \
- -v $(pwd):/data \
- ghcr.io/tatertotterson/microwakeword:latest
+Good, you stuck around! Now read the rest of the document before doing
+anything.
+
+### Using a GPU
+
+Having an Nvidia GPU available can cut the training time by up to half. The
+open-source nouveau driver shipped with Linux kernels doesn't support CUDA
+however so if you have an Nvidia GPU and want to use it for training, you'll
+need to install the official Nvidia driver from
+https://www.nvidia.com/en-in/drivers/unix/
+
+### Build the image
+
+You can use either Docker or Podman as your container management tool.
+`docker` is used in the examples but if you have podman, just substitute
+the command.
+
+Start by navigating to the directory that contains this README file and
+the accompanying Dockerfile. Then...
+
+
+```shell
+docker build -t microwakeword-cli:latest .
```
-**What these flags do:**
-- `--gpus all` ā Enables GPU acceleration
-- `-p 8888:8888` ā Exposes JupyterLab on port 8888
-- `-v $(pwd):/data` ā Saves your work in the current folder
+This should be fairly quick and result in an image that's about 320mb in size
+as it's basically a standard Ubuntu 24.04 image with a few added tools.
----
+So why isn't a pre-built image available for download? Because it'll probably
+take longer to download a pre-built image than for you to create it locally.
+GitHub's container registry is notoriously erratic when it comes to download
+throughput.
-### 3ļøā£ Open JupyterLab
+### Create a host work directory
-Visit [http://localhost:8888](http://localhost:8888) in your browser ā the notebook UI will open.
+This directory will contain the Python virtual environment plus all of the
+downloaded and generated data needed for training and the final trained
+models. A full environment will need about 150gb of free space but read
+further to see how to reduce this.
----
+Your `<host_work_directory>` will be mounted inside the container as `/data`.
-### 4ļøā£ Set Your Wake Word
+The training container will start a Bash shell so if you have Bash
+aliases or Bashy things you like, create a `.bashrc` file in your
+`<host_work_directory>` and put them in there. It'll automatically be included
+any time you enter the container.
-At the **top of the notebook**, find this line:
+### Create and start the container
-```bash
-TARGET_WORD = "hey_tater" # Change this to your desired wake word
+There are lots of options that control container creation. The simplest example
+will create the container and give you an interactive shell. When you exit the
+shell, the container will be stopped and removed leaving your `<host_work_directory>`
+intact.
+
+```shell
+$ docker run -it --rm --gpus=all -v <host_work_directory>:/data microwakeword-cli:latest
```
-Change `"hey_tater"` to your desired wake word (phonetic spellings often work best).
+Options:
----
+* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
+* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
+ around and give it a name for training more than one wake word. You
+ can stop and remove it when you're ready.
+* Add a `-d` option to start the container in the background and use `docker
+ attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
-### 5ļøā£ Run the Notebook
+When the container starts, you'll see:
-Run all cells in the notebook. This process will:
-- Generate wake word samples
-- Train a detection model
-- Output a quantized `.tflite` model ready for on-device use
+```text
+=======================================================
+WARNING: A python virtual environment wasn't found
+at /data/.venv. You'll need to run setup_python_venv
+before you'll be able to use this container for
+training.
+=======================================================
+root@mww-cli:/#
+```
----
+Don't worry about the python WARNING right now. You'll be creating the
+virtualenv in the next step.
-### 6ļøā£ Retrieve the Trained Model & JSON
+If you've forgotten to create and/or mount your host data directory, you'll
+see an additional warning:
-When training finishes, download links for both the `.tflite` model and its `.json` manifest will be displayed in the last cell.
+```text
+=======================================================
+WARNING: The /data directory is NOT mounted.
+Running the training process without /data mounted
+could add over 140Gb of python packages and training
+files to this container's storage which is probably
+NOT what you want.
----
+You should remove this container and re-create it with
+a 'docker run' option like '-v <host_work_directory>:/data'
+making sure the host directory is on a device that has
+enough free space.
+=======================================================
+```
-## š Resetting to a Clean State
+You can certainly continue but it's a "really bad idea"™ because your
+container storage could grow from a few hundred mb to over 140gb.
-If you need to start fresh:
+At this point, you're in a Bash shell.
-1. Delete the `data` folder that was mapped to your Docker container.
-2. Restart the container using the steps above.
-3. A fresh copy of the notebook will be placed into the `data` directory.
+### Create the Python virtual environment
----
+The Python virtual environment will contain all the software needed to train.
+It gets created as `/data/.venv` and will take up about 11gb of disk space.
-## š¤ Optional: Personal Voice Samples
+The scripts that do all the work will be in the container's PATH so to setup
+the virtual environment and install all of the packages, just run:
-In addition to synthetic TTS samples, the trainer can optionally use your own real voice recordings to significantly improve accuracy for your voice and environment.
+```text
+setup_python_venv [ --verbose ]
-### How it works
-- If a folder named personal_samples/ exists and contains .wav files, the trainer will:
- - Automatically extract features from those recordings
- - Include them during training alongside the synthetic TTS data
- - Up-weight your personal samples during training for better real-world performance
+Options:
-No extra flags or configuration are required ā it is detected automatically.
+--verbose: Print the detailed "pip install" output.
-### How to use it
-1. Create a folder in the repo root:
- mkdir personal_samples
+```
-2. Record yourself saying the wake word naturally and save the files as .wav:
- personal_samples/
- hey_tater_01.wav
- hey_tater_02.wav
- hey_tater_03.wav
- ...
+When the installation is finished, a test of the major components will be
+run.
+
+Once the process is done, you should change to the `/data` directory and
+activate the virtual environment with:
+
+```shell
+root@mww-cli:/# cd /data
+root@mww-cli:/data# source .venv/bin/activate
+(.venv) root@mww-cli:/data#
+```
+
+Technically, you don't need to do either of these since the scripts
+are in the PATH and they know to use the `/data` directory for everything.
+It's more of an "if you're interested" thing.
+
+At this point, you have a container with all software installed.
+
+## Get the reference data
+
+The training process itself relies on a significant amount of audio reference
+data that creates a simulated "audio environment" that your wake word will be
+trained in. These "training datasets" include things like varying amounts of
+reverberation, background music, background conversations, background noise,
+etc. All said and done, it amounts to about 30gb of audio but with the
+downloaded archives and extracted intermediate files, you'll need about 85gb
+of free space. Thankfully, you only need to download the files once no
+matter how many wake words you want to train and since it's stored in
+`/data`, you can even remove the docker container and recreate it without
+losing any of it. There are 4 datasets that are required.
+
+This is a three stage process...
+
+1. Download zipfiles or tarballs. (about 30gb)
+2. Extract them. (about 50gb)
+3. Convert them into the final form. (about 31gb)
+
+NOTE: The sizes add up to more than the 85gb stated earlier because one
+of the datasets doesn't need to be converted and is counted in both
+steps 2 and 3. You really do only need 85gb.
+
+To download the archives, unpack them, and convert the audio to what's needed
+by the training process, run:
+
+```text
+setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
+
+Options:
+--cleanup-archives: Automatically delete the tarballs or zipfiles after
+ they've been extracted.
+
+--cleanup-intermediate-files: Automatically delete the intermediate files
+ after they've been converted.
+
+```
+
+On a 1gb/sec Internet connection, this will take about 25 minutes.
+
+The script detects if the datasets have already been downloaded, extracted
+and/or converted and skips those steps as appropriate so if you've run the
+script without the cleanup options, you can just run it again with those
+options to clean them up.
+
+Now you're ready to train a wake word. Almost.
+
+## Train a Wake Word
+
+Training is done in 3 stages.
+
+1. Generate thousands of samples of the wake word with various voices,
+pitches, speeds, inflections, etc.
+2. Augment the samples with the training datasets to add background noise, etc.
+3. Run the Tensorflow training.
+
+### Generate a sample for verification
+
+Before you start the full process, you're going to want to generate a single
+wake word sample and play it back to ensure it sounds right. The wake word
+should be spelled phonetically to give the sample generator the best chance
+of success.
+
+```text
+root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
+===== Generating 1 sample of 'hey buster' =====
+ Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
+ Successfully loaded the model
+ Batch 1/0 complete
+ Done
+Sample available at /data/work/test_sample/hey_buster.wav
+Play it from your host.
+```
+
+You should then play that file from your host. The reason I used "hey buster"
+as the wake word is to demonstrate why it's important to generate and listen
+to a sample. If you try that exact input and play it back, you'll notice
+that the generator didn't capture the "er" at the end very well. To get it to
+do so, I had to add a period on the end as a "spacer".
+"hey buster." worked much better.
+
+When you're happy with the sample, you can run the full process.
+
+### Run the full training process
+
+```text
+train_wake_word [ --samples=<samples> ] [ --batch-size=<batch_size> ]
+ [ --training-steps=<training_steps> ] [ --cleanup-work-dir ]
+ <wake_word> [ <wake_word_title> ]
+
+Options:
+--samples: The number of samples to generate for the wake word.
+ Default: 20000
+
+--batch-size: How many samples should be generated at a time. The more
+ samples, the more memory is needed.
+ Default: 100
+
+--training-steps: Number of training steps. More training steps means better
+ detection and false positive rates but also more time to train.
+ Default: 25000
+
+--cleanup-work-dir: Delete the /data/work directory after successful training.
+ Default: false
+
+<wake_word>: The word to train spelled phonetically.
+ Required.
+
+<wake_word_title>: An optional pretty name to save to the json metadata file.
+ Default: The wake word with individual words capitalized
+ and punctuation removed.
+
+```
+
+By default, the training process creates 20,000 samples of your wake word and
+runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results)
+in the [Extra Credit](#extra-credit) section below for
+why these are the defaults. Depending on resources available, this could take
+between 30 and 60 minutes.
+
+The resulting tflite model files and logs will be placed in the
+`/data/output/---` directory
+and will therefore be available from your host in the directory you mapped
+`/data` to. File names will have non-filename-friendly characters in your
+wake word changed to underscores to make things easier. You'll need both the
+tflite and json files to load on your device. Exactly how you load them
+depends on the device and is beyond the scope of this project.
+
+The only real measure of success is how well the resulting model works
+on a real device. If you encounter too many missed or false activations,
+increasing the number of samples would probably improve the results more
+than increasing the number of training steps. See
+[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
+
+The output from the last step is filtered some by the script but still quite
+verbose. The full log will be available in the output directory as
+`training.log` if you're interested. Interpreting the log is beyond the scope
+of this project however.
+
+You can train additional wake words or change the number of samples and
+training steps by simply running `train_wake_word` again. No need to repeat
+any of the earlier setup steps. If you change the wake word or the number of
+wake word samples, the work directory will be deleted and all 3 steps re-run.
+If you only change the number of training steps, the data from the first two
+steps is still valid and only the 3rd step is run.
+
+All of the intermediate data is stored in the `/data/work` directory which will
+grow to about 17gb with 20,000 wake word samples. Once the tflite model is
+successfully generated and you're happy with the results, you can delete the
+`/data/work` directory.
+
+### Training more than one wake word
+
+Once you have a container running, you
+can easily train multiple wake words from your host:
+
+```shell
+for wp in "hey_alexa" "hey_jenkins" ; do
+ docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
+done
+```
+
+### Training time examples
+
+Training times depend on lots of things. These are examples only.
+Your Mileage May Vary!!!
+
+```text
+===============================================================================
+ Training Summary
+
+CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
+GPU: N/A
+
+ Generate 10000 samples, 100/batch Elapsed time: 0:06:17
+ Augment 10000 samples Elapsed time: 0:04:05
+ 10000 training steps Elapsed time: 0:15:04
+ ==================================================
+ Total Elapsed time: 0:25:26
+================================================================================
+
+================================================================================
+ Training Summary
+
+CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
+GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
+
+ Generate 10000 samples, 100/batch Elapsed time: 0:00:29
+ Augment 10000 samples Elapsed time: 0:03:40
+ 10000 training steps Elapsed time: 0:08:00
+ ======================================================
+ Total Elapsed time: 0:12:09
+================================================================================
+
+================================================================================
+ Training Summary
+
+CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
+GPU: N/A
+
+ Generate 20000 samples, 100/batch Elapsed time: 0:10:38
+ Augment 20000 samples Elapsed time: 0:07:04
+ 25000 training steps Elapsed time: 0:25:21
+ ======================================================
+ Total Elapsed time: 0:43:03
+================================================================================
+
+================================================================================
+ Training Summary
+
+CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
+GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
+
+ Generate 20000 samples, 100/batch Elapsed time: 0:00:53
+ Augment 20000 samples Elapsed time: 0:07:05
+ 25000 training steps Elapsed time: 0:19:13
+ ======================================================
+ Total Elapsed time: 0:27:11
+================================================================================
+
+================================================================================
+ Training Summary
+
+CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
+GPU: N/A
+
+ Generate 50000 samples, 100/batch Elapsed time: 0:30:47
+ Augment 50000 samples Elapsed time: 0:20:22
+ 40000 training steps Elapsed time: 1:01:51
+ ==================================================
+ Total Elapsed time: 1:53:00
+================================================================================
+
+================================================================================
+ Training Summary
+
+CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
+GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
+
+ Generate 50000 samples, 100/batch Elapsed time: 0:02:08
+ Augment 50000 samples Elapsed time: 0:19:13
+ 40000 training steps Elapsed time: 0:42:23
+ ======================================================
+ Total Elapsed time: 1:03:44
+================================================================================
+
+
+```
+
+The sample generation process is really the only one that uses multiple CPUs so
+having fewer CPU threads available will probably make little difference.
+
+## Extra Credit
+
+### Training defaults
+
+If you plan on training multiple wake words, you can set your own default
+training parameters by creating a `/data/.defaults.env` file with the
+following contents:
+
+```shell
+# Variable names follow the command line parameters converted to upper case
+# and with the dashes ('-') converted to underscores ('_').
+export SAMPLES=10000
+export TRAINING_STEPS=10000
+
+# Don't use the GPU for any operations. Stick with the CPU only.
+##export CUDA_VISIBLE_DEVICES=-1
+
+```
+
+### Examine your model with Tensorboard
+
+Tensorboard is a web-based graphical model viewer. You can use it to get an
+idea of how many training steps are needed before accuracy results stop
+improving. To use it, you'll have to expose port 6006 by adding `-p
+6006:6006` to your `docker run` command line. If you didn't, don't worry.
+Remember, the /data directory is mapped to a directory on your host so you
+can simply stop and delete the current container and recreate it with the new
+`docker run` command. No need to re-run any of the setup or training steps.
+
+To start Tensorboard, run:
+
+```shell
+root@mww-cli:/# cd /data
+root@mww-cli:/data# source .venv/bin/activate
+(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
+```
+
+Now on your host, point your browser at `http://localhost:6006/`,
+click "SCALARS" at the top and take a look at the various charts. You'll see
+a "train" and "validation" item for each training run you've performed. It's
+the "train" items you're interested in.
+
+
+
+You have to be a Tensorflow expert to decipher most of the charts but
+the "Accuracy" chart for this particular wake word and 50,000 samples would
+seem to indicate that there's very little improvement after about 20,000
+training steps.
+
+
+
+In contrast, with only 5,000 wake word samples, there's still improvement to be had after
+20,000 training steps.
+
+
+
+Given that it's faster to generate wake word samples than it is to train,
+20,000 samples and 25,000 training steps seems like a good compromise. This
+chart has a bit less smoothing to show a bit more detail and includes the
+50,000 sample run as well. This run took only 27 minutes as opposed to the
+63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
+25,000 are the defaults for these scripts.
+
+
-3. Run the training script as normal:
-If personal samples are found, youāll see a message during training indicating they are being included.
-### Recording tips
-- 10ā30 recordings is usually enough to see a noticeable improvement
-- Vary distance, volume, and tone slightly
-- Record in the same environment where the wake word will be used (room noise matters)
-- Use 16-bit WAV files if possible (most recorders do this by default)
----
-## š Credits
-This project builds upon the excellent work of [kahrendt/microWakeWord](https://github.com/kahrendt/microWakeWord).
-Huge thanks to the original authors for their contributions to the open-source community!
diff --git a/cli/.DS_Store b/cli/.DS_Store
new file mode 100644
index 0000000..81f16e0
Binary files /dev/null and b/cli/.DS_Store differ
diff --git a/cli/Dockerfile b/cli/Dockerfile
deleted file mode 100644
index c460d93..0000000
--- a/cli/Dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-# Since this is a pure python environment, we don't need to start
-# with a huge CUDA image. A standard Ubuntu image will do.
-FROM ubuntu:24.04
-
-ENV DEBIAN_FRONTEND=noninteractive \
- PYTHONUNBUFFERED=1 \
- PIP_NO_CACHE_DIR=1 \
- PIP_ROOT_USER_ACTION=ignore \
- HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
- PATH="/root/mww-scripts:${PATH}"
-
-# System deps
-RUN apt-get update && apt-get install -y --no-install-recommends \
- python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
- git wget curl unzip ca-certificates nano less \
- && rm -rf /var/lib/apt/lists/* \
- && mkdir -p /data
-
-COPY --chown=root:root --chmod=0755 .bashrc /root/
-COPY --chown=root:root --chmod=0755 setup_* wake_word_sample* train_wake_word \
- test_python cudainfo system_summary shell.functions requirements.txt /root/mww-scripts/
-
-# Docker and Podman send the CMD a SIGTERM when you "stop" the container. Unfortunately, bash
-# normally doesn't exit when it recieves a SIGTERM so docker/podman has to wait for the "stop"
-# to timeout then SIGKILL the container.
-# This little scriptlet causes bash to exit immediately when it receives the SIGTERM.
-CMD ["/usr/bin/bash", "-c", "exec /usr/bin/bash --rcfile <(echo '[ -f ~/.bashrc ] && source ~/.bashrc ; trap exit SIGTERM ;')" ]
diff --git a/cli/README.md b/cli/README.md
deleted file mode 100644
index 359d73d..0000000
--- a/cli/README.md
+++ /dev/null
@@ -1,507 +0,0 @@
-# Run training from the command line
-
-## Overview
-
-With these scripts and Dockerfile, you can train new wake words from the
-command line without using a Jupyter notebook.
-
-Differences between this Docker image and the Jupyter notebook image:
-
-* The Python training environment isn't included in the image. Instead, a
- "virtual environment" (venv) is created in the `/data` directory which you
- will have mounted to a host directory. This cuts about 7gb from the image
- and allows the virtualenv to persist across container instances.
-
-* The logic from the Jupyter notebook is contained in individual Python
- and shell scripts
-
-* No ports need to be exposed since the Jupyter notebook server isn't being
- run.
-
-## TL;DR
-
-For the impatient among you...
-
-```shell
-$ mkdir /some/work/directory # On a device with more than 150GB free space
-$ docker build -t microwakeword-cli:latest .
-$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest
-root@mww-cli:/# cd /data
-root@mww-cli:/data# setup_python_venv
-##### You have about 4 minutes to drink coffee
-
-root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files
-##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection)
-
-root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word"
-##### You have about 30-45 minutes for a nap depending on available system resources.
-##### You'll be informed of where to find your trained model.
-```
-
-Load the trained model on your device and give it a try but don't be surprized
-if you get a lot of missed or false activations. Read on to find out why.
-
-## Get Started
-
-Good, you stuck around! Now read the rest of the document before doing
-anything.
-
-### Using a GPU
-
-Having an Nvidia GPU available can cut the training time by up to half. The
-open-source nouveau driver shipped with Linux kernels doesn't support CUDA
-however so if you have an Nvidia GPU and want to use it for training, you'll
-need to install the official Nvidia driver from
-https://www.nvidia.com/en-in/drivers/unix/
-
-### Build the image
-
-You can use either Docker or Podman as your container management tool.
-`docker` is used in the examples but if you have podman, just substitute
-the command.
-
-Start by navigating to the directory that contains this README file and
-the accompanying Dockerfile. Then...
-
-
-```shell
-docker build -t microwakeword-cli:latest .
-```
-
-This should be fairly quick and result in an image that's about 320mb in size
-as it's basically a standard Ubunbtu24.04 image with a few added tools.
-
-So why isn't a pre-built image available for download? Because it'll probably
-take longer to download a pre-built image than for you to create it locally.
-GitHub's container registry is notoriously erratic when it comes to download
-throughput.
-
-### Create a host work directory
-
-This directory will contain the Python virtual environment plus all of the
-downloaded and generated data needed for training and the final trained
-models. A full environment will need about 150gb of free space but read
-further to see how to reduce this.
-
-Your `` will be mounted inside the container as `/data`.
-
-The training container will start a Bash shell so if you have Bash
-aliases or Bashy things you like, create a `.bashrc` file in your
-`` and put them in there. It'll automatically be included
-any time you enter the container.
-
-### Create and start the container
-
-There are lots of options that control container creation. The simplest example
-will create the container and give you an interactive shell. When you exit the
-shell, the container will be stopped and removed leaving your ``
-intact.
-
-```shell
-$ docker run -it --rm --gpus=all -v :/data microwakeword-cli:latest
-```
-
-Options:
-
-* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it.
-* Remove the `--rm` and add a `--name=mww-cli` option to keep the container
- around and give it a name for training more than one wake word. You
- can stop and remove it when you're ready.
-* Add a `-d` option to start the container in the background and use `docker
- attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it.
-
-When the container starts, you'll see:
-
-```text
-=======================================================
-WARNING: A python virtual environment wasn't found
-at /data/.venv. You'll need to run setup_python_venv
-before you'll be able to use this container for
-training.
-=======================================================
-root@mww-cli:/#
-```
-
-Don't worry about the python WARNING right now. You'll be creating the
-virtualenv in the next step.
-
-If you've forgotton to create and/or mount your host data directory, you'll
-see an additional warning:
-
-```text
-=======================================================
-WARNING: The /data directory is NOT mounted.
-Running the training process without /data mounted
-could add over 140Gb of python packages and training
-files to this container's storage which is probably
-NOT what you want.
-
-You should remove this container and re-create it with
-a 'docker run' option like '-v :/data'
-making sure the host directory is on a device that has
-enough free space.
-=======================================================
-```
-
-You can certainly continue but it's a "really bad idea"™ because your
-container storage could grow from a few hundred mb to over 140gb.
-
-At this point, you're in a Bash shell.
-
-### Create the Python virtual environment
-
-The Python virtual environment will contain all the software needed to train.
-It gets created as `/data/.venv` and will take up about 11gb of disk space.
-
-The scripts that do all the work will be in the container's PATH so to setup
-the virtual environment and install all of the packages, just run:
-
-```text
-setup_python_venv [ --verbose ]
-
-Options:
-
---verbose: Print the detailed "pip install" output.
-
-```
-
-When the installation is finished, a test of the major components will be
-run.
-
-Once the process is done, you should change to the `/data` directory and
-activate the virtual environment with:
-
-```shell
-root@mww-cli:/# cd /data
-root@mww-cli:/data# source .venv/bin/activate
-(.venv) root@mww-cli:/data#
-```
-
-Technically, you don't need to do either of these since the scripts
-are in the PATH and they know to use the `/data` directory for everything.
-It's more of an "if you're interested" thing.
-
-At this point, you have a container with all software installed.
-
-## Get the reference data
-
-The training process itself relies on a significant amount of audio reference
-data that creates a simulated "audio environment" that your wake word will be
-trained in. These "training datasets" include things like varying amounts of
-reverberation, background music, background conversations, background noise,
-etc. All said and done, it amounts to about 30gb of audio but with the
-downloaded archives and extracted intermediate files, you'll need about 85gb
-of free space. Thankfully, you only need to download the files once no
-matter how many wake words you want to train and since it's stored in
-`/data`, you can even remove the docker container and recreate it without
-losing any of it. There are 4 datasets that are required.
-
-This is a three stage process...
-
-1. Download zipfiles or tarballs. (about 30gb)
-2. Extract them. (about 50gb)
-3. Convert them into the final form. (about 31gb)
-
-NOTE: The sizes add up to more than the 85gb stated earlier because one
-of the datasets doesn't need to be covnerted and is counted in both
-steps 2 and 3. You really do only need 85gb.
-
-To download the archives, unpack them, and convert the audio to what's needed
-by the training process, run:
-
-```text
-setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ]
-
-Options:
---cleanup-archives: Automatically delete the tarballs or zipfiles after
- they've been extracted.
-
---cleanup-intermediate-files: Automatically delete the intermediate files
- after they've been converted.
-
-```
-
-On a 1gb/sec Internet connection, this will take about 25 minutes.
-
-The script detects if the datasets have already been downloaded, extracted
-and/or converted and skips those steps as appropriate so if you've run the
-script without the cleanup options, you can just run it again with those
-options to clean them up.
-
-Now you're ready to train a wake word. Almost.
-
-## Train a Wake Word
-
-Training is done in 3 stages.
-
-1. Generate thousands of samples of the wake word with various voices,
-pitches, speeds, inflections, etc.
-2. Augment the samples with the training datasets to add background noise, etc.
-3. Run the Tensorflow training.
-
-### Generate a sample for verification
-
-Before you start the full process, you're going to want to generate a single
-wake word sample and play it back to ensure it sounds right. The wake word
-should be spelled phonetically to give the sample generator the best chance
-of success.
-
-```text
-root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster"
-===== Generating 1 sample of 'hey buster' =====
- Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt
- Successfully loaded the model
- Batch 1/0 complete
- Done
-Sample available at /data/work/test_sample/hey_buster.wav
-Play it from your host.
-```
-
-You should then play that file from your host. The reason I used "hey buster"
-as the wake word is to demonstrate why it's important to generate and listen
-to a sample. If you try that exact input and play it back, you'll notice
-that the generator didn't capture the "er" at the end very well. To get it to
-do so, I had to add a period on the end as a "spacer".
-"hey buster." worked much better.
-
-When you're happy with the sample, you can run the full process.
-
-### Run the full training process
-
-```text
-train_wake_word [ --samples= ] [ --batch-size= ]
- [ --training-steps= ] [ --cleanup-work-dir ]
- [ ]
-
-Options:
---samples: The number of samples to generate for the wake word.
- Default: 20000
-
---batch-size: How many samples should be generated at a time. The more
- samples, the more memory is needed.
- Default: 100
-
---training-steps: Number of training steps. More training steps means better
- detection and false positive rates but also more time to train.
- Default: 25000
-
---cleanup-work-dir: Delete the /data/work directory after successful training.
- Default: false
-
- The word to train spelled phonetically.
- Required.
-
- An optional pretty name to save to the json metadata file.
- Default: The wake word with individual words capitalized
- and punctuation removed.
-
-```
-
-By default, the training process creates 20,000 samples of your wake word and
-runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results)
-in the [Extra Credit](#extra-credit) section below for
-why these are the defaults. Depending on resources available, this could take
-between 30 and 60 minutes.
-
-The resulting tflite model files and logs will be placed in the
-`/data/output/---` directory
-and will therefore be available from your host in the directory you mapped
-`/data` to. File names will have non-filename-friendly characters in your
-wake word changed to underscores to make things easier. You'll need both the
-tflite and json files to load on your device. Exactly how you load them
-depends on the device and is beyond the scope of this project.
-
-The only real measure of success is how well the resulting model works
-on a real device. If you encounter too many missed or false activations,
-increasing the number of samples would probably improve the results more
-than increasing the number of training steps. See
-[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below.
-
-The output from the last step is filtered some by the script but still quite
-verbose. The full log will be available in the output directory as
-`training.log` if you're interested. Intepreting the log is beyond the scope
-of this project however.
-
-You can train additional wake words or change the number of samples and
-training steps by simply running `train_wake_word` again. No need to repeat
-any of the earlier setup steps. If you change the wake word or the number of
-wake word samples, the work directory will be deleted and all 3 steps re-run.
-If you only change the number of training steps, the data from the first two
-steps is still valid and only the 3rd step is run.
-
-All of the intermediate data is stored in the `/data/work` directory which will
-grow to about 17gb with 20,000 wake word samples. Once the tflite model is
-successfully generated and you're happy with the results, you can delete the
-`/data/work` directory.
-
-### Training more than one wake word
-
-Once you have a container running, you
-can easily train multiple wake words from your host:
-
-```shell
-for wp in "hey_alexa" "hey_jenkins" ; do
- docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp"
-done
-```
-
-### Training time examples
-
-Training times depend on lots of things. These are examples only.
-Your Mileage May Vary!!!
-
-```text
-===============================================================================
- Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
-GPU: N/A
-
- Generate 10000 samples, 100/batch Elapsed time: 0:06:17
- Augment 10000 samples Elapsed time: 0:04:05
- 10000 training steps Elapsed time: 0:15:04
- ==================================================
- Total Elapsed time: 0:25:26
-================================================================================
-
-================================================================================
- Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
-GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
-
- Generate 10000 samples, 100/batch Elapsed time: 0:00:29
- Augment 10000 samples Elapsed time: 0:03:40
- 10000 training steps Elapsed time: 0:08:00
- ======================================================
- Total Elapsed time: 0:12:09
-================================================================================
-
-================================================================================
- Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
-GPU: N/A
-
- Generate 20000 samples, 100/batch Elapsed time: 0:10:38
- Augment 20000 samples Elapsed time: 0:07:04
- 25000 training steps Elapsed time: 0:25:21
- ======================================================
- Total Elapsed time: 0:43:03
-================================================================================
-
-================================================================================
- Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
-GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
-
- Generate 20000 samples, 100/batch Elapsed time: 0:00:53
- Augment 20000 samples Elapsed time: 0:07:05
- 25000 training steps Elapsed time: 0:19:13
- ======================================================
- Total Elapsed time: 0:27:11
-================================================================================
-
-================================================================================
- Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
-GPU: N/A
-
- Generate 50000 samples, 100/batch Elapsed time: 0:30:47
- Augment 50000 samples Elapsed time: 0:20:22
- 40000 training steps Elapsed time: 1:01:51
- ==================================================
- Total Elapsed time: 1:53:00
-================================================================================
-
-================================================================================
- Training Summary
-
-CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb
-GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb
-
- Generate 50000 samples, 100/batch Elapsed time: 0:02:08
- Augment 50000 samples Elapsed time: 0:19:13
- 40000 training steps Elapsed time: 0:42:23
- ======================================================
- Total Elapsed time: 1:03:44
-================================================================================
-
-
-```
-
-The sample generation process is really the only one that uses multiple CPUs so
-having fewer CPU threads available will probably make little difference.
-
-## Extra Credit
-
-### Training defaults
-
-If you plan on training multiple wake words, you can set your own default
-training parameters by creating a `/data/.defaults.env` file with the
-following contents:
-
-```shell
-# Variable names follow the command line parameters converted to upper case
-# and with the dashes ('-') converted to underscores ('_').
-export SAMPLES=10000
-export TRAINING_STEPS=10000
-
-# Don't use the GPU for any operations. Stick with the CPU only.
-##export CUDA_VISIBLE_DEVICES=-1
-
-```
-
-### Examine your model with Tensorboard
-
-Tensorboard is a web-based graphical model viewer. You can use it to get an
-idea of how many training steps are needed before accuracy results stop
-improving. To use it, you'll have to expose port 6006 by adding `-p
-6006:6006` to your `docker run` command line. If you didn't, don't worry.
-Remember, the /data directory is mapped to a directory on your host so you
-can simply stop and delete the current container and recreate it with the new
-`docker run` command. No need to re-run any of the setup or training steps.
-
-To start Tensorboard, run:
-
-```shell
-root@mww-cli:/# cd /data
-root@mww-cli:/data# source .venv/bin/activate
-(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output
-```
-
-Now on your host, point your browser at `http://localhost:6006/`,
-click "SCALARS" at the top and take a look at the various charts. You'll see
-a "train" and "validation" item for each training run you've performed. It's
-the "train" items you're interested in.
-
-
-
-You have to be a Tensorflow expert to decipher most of the charts but
-the "Accuracy" chart for this particular wake word and 50,000 samples would
-seem to idicate that there's very little improvement after about 20,000
-training steps.
-
-
-
-In contrast, with only 5,000 wake word samples, there's still improvement to be had after
-20,000 training steps.
-
-
-
-Given that it's faster to generate wake word samples than it is to train,
-20,000 samples and 25,000 training steps seems like a good compromise. This
-chart has a bit less smoothing to show a bit more detail and includes the
-50,000 sample run as well. This run took only 27 minutes as opposed to the
-63 minutes it took for the 50,000 sample run. Now you know why 20,000 and
-25,000 are the defaults for these scripts.
-
-
-
-
-
-
-
-
diff --git a/cli/requirements.txt b/cli/requirements.txt
deleted file mode 100644
index a0e801b..0000000
--- a/cli/requirements.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# --- Packages needed by our scripts ---
-
-numpy==1.26.4
-scipy==1.12.0
-librosa==0.10.2.post1
-soundfile==0.12.1
-tqdm==4.67.1
-scikit-learn==1.6.0
-numba==0.63.1
-PyYAML==6.0.3
diff --git a/cli/setup_python_venv b/cli/setup_python_venv
index 153d43d..4a77557 100755
--- a/cli/setup_python_venv
+++ b/cli/setup_python_venv
@@ -1,5 +1,6 @@
#!/bin/bash
-PROGDIR="$(dirname $(realpath $0))"
+PROGDIR="$(dirname "$(realpath "$0")")"
+ROOTDIR="$(dirname "${PROGDIR}")"
KNOWN_ARGS=( data-dir python gpu no-gpu )
source "${PROGDIR}/shell.functions"
@@ -27,7 +28,7 @@ EOF
exit 1
fi
-[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath ${DATA_DIR})"
+[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath "${DATA_DIR}")"
[ -d "${DATA_DIR}" ] || {
echo "Data directory '${DATA_DIR}' doesn't exist." >&2
exit 1
@@ -52,7 +53,8 @@ if [ -n "${PYTHON}" ] ; then
PYTHONS=( "${PYTHON}" )
unset PYTHON
else
- PYTHONS=( python3.12 python3.10 )
+ # Add 3.11 as a common middle-ground (especially outside Ubuntu 24.04)
+ PYTHONS=( python3.12 python3.11 python3.10 )
fi
for p in "${PYTHONS[@]}" ; do
@@ -60,14 +62,14 @@ for p in "${PYTHONS[@]}" ; do
done
[ -n "${PYTHON}" ] || {
- echo "A python 3.12 or 3.10 interpreter wasn't found. You 'll need to install one before proceeding." >&2
+ echo "A python 3.12/3.11/3.10 interpreter wasn't found. You'll need to install one before proceeding." >&2
exit 1
}
-if [ -d "${VENV}" ] ; then
+if [ -d "${VENV}" ] ; then
if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then
source "${VENV}/bin/activate" || {
- echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
+ echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2
exit 1
}
else
@@ -82,24 +84,28 @@ if [ -z "$VIRTUAL_ENV" ] ; then
else
echo " ===== Updating virtualenv at '${VENV}' ====="
fi
+
${PYTHON} -m venv --upgrade-deps "${VENV}"
source "${VENV}/bin/activate"
set -euo pipefail
-declare -a progfiles=( $(find ${PROGDIR} -mindepth 1 -maxdepth 1 -executable -type f) )
+# Symlink CLI scripts into .venv/bin
+declare -a progfiles=( $(find "${PROGDIR}" -mindepth 1 -maxdepth 1 -executable -type f) )
progfiles+=( "${PROGDIR}/shell.functions" )
+# Also symlink the top-level entrypoint if present
+[ -x "${ROOTDIR}/train_wake_word" ] && progfiles+=( "${ROOTDIR}/train_wake_word" )
+
for f in "${progfiles[@]}" ; do
- ln -sfr "${f}" ".venv/bin/$(basename ${f})"
+ ln -sfr "${f}" ".venv/bin/$(basename "${f}")"
done
#
-# Pip doesn't process packages from requirements.txt in
-# order but order is important because tensorflow, torch,
-# onnxruntime and micro-wake-word all depend on CUDA packages
-# at various versions. They need to be installed in this specific
-# order or they may not be able to use the GPU.
+# Pip doesn't process packages from requirements.txt in order but order is
+# important because tensorflow, torch, onnxruntime and micro-wake-word all
+# depend on CUDA packages at various versions. They need to be installed in
+# this specific order or they may not be able to use the GPU.
#
export PIP_PROGRESS_BAR=off
export PIP_NO_COLOR=1
@@ -117,7 +123,8 @@ pip_install() {
START_TS=$EPOCHSECONDS
echo " ===== Installing common requirements ====="
-pip_install -r "${PROGDIR}/requirements.txt"
+# requirements.txt lives in repo root now
+pip_install -r "${ROOTDIR}/requirements.txt"
${GPU} && tfgpu='[and-cuda]' || tfgpu=""
echo " ===== Installing Tensorflow${tfgpu} ====="
@@ -140,7 +147,7 @@ pip_install -e "${MWW}"
echo " ===== Checking piper-sample-generator ====="
PSG="${DATA_DIR}/tools/piper-sample-generator"
-if [ ! -d "${PSG}" ] || [ -n "$(git -C ${PSG} status --porcelain)" ] ; then
+if [ ! -d "${PSG}" ] || [ -n "$(git -C "${PSG}" status --porcelain)" ] ; then
rm -rf "${PSG}" || :
echo " Cloning piper-sample-generator to ${DATA_DIR}/tools"
git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null
@@ -171,13 +178,11 @@ echo " ===== Installing keras ====="
# keras 3.13 has "issues" so we need to back down to 3.12.
pip_install "keras==3.12.0"
-${PROGDIR}/test_python --data-dir="${DATA_DIR}"
+"${PROGDIR}/test_python" --data-dir="${DATA_DIR}"
touch .mww-data-dir
END_TS=$EPOCHSECONDS
echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell."
-print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
-
-
+print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete"
\ No newline at end of file
diff --git a/cli/setup_training_datasets b/cli/setup_training_datasets
index fc6e280..c343e95 100755
--- a/cli/setup_training_datasets
+++ b/cli/setup_training_datasets
@@ -1,8 +1,9 @@
#!/bin/bash
set -euo pipefail
-PROGPATH=$(realpath "$0")
-PROGDIR=$(dirname "${PROGPATH}")
+PROGPATH="$(realpath "$0")"
+PROGDIR="$(dirname "${PROGPATH}")"
+ROOTDIR="$(dirname "${PROGDIR}")" # repo root (train_wake_word, requirements.txt, etc.)
KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files )
source "${PROGDIR}/shell.functions"
@@ -27,22 +28,38 @@ EOF
exit 1
fi
+# Normalize + validate DATA_DIR (shell.functions typically sets a default,
+# but this makes the script standalone-safe)
+[ -n "${DATA_DIR:-}" ] && DATA_DIR="$(realpath "${DATA_DIR}")"
+[ -d "${DATA_DIR}" ] || {
+ echo "Data directory '${DATA_DIR}' doesn't exist." >&2
+ exit 1
+}
+
cd "${DATA_DIR}"
START_TS=$EPOCHSECONDS
echo -e "\n===== Setting up Training Datasets =====\n"
-${PROGDIR}/setup_negative_datasets --cleanup-archives=${CLEANUP_ARCHIVES} \
- --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
+"${PROGDIR}/setup_negative_datasets" \
+ --cleanup-archives="${CLEANUP_ARCHIVES}" \
+ --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
+ --data-dir="${DATA_DIR}"
-${PROGDIR}/setup_mit_audio --cleanup-archives=${CLEANUP_ARCHIVES} \
- --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
+"${PROGDIR}/setup_mit_audio" \
+ --cleanup-archives="${CLEANUP_ARCHIVES}" \
+ --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
+ --data-dir="${DATA_DIR}"
-${PROGDIR}/setup_audioset --cleanup-archives=${CLEANUP_ARCHIVES} \
- --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
+"${PROGDIR}/setup_audioset" \
+ --cleanup-archives="${CLEANUP_ARCHIVES}" \
+ --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
+ --data-dir="${DATA_DIR}"
-${PROGDIR}/setup_fma --cleanup-archives=${CLEANUP_ARCHIVES} \
- --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}"
+"${PROGDIR}/setup_fma" \
+ --cleanup-archives="${CLEANUP_ARCHIVES}" \
+ --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \
+ --data-dir="${DATA_DIR}"
-END_TS=$(date +%s.%N)
+END_TS=$EPOCHSECONDS
print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup"
diff --git a/cli/tensorboard1.png b/cli/tensorboard1.png
deleted file mode 100644
index a7741d9..0000000
Binary files a/cli/tensorboard1.png and /dev/null differ
diff --git a/cli/tensorboard2.png b/cli/tensorboard2.png
deleted file mode 100644
index 9042fdc..0000000
Binary files a/cli/tensorboard2.png and /dev/null differ
diff --git a/cli/tensorboard3.png b/cli/tensorboard3.png
deleted file mode 100644
index 6df0306..0000000
Binary files a/cli/tensorboard3.png and /dev/null differ
diff --git a/cli/wake_word_sample_augmenter b/cli/wake_word_sample_augmenter
old mode 100755
new mode 100644
diff --git a/cli/wake_word_sample_trainer b/cli/wake_word_sample_trainer
old mode 100755
new mode 100644
diff --git a/dockerfile b/dockerfile
index 52fa989..eb4694c 100644
--- a/dockerfile
+++ b/dockerfile
@@ -1,59 +1,37 @@
-# Standard Ubuntu base image. CUDA base images not needed.
-FROM ubuntu:22.04
+# Base
+FROM ubuntu:24.04
-ENV DEBIAN_FRONTEND=noninteractive \
- PYTHONUNBUFFERED=1 \
- PIP_NO_CACHE_DIR=1 \
- PIP_ROOT_USER_ACTION=ignore \
- HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
- XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/local/cuda" \
- PATH="/usr/local/cuda/bin:${PATH}" \
- LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
+ENV DEBIAN_FRONTEND=noninteractive
-# System deps (+dev headers for building C/C++ extensions)
+# System deps
RUN apt-get update && apt-get install -y --no-install-recommends \
- python3.10 python3.10-venv python3.10-distutils python3.10-dev python3-pip \
- git wget curl unzip ca-certificates git-lfs \
- build-essential g++ cmake \
- libsndfile1 libsndfile1-dev libffi-dev \
- ffmpeg \
- && rm -rf /var/lib/apt/lists/*
+ python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \
+ git wget curl unzip ca-certificates nano less \
+ && rm -rf /var/lib/apt/lists/* \
+ && mkdir -p /data
-# Use python3.10 everywhere
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \
- && update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
+# Recorder port
+EXPOSE 8789
-# ---- No cuDNN repo meddling needed if using TF 2.17.x ----
+# Script root
+WORKDIR /root/mww-scripts
-# Python deps
-# Order is important. onnxruntime, tensorflow and torch have
-# to be installed in the order below or their cuda dependencies
-# will conflict.
-COPY requirements.txt /tmp/requirements.txt
-RUN pip install --upgrade pip \
- && pip install "numpy==1.26.4" "cython>=0.29.36" \
- && pip install -r /tmp/requirements.txt \
- && pip install "onnxruntime-gpu[cuda]>=1.16.0" \
- && pip install "tensorflow[and-cuda]==2.18.0" \
- "tensorboard==2.18.0" \
- "tensorboard-data-server==0.7.2" \
- "tensorflow-io-gcs-filesystem==0.37.1" \
- && pip install \
- torch==2.7.1 \
- torchaudio==2.7.1 \
- --index-url https://download.pytorch.org/whl/cu128
+# Bash environment
+COPY --chown=root:root --chmod=0755 .bashrc /root/
-# Workspace + notebook fallback
-RUN mkdir -p /data
-WORKDIR /data
-COPY microWakeWord_training_notebook.ipynb /root/
+# Root-level entrypoints
+COPY --chown=root:root --chmod=0755 \
+ train_wake_word \
+ run_recorder.sh \
+ recorder_server.py \
+ requirements.txt \
+ /root/mww-scripts/
-# Startup script (copies default notebook if missing)
-COPY startup.sh /usr/local/bin/startup.sh
-RUN chmod +x /usr/local/bin/startup.sh
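+# CLI folder: copied as a subdirectory because the cli scripts locate the repo root (requirements.txt, train_wake_word) as their parent directory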
+COPY --chown=root:root cli/ /root/mww-scripts/cli/
-EXPOSE 8888
+# Static UI for recorder
+COPY --chown=root:root --chmod=0644 static/index.html /root/mww-scripts/static/index.html
-CMD ["/bin/bash", "-lc", "/usr/local/bin/startup.sh && \
- exec jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root \
- --ServerApp.token='' --ServerApp.password='' --ServerApp.root_dir=/data"]
+# recorder server
+CMD ["/bin/bash", "-lc", "/root/mww-scripts/run_recorder.sh"]
diff --git a/microWakeWord_training_notebook.ipynb b/microWakeWord_training_notebook.ipynb
deleted file mode 100644
index 19b54dc..0000000
--- a/microWakeWord_training_notebook.ipynb
+++ /dev/null
@@ -1,1073 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# š„ MicroWakeWord Trainer ā Tater Totterson Edition\n",
- "# ==================================================\n",
- "# Welcome, friend! š This notebook will help you train your very own wake word model.\n",
- "# Think of it like teaching Tater Totterson to recognize when you say a special word.\n",
- "#\n",
- "# By the end, you'll have:\n",
- "# ā A trained TensorFlow Lite model ready for on-device detection.\n",
- "# ā A matching JSON manifest you can drop straight into ESPHome.\n",
- "#\n",
- "# This flow is optimized for Python 3.10 and NVIDIA GPUs (but should work elsewhere too).\n",
- "# You can customize the wake word, play with training parameters, and experiment with\n",
- "# different datasets until you get something that feels just right. šŖ\n",
- "#\n",
- "# ā” Quick Tips:\n",
- "# ⢠Change TARGET_WORD below to whatever you want your wake word to be.\n",
- "# ⢠Rerun the notebook from the top if you change it (to regenerate everything).\n",
- "# ⢠Expect to experiment ā tweaking hyperparameters is part of the fun!\n",
- "#\n",
- "# When youāre done, youāll get two files:\n",
- "# 1ļøā£ .tflite ā your trained model.\n",
- "# 2ļøā£ .json ā a manifest for ESPHome integration.\n",
- "#\n",
- "# More info & examples:\n",
- "# š https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n",
- "\n",
- "# --- Set your wake word here ---\n",
- "TARGET_WORD = \"tater\" # š£ļø Change this to whatever phrase you want!\n",
- "print(f\"š„ Tater Totterson is listening for: '{TARGET_WORD}'\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "BFf6511E65ff"
- },
- "outputs": [],
- "source": [
- "import platform\n",
- "import sys\n",
- "import os\n",
- "\n",
- "# mac-only helper deps\n",
- "if platform.system() == \"Darwin\":\n",
- " !\"{sys.executable}\" -m pip install 'git+https://github.com/puddly/pymicro-features@puddly/minimum-cpp-version' --root-user-action=ignore\n",
- "\n",
- "!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' --root-user-action=ignore\n",
- "\n",
- "# š use the actual location in the container\n",
- "repo_path = \"/data/microWakeWord\"\n",
- "\n",
- "if not os.path.exists(repo_path):\n",
- " print(\"ā¬ļø Cloning microWakeWord repository to /dataā¦\")\n",
- " !git clone https://github.com/TaterTotterson/micro-wake-word.git {repo_path}\n",
- "\n",
- "# optional: pin to a commit\n",
- "# !cd /data/microWakeWord && git checkout ac6502bf48b5e372c47ed509f5f5ca181e6d50bb\n",
- "\n",
- "if os.path.exists(repo_path):\n",
- " print(\"š¦ Installing microWakeWord...\")\n",
- " !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n",
- "else:\n",
- " print(f\"ā Repository not found at {repo_path}. Clone might have failed.\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "BFf6511E65ff"
- },
- "outputs": [],
- "source": [
- "# --- GPU Check (Torch + ONNX Runtime) ---\n",
- "\n",
- "import torch\n",
- "import onnxruntime as ort\n",
- "\n",
- "print(\"š§ Torch CUDA Available:\", torch.cuda.is_available())\n",
- "if torch.cuda.is_available():\n",
- " print(\" ⢠Device count:\", torch.cuda.device_count())\n",
- " print(\" ⢠Current device:\", torch.cuda.current_device())\n",
- " print(\" ⢠Device name:\", torch.cuda.get_device_name(torch.cuda.current_device()))\n",
- "else:\n",
- " print(\"ā ļø Torch cannot see a GPU ā check Docker runtime (--gpus all) and nvidia-container-toolkit\")\n",
- "\n",
- "print(\"\\nš§ ONNX Runtime Providers:\")\n",
- "try:\n",
- " providers = ort.get_available_providers()\n",
- " print(\" ā¢\", providers)\n",
- " if \"CUDAExecutionProvider\" not in providers:\n",
- " print(\"ā ļø CUDAExecutionProvider not available ā ONNX will fall back to CPU.\")\n",
- "except Exception as e:\n",
- " print(\"ā ļø Could not query ONNX Runtime providers:\", e)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "dEluu7nL7ywd"
- },
- "outputs": [],
- "source": [
- "# NVIDIA Linux Docker: generate 1 sample of the target word (robust + CUDA check)\n",
- "\n",
- "import os, sys, shutil, subprocess, time, platform\n",
- "from pathlib import Path\n",
- "from IPython.display import Audio, display\n",
- "\n",
- "REPO_URL = \"https://github.com/rhasspy/piper-sample-generator\"\n",
- "REPO_DIR = Path.cwd() / \"piper-sample-generator\"\n",
- "MODELS_DIR = REPO_DIR / \"models\"\n",
- "MODEL_NAME = \"en_US-libritts_r-medium.pt\"\n",
- "MODEL_URL = f\"https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/{MODEL_NAME}\"\n",
- "AUDIO_OUT_DIR = Path.cwd() / \"generated_samples\"\n",
- "AUDIO_PATH = AUDIO_OUT_DIR / \"0.wav\"\n",
- "\n",
- "def run(cmd, check=True):\n",
- " print(\"ā\", \" \".join(cmd))\n",
- " proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)\n",
- " for line in proc.stdout:\n",
- " print(line, end=\"\")\n",
- " rc = proc.wait()\n",
- " if check and rc != 0:\n",
- " raise RuntimeError(f\"Command failed with exit code {rc}: {' '.join(cmd)}\")\n",
- " return rc\n",
- "\n",
- "def pip_install(*pkgs):\n",
- " run([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"], check=False)\n",
- " run([sys.executable, \"-m\", \"pip\", \"install\", *pkgs])\n",
- "\n",
- "def safe_clone(repo_url, branch=None, dest=REPO_DIR, retries=2):\n",
- " if dest.exists() and not (dest / \".git\").exists():\n",
- " print(\"ā ļø Found partial clone. Removingā¦\")\n",
- " shutil.rmtree(dest, ignore_errors=True)\n",
- " if not dest.exists():\n",
- " for i in range(retries + 1):\n",
- " try:\n",
- " cmd = [\"git\", \"clone\", \"--depth\", \"1\", repo_url, str(dest)]\n",
- " if branch:\n",
- " cmd = [\"git\", \"clone\", \"--depth\", \"1\", \"--branch\", branch, repo_url, str(dest)]\n",
- " run(cmd)\n",
- " break\n",
- " except Exception as e:\n",
- " if i == retries:\n",
- " raise\n",
- " print(f\"Clone failed ({i+1}/{retries+1}). Retrying in 2s⦠[{e}]\")\n",
- " time.sleep(2)\n",
- "\n",
- "def ensure_model():\n",
- " MODELS_DIR.mkdir(parents=True, exist_ok=True)\n",
- " mp = MODELS_DIR / MODEL_NAME\n",
- " if not mp.exists() or mp.stat().st_size == 0:\n",
- " import urllib.request\n",
- " print(f\"Downloading model to {mp} ā¦\")\n",
- " with urllib.request.urlopen(MODEL_URL) as r, open(mp, \"wb\") as f:\n",
- " shutil.copyfileobj(r, f)\n",
- " if mp.stat().st_size < 100 * 1024:\n",
- " raise RuntimeError(\"Downloaded model looks too small; download may have failed.\")\n",
- " print(f\"ā Model ready: {mp}\")\n",
- "\n",
- "# 1) Clone main repo (Linux/NVIDIA)\n",
- "print(\"Linux/NVIDIA detected ā using main piper-sample-generator repo.\")\n",
- "safe_clone(REPO_URL)\n",
- "\n",
- "# 2) Install deps\n",
- "# - piper-tts provides the `piper` module (required by generate_samples.py)\n",
- "# - piper-phonemize-cross does the phonemization\n",
- "# - onnxruntime-gpu enables CUDA (container must have NVIDIA runtime)\n",
- "deps = [\n",
- " \"piper-tts>=1.2.0\",\n",
- " \"piper-phonemize-cross==1.2.1\",\n",
- " \"soundfile\",\n",
- " \"numpy\",\n",
- " \"onnxruntime-gpu>=1.16.0\",\n",
- "]\n",
- "pip_install(*deps)\n",
- "\n",
- "# 3) Verify CUDA provider is available\n",
- "try:\n",
- " import onnxruntime as ort\n",
- " providers = ort.get_available_providers()\n",
- " print(f\"ONNX Runtime providers: {providers}\")\n",
- " if \"CUDAExecutionProvider\" not in providers:\n",
- " print(\"ā ļø CUDAExecutionProvider not available. \"\n",
- " \"The sample will still run on CPU, but check your NVIDIA container setup \"\n",
- " \"(nvidia-container-toolkit, runtime, and driver).\")\n",
- "except Exception as e:\n",
- " print(\"ā ļø Could not import onnxruntime to verify providers:\", e)\n",
- "\n",
- "# 4) Ensure model present\n",
- "ensure_model()\n",
- "\n",
- "# 5) Generate one sample\n",
- "AUDIO_OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
- "gen_script = REPO_DIR / \"generate_samples.py\"\n",
- "if not gen_script.exists():\n",
- " raise FileNotFoundError(f\"Missing generator: {gen_script}\")\n",
- "\n",
- "cmd = [\n",
- " sys.executable, str(gen_script),\n",
- " TARGET_WORD,\n",
- " \"--model\", str(MODELS_DIR / MODEL_NAME), # ā pass the generator .pt explicitly\n",
- " \"--max-samples\", \"1\",\n",
- " \"--batch-size\", \"1\",\n",
- " \"--output-dir\", str(AUDIO_OUT_DIR),\n",
- "]\n",
- "run(cmd)\n",
- "\n",
- "# 6) Play the audio (if the notebook frontend supports it)\n",
- "if AUDIO_PATH.exists():\n",
- " print(f\"š§ Playing {AUDIO_PATH}\")\n",
- " display(Audio(str(AUDIO_PATH), autoplay=True))\n",
- "else:\n",
- " print(f\"Audio file not found at {AUDIO_PATH}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "-SvGtCCM9akR"
- },
- "outputs": [],
- "source": [
- "# Generate a large number of wake word samples for training (with length-scale sweep)\n",
- "import sys, subprocess\n",
- "from pathlib import Path\n",
- "\n",
- "REPO_DIR = Path.cwd() / \"piper-sample-generator\"\n",
- "MODELS_DIR = REPO_DIR / \"models\"\n",
- "MODEL_NAME = \"en_US-libritts_r-medium.pt\"\n",
- "\n",
- "MAX_SAMPLES = 50000\n",
- "BATCH_SIZE = 100\n",
- "\n",
- "# Piper \"speed\" control via piper-sample-generator is length_scale(s)\n",
- "LENGTH_SCALES = [\"0.85\", \"0.95\", \"1.00\", \"1.05\", \"1.15\"]\n",
- "\n",
- "cmd = [\n",
- " sys.executable,\n",
- " str(REPO_DIR / \"generate_samples.py\"),\n",
- " TARGET_WORD,\n",
- " \"--model\", str(MODELS_DIR / MODEL_NAME),\n",
- " \"--max-samples\", str(MAX_SAMPLES),\n",
- " \"--batch-size\", str(BATCH_SIZE),\n",
- " \"--output-dir\", \"generated_samples\",\n",
- " \"--length-scales\", *LENGTH_SCALES,\n",
- "]\n",
- "\n",
- "print(\"ā\", \" \".join(cmd))\n",
- "subprocess.run(cmd, check=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "YJRG4Qvo9nXG"
- },
- "outputs": [],
- "source": [
- "# NVIDIA/Linux dataset prep to match the Apple behavior, but with pinned AudioSet\n",
- "# MIT RIR -> resample to 16 kHz\n",
- "# AudioSet -> fetch from a working HF revision, convert to 16 kHz mono, skip bad\n",
- "# FMA -> resample to 16 kHz mono\n",
- "\n",
- "import os, sys, subprocess, scipy.io.wavfile, numpy as np\n",
- "from pathlib import Path\n",
- "from tqdm import tqdm\n",
- "import soundfile as sf\n",
- "import librosa\n",
- "from datasets import load_dataset\n",
- "\n",
- "# -------------------------------------------------\n",
- "# small shell helpers (for curl/tar probing)\n",
- "# -------------------------------------------------\n",
- "def sh(cmd: str) -> int:\n",
- " return subprocess.call(cmd, shell=True)\n",
- "\n",
- "def curl(url: str, out: Path) -> int:\n",
- " # -L follow, -s silent, --fail to get nonzero on 404\n",
- " return subprocess.call(f\"curl -L -s --fail '{url}' -o '{out}'\", shell=True)\n",
- "\n",
- "def write_wav(dst: Path, data: np.ndarray, sr: int):\n",
- " x = np.clip(data, -1.0, 1.0)\n",
- " scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n",
- "\n",
- "# -----------------------------\n",
- "# MIT RIR (resample to 16 kHz)\n",
- "# -----------------------------\n",
- "print(\"=== MIT RIR ===\")\n",
- "rir_out = Path(\"mit_rirs\")\n",
- "rir_out.mkdir(exist_ok=True)\n",
- "if not any(rir_out.rglob(\"*.wav\")):\n",
- " ok = 0\n",
- " try:\n",
- " # Avoid datasets.Audio to keep TorchCodec out:\n",
- " # Use streaming=True + manual decode with librosa\n",
- " print(\"ā¬ļø MIT RIR (streaming + manual decode)ā¦\")\n",
- " ds = load_dataset(\n",
- " \"davidscripka/MIT_environmental_impulse_responses\",\n",
- " split=\"train\",\n",
- " streaming=True\n",
- " )\n",
- " for i, row in enumerate(tqdm(ds)):\n",
- " try:\n",
- " audio_path = row[\"audio\"][\"path\"]\n",
- " y, sr = librosa.load(audio_path, sr=16000, mono=True)\n",
- " write_wav(rir_out / f\"rir_{i:04d}.wav\", y, 16000)\n",
- " ok += 1\n",
- " except Exception:\n",
- " pass\n",
- " print(f\"ā MIT RIR saved: {ok} files\")\n",
- " except Exception as e:\n",
- " print(f\"ā ļø MIT RIR download failed: {e}\")\n",
- " # Fallback ZIP route\n",
- " try:\n",
- " print(\"ā¬ļø MIT RIR (fallback ZIP)ā¦\")\n",
- " zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n",
- " zip_path = rir_out.parent / \"MIT_RIR_Audio.zip\"\n",
- " if not zip_path.exists():\n",
- " os.system(f\"wget -q -O '{zip_path}' '{zip_url}'\")\n",
- " os.system(f'unzip -q -o \"{zip_path}\" -d \"{rir_out}\"')\n",
- " # Normalize to 16k mono\n",
- " for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n",
- " a, sr = sf.read(p, always_2d=False)\n",
- " if a.ndim > 1:\n",
- " a = a[:, 0]\n",
- " if sr != 16000:\n",
- " a, _ = librosa.load(p, sr=16000, mono=True)\n",
- " write_wav(p, a, 16000)\n",
- " print(\"ā MIT RIR fallback complete\")\n",
- " except Exception as e2:\n",
- " print(f\"ā MIT RIR fallback failed: {e2}\")\n",
- "else:\n",
- " print(\"ā mit_rirs exists; skipping.\")\n",
- "\n",
- "# ============================================================\n",
- "# AudioSet (pinned FLAC .tar ā 16k mono, skip bad files)\n",
- "# ============================================================\n",
- "print(\"\\n=== AudioSet subset (pinned FLAC .tar ā 16k mono) ===\")\n",
- "audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n",
- "audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n",
- "\n",
- "if any(audioset_out.rglob(\"*.wav\")):\n",
- " print(\"ā audioset_16k exists; skipping.\")\n",
- "else:\n",
- " # commits / refs we know about ā weāll probe them\n",
- " REV_CANDIDATES = [\n",
- " \"6762f044d1c88619c7f2006486036192128fb07e\",\n",
- " \"0049167e89f259a010c3f070fe3666d9e5242836\",\n",
- " \"ceb9eaaa7844c9ad7351e659c84a572e376ad06d\",\n",
- " \"main\", # last resort\n",
- " ]\n",
- " # possible folder layouts\n",
- " TAR_PATTERNS = [\n",
- " \"data/bal_train0{idx}.tar\",\n",
- " \"data/bal_train/bal_train0{idx}.tar\",\n",
- " ]\n",
- "\n",
- " def find_working_rev():\n",
- " for rev in REV_CANDIDATES:\n",
- " for pat in TAR_PATTERNS:\n",
- " probe = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{pat.format(idx=0)}\"\n",
- " rc = sh(f\"curl -I -L --fail -s '{probe}' > /dev/null\")\n",
- " if rc == 0:\n",
- " return rev, pat\n",
- " return None, None\n",
- "\n",
- " rev, pattern = find_working_rev()\n",
- " if rev is None:\n",
- " raise RuntimeError(\"Could not locate an AudioSet revision with FLAC tarballs still present on HF.\")\n",
- "\n",
- " print(f\"š Using AudioSet revision: {rev}\")\n",
- " print(f\"šļø Tar layout pattern: {pattern}\")\n",
- "\n",
- " # download + extract bal_train00..09\n",
- " for i in range(10):\n",
- " rel = pattern.format(idx=i)\n",
- " url = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{rel}\"\n",
- " fname = rel.split(\"/\")[-1]\n",
- " out_tar = audioset_dir / fname\n",
- " if not out_tar.exists():\n",
- " print(f\"ā¬ļø {fname}\")\n",
- " rc = curl(url, out_tar)\n",
- " if rc != 0:\n",
- " print(f\"ā ļø Could not fetch {fname} at rev {rev}; continuing.\")\n",
- " continue\n",
- " print(f\"š¦ Extract {fname}\")\n",
- " rc = sh(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n",
- " if rc != 0:\n",
- " print(f\"ā ļø tar extract failed for {fname}; continuing.\")\n",
- "\n",
- " # convert FLAC ā 16k mono WAV\n",
- " flacs = list(audioset_dir.rglob(\"*.flac\"))\n",
- " print(f\"š FLAC files: {len(flacs)}\")\n",
- " audioset_bad = []\n",
- " ok = 0\n",
- " for p in tqdm(flacs, desc=\"AudioSetāWAV (resample 16k mono)\"):\n",
- " try:\n",
- " y, _ = librosa.load(p, sr=16000, mono=True)\n",
- " if y.size == 0:\n",
- " raise ValueError(\"empty audio\")\n",
- " write_wav(audioset_out / (p.stem + \".wav\"), y, 16000)\n",
- " ok += 1\n",
- " except Exception as e:\n",
- " audioset_bad.append(f\"{p}:{e}\")\n",
- "\n",
- " if audioset_bad:\n",
- " (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(audioset_bad))\n",
- " print(f\"ā AudioSet complete ({ok} ok, {len(audioset_bad)} failed)\")\n",
- "\n",
- "# -----------------------------\n",
- "# FMA xsmall (resample to 16 kHz mono)\n",
- "# -----------------------------\n",
- "print(\"\\n=== FMA xsmall ===\")\n",
- "fma_zip_dir = Path(\"fma\"); fma_zip_dir.mkdir(exist_ok=True)\n",
- "fma_out = Path(\"fma_16k\"); fma_out.mkdir(exist_ok=True)\n",
- "\n",
- "zipname = \"fma_xs.zip\"\n",
- "zipurl = f\"https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/{zipname}\"\n",
- "zipout = fma_zip_dir / zipname\n",
- "if not zipout.exists():\n",
- " os.system(f\"wget -q -O '{zipout}' '{zipurl}'\")\n",
- " os.system(f\"cd fma && unzip -q '{zipname}'\")\n",
- "\n",
- "mp3s = list(Path(\"fma/fma_small\").rglob(\"*.mp3\"))\n",
- "print(f\"šµ FMA mp3 count: {len(mp3s)}\")\n",
- "corrupt = []\n",
- "for p in tqdm(mp3s, desc=\"FMAā16k WAV\"):\n",
- " try:\n",
- " y, sr = librosa.load(p, sr=16000, mono=True)\n",
- " if y.size == 0:\n",
- " raise ValueError(\"empty audio\")\n",
- " write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n",
- " except Exception as e:\n",
- " corrupt.append(f\"{p}:{e}\")\n",
- "if corrupt:\n",
- " Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n",
- "\n",
- "print(\"\\nā Dataset prep complete!\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "XW3bmbI5-JAz"
- },
- "outputs": [],
- "source": [
- "# Sets up the augmentations.\n",
- "# To improve your model, experiment with these settings and use more sources of\n",
- "# background clips.\n",
- "import sys, os\n",
- "from pathlib import Path\n",
- "\n",
- "# try the common places weāve used\n",
- "candidates = [\n",
- " \"/data/microWakeWord\", # what the last install log showed\n",
- " \"/data/microwakeword\", # lowercase variant\n",
- " \"./microwakeword\", # local clone\n",
- " \"./microWakeWord\", # camel case\n",
- "]\n",
- "\n",
- "for base in candidates:\n",
- " if os.path.isdir(base):\n",
- " # add the repo root\n",
- " sys.path.insert(0, base)\n",
- " # add the actual package dir inside the repo\n",
- " if os.path.isdir(os.path.join(base, \"microwakeword\")):\n",
- " sys.path.insert(0, os.path.join(base, \"microwakeword\"))\n",
- " break\n",
- "from microwakeword.audio.augmentation import Augmentation\n",
- "from microwakeword.audio.clips import Clips\n",
- "from microwakeword.audio.spectrograms import SpectrogramGeneration\n",
- "\n",
- "def validate_directories(paths):\n",
- " for path in paths:\n",
- " if not os.path.exists(path):\n",
- " print(f\"Error: Directory {path} does not exist. Please ensure preprocessing is complete.\")\n",
- " return False\n",
- " return True\n",
- "\n",
- "# Paths to augmented data\n",
- "impulse_paths = ['mit_rirs']\n",
- "background_paths = ['fma_16k', 'audioset_16k']\n",
- "\n",
- "if not validate_directories(impulse_paths + background_paths):\n",
- " raise ValueError(\"One or more required directories are missing.\")\n",
- "\n",
- "# Process TTS generated samples (default)\n",
- "clips_tts = Clips(\n",
- " input_directory='./generated_samples',\n",
- " file_pattern='*.wav',\n",
- " max_clip_duration_s=5,\n",
- " remove_silence=True,\n",
- " random_split_seed=10,\n",
- " split_count=0.1,\n",
- ")\n",
- "\n",
- "# Process personal recordings if available (optional)\n",
- "clips_personal = None\n",
- "if os.path.exists(\"./personal_samples\") and any(Path(\"./personal_samples\").glob(\"*.wav\")):\n",
- " clips_personal = Clips(\n",
- " input_directory=\"./personal_samples\",\n",
- " file_pattern=\"*.wav\",\n",
- " max_clip_duration_s=5,\n",
- " remove_silence=True,\n",
- " random_split_seed=10,\n",
- " split_count=0.1,\n",
- " )\n",
- " print(\"ā Found personal samples, will create separate feature set\")\n",
- "\n",
- "augmenter = Augmentation(\n",
- " augmentation_duration_s=3.2,\n",
- " augmentation_probabilities={\n",
- " \"SevenBandParametricEQ\": 0.1,\n",
- " \"TanhDistortion\": 0.05,\n",
- " \"PitchShift\": 0.15,\n",
- " \"BandStopFilter\": 0.1,\n",
- " \"AddColorNoise\": 0.1,\n",
- " \"AddBackgroundNoise\": 0.7,\n",
- " \"Gain\": 0.8,\n",
- " \"RIR\": 0.7,\n",
- " },\n",
- " impulse_paths=impulse_paths,\n",
- " background_paths=background_paths,\n",
- " background_min_snr_db=5,\n",
- " background_max_snr_db=10,\n",
- " min_jitter_s=0.2,\n",
- " max_jitter_s=0.3,\n",
- ")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "V5UsJfKKD1k9"
- },
- "outputs": [],
- "source": [
- "# Augment a random generated-sample WAV and play it back (pass ndarray to augmenter)\n",
- "from pathlib import Path\n",
- "from IPython.display import Audio, display\n",
- "import numpy as np\n",
- "import soundfile as sf\n",
- "import librosa, random, glob\n",
- "\n",
- "output_dir = Path(\"./augmented_clips\")\n",
- "output_dir.mkdir(exist_ok=True)\n",
- "\n",
- "# 1) Pick a random WAV from the Piper outputs\n",
- "candidates = glob.glob(\"generated_samples/*.wav\")\n",
- "if not candidates:\n",
- " raise SystemExit(\"No files in generated_samples/. Run the TTS sample cell first.\")\n",
- "src_path = random.choice(candidates)\n",
- "\n",
- "# 2) Load as 16 kHz mono float32\n",
- "y, sr = librosa.load(src_path, sr=16000, mono=True)\n",
- "y = y.astype(np.float32, copy=False)\n",
- "\n",
- "# 3) Augment ā microwakeword Augmentation expects a 1-D numpy array\n",
- "try:\n",
- " y_aug = augmenter.augment_clip(y)\n",
- "except Exception as e:\n",
- " # some versions accept (samples, sr) ā try that as a fallback\n",
- " try:\n",
- " y_aug = augmenter.augment_clip((y, sr))\n",
- " except Exception:\n",
- " raise\n",
- "\n",
- "# 4) Save and play\n",
- "out_path = output_dir / \"augmented_clip.wav\"\n",
- "sf.write(str(out_path), y_aug.astype(np.float32, copy=False), sr, subtype=\"PCM_16\")\n",
- "print(f\"Augmented clip saved to {out_path}\")\n",
- "display(Audio(str(out_path), autoplay=True))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "D7BHcY1mEGbK"
- },
- "outputs": [],
- "source": [
- "# Augment samples and save the training, validation, and testing sets.\n",
- "# This version avoids datasets.Audio entirely by driving Clips from local WAVs.\n",
- "\n",
- "import os, glob, random\n",
- "from pathlib import Path\n",
- "import types\n",
- "import numpy as np\n",
- "import librosa\n",
- "from mmap_ninja.ragged import RaggedMmap\n",
- "from microwakeword.audio.spectrograms import SpectrogramGeneration\n",
- "\n",
- "# ---- Patch: drive clips from generated_samples/*.wav (no datasets.Audio, no torchcodec) ----\n",
- "def audio_generator_from_wavs(self, split=\"train\", repeat=1, source_dir=\"generated_samples\"):\n",
- " \"\"\"\n",
- " Yield 1-D float32 arrays loaded via librosa from source_dir/*.wav.\n",
- " Deterministic 80/10/10 split with seed 10 to mirror original Clips behavior.\n",
- " \"\"\"\n",
- " files = sorted(glob.glob(f\"{source_dir}/*.wav\"))\n",
- " if not files:\n",
- " raise SystemExit(f\"ā No WAVs in {source_dir}/. Generate samples first.\")\n",
- "\n",
- " rng = random.Random(10) # deterministic shuffling like Clips(random_split_seed=10)\n",
- " files_shuf = files[:]\n",
- " rng.shuffle(files_shuf)\n",
- "\n",
- " n = len(files_shuf)\n",
- " n_val = max(1, int(0.10 * n))\n",
- " n_test = max(1, int(0.10 * n))\n",
- " n_train = max(0, n - n_val - n_test)\n",
- " splits = {\n",
- " \"train\": files_shuf[:n_train],\n",
- " \"validation\": files_shuf[n_train:n_train + n_val],\n",
- " \"test\": files_shuf[n_train + n_val:],\n",
- " }\n",
- " file_list = splits.get(split, [])\n",
- " if not file_list:\n",
- " return # nothing to yield\n",
- "\n",
- " for _ in range(max(1, int(repeat))):\n",
- " for p in file_list:\n",
- " y, sr = librosa.load(p, sr=16000, mono=True)\n",
- " yield y.astype(np.float32, copy=False)\n",
- "\n",
- "# Bind the patched generator to clips_tts instance\n",
- "def audio_generator_tts(self, split=\"train\", repeat=1):\n",
- " return audio_generator_from_wavs(self, split, repeat, \"generated_samples\")\n",
- "\n",
- "clips_tts.audio_generator = types.MethodType(audio_generator_tts, clips_tts)\n",
- "print(\"ā Patched clips_tts.audio_generator to stream from generated_samples/*.wav (no torchcodec).\")\n",
- "\n",
- "# Bind the patched generator to clips_personal if it exists\n",
- "if clips_personal is not None:\n",
- " def audio_generator_personal(self, split=\"train\", repeat=1):\n",
- " return audio_generator_from_wavs(self, split, repeat, \"personal_samples\")\n",
- " clips_personal.audio_generator = types.MethodType(audio_generator_personal, clips_personal)\n",
- " print(\"ā Patched clips_personal.audio_generator to stream from personal_samples/*.wav (no torchcodec).\")\n",
- "\n",
- "# ---- Validate augmentation asset folders exist ----\n",
- "def validate(paths):\n",
- " for p in paths:\n",
- " if not Path(p).exists():\n",
- " raise SystemExit(f\"ā Missing directory: {p}. Run dataset prep first.\")\n",
- "\n",
- "impulse_paths = [\"mit_rirs\"]\n",
- "background_paths = [\"fma_16k\", \"audioset_16k\"]\n",
- "validate(impulse_paths + background_paths)\n",
- "\n",
- "# ---- Output root ----\n",
- "out_root = Path(\"generated_augmented_features\")\n",
- "out_root.mkdir(exist_ok=True)\n",
- "\n",
- "# ---- Split config (same as before) ----\n",
- "split_cfg = {\n",
- " \"training\": {\"name\": \"train\", \"repetition\": 2, \"slide_frames\": 10},\n",
- " \"validation\": {\"name\": \"validation\", \"repetition\": 1, \"slide_frames\": 10},\n",
- " \"testing\": {\"name\": \"test\", \"repetition\": 1, \"slide_frames\": 1},\n",
- "}\n",
- "\n",
- "# ---- Generate features for TTS samples ----\n",
- "for split, cfg in split_cfg.items():\n",
- " out_dir = out_root / split\n",
- " out_dir.mkdir(parents=True, exist_ok=True)\n",
- " print(f\"š§Ŗ Processing {split} (TTS) ā¦\")\n",
- "\n",
- " spectros = SpectrogramGeneration(\n",
- " clips=clips_tts, # now backed by our WAV loader\n",
- " augmenter=augmenter, # your existing augmenter\n",
- " slide_frames=cfg[\"slide_frames\"],\n",
- " step_ms=10,\n",
- " )\n",
- "\n",
- " RaggedMmap.from_generator(\n",
- " out_dir=str(out_dir / \"wakeword_mmap\"),\n",
- " sample_generator=spectros.spectrogram_generator(\n",
- " split=cfg[\"name\"], repeat=cfg[\"repetition\"]\n",
- " ),\n",
- " batch_size=100,\n",
- " verbose=True,\n",
- " )\n",
- "\n",
- "# ---- Generate features for personal samples if available ----\n",
- "if clips_personal is not None:\n",
- " out_root_personal = Path(\"personal_augmented_features\")\n",
- " out_root_personal.mkdir(exist_ok=True)\n",
- " for split, cfg in split_cfg.items():\n",
- " out_dir = out_root_personal / split\n",
- " out_dir.mkdir(parents=True, exist_ok=True)\n",
- " print(f\"š§Ŗ Processing {split} (personal) ā¦\")\n",
- " spectros = SpectrogramGeneration(\n",
- " clips=clips_personal,\n",
- " augmenter=augmenter,\n",
- " slide_frames=cfg[\"slide_frames\"],\n",
- " step_ms=10,\n",
- " )\n",
- " RaggedMmap.from_generator(\n",
- " out_dir=str(out_dir / \"wakeword_mmap\"),\n",
- " sample_generator=spectros.spectrogram_generator(split=cfg[\"name\"], repeat=cfg[\"repetition\"]),\n",
- " batch_size=100,\n",
- " verbose=True,\n",
- " )\n",
- "\n",
- "print(\"ā Features ready (generated_augmented_features/*/wakeword_mmap)\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "1pGuJDPyp3ax"
- },
- "outputs": [],
- "source": [
- "# Downloads pre-generated spectrogram features (made for microWakeWord in\n",
- "# particular) for various negative datasets. This can be slow!\n",
- "\n",
- "import os\n",
- "import requests\n",
- "import zipfile\n",
- "from pathlib import Path\n",
- "from tqdm import tqdm\n",
- "\n",
- "# Function to download a file with progress bar\n",
- "def download_file(url, output_path):\n",
- " response = requests.get(url, stream=True)\n",
- " total_size = int(response.headers.get('content-length', 0))\n",
- " with open(output_path, \"wb\") as f, tqdm(\n",
- " desc=f\"Downloading {output_path.name}\",\n",
- " total=total_size,\n",
- " unit=\"B\",\n",
- " unit_scale=True,\n",
- " unit_divisor=1024,\n",
- " ) as bar:\n",
- " for chunk in response.iter_content(chunk_size=1024):\n",
- " f.write(chunk)\n",
- " bar.update(len(chunk))\n",
- " print(f\"Downloaded: {output_path}\")\n",
- "\n",
- "# Function to extract ZIP files\n",
- "def extract_zip(zip_path, extract_to):\n",
- " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
- " zip_ref.extractall(extract_to)\n",
- " print(f\"Extracted: {zip_path} to {extract_to}\")\n",
- "\n",
- "# Directory for negative datasets\n",
- "output_dir = Path('./negative_datasets')\n",
- "output_dir.mkdir(exist_ok=True)\n",
- "\n",
- "# Negative dataset URLs\n",
- "link_root = \"https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main/\"\n",
- "filenames = ['dinner_party.zip', 'dinner_party_eval.zip', 'no_speech.zip', 'speech.zip']\n",
- "\n",
- "# Download and extract files\n",
- "for fname in filenames:\n",
- " link = link_root + fname\n",
- " zip_path = output_dir / fname\n",
- "\n",
- " # Download only if the file doesn't already exist\n",
- " if not zip_path.exists():\n",
- " try:\n",
- " download_file(link, zip_path)\n",
- " except Exception as e:\n",
- " print(f\"Error downloading {fname}: {e}\")\n",
- " continue\n",
- "\n",
- " # Extract the ZIP file\n",
- " try:\n",
- " extract_zip(zip_path, output_dir)\n",
- " except Exception as e:\n",
- " print(f\"Error extracting {fname}: {e}\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Ii1A14GsGVQT"
- },
- "outputs": [],
- "source": [
- "# --- Save a yaml config that controls the training process ---\n",
- "\n",
- "import os, sys, yaml\n",
- "from pathlib import Path\n",
- "\n",
- "config = {}\n",
- "\n",
- "config[\"window_step_ms\"] = 10\n",
- "config[\"train_dir\"] = \"trained_models/wakeword\"\n",
- "\n",
- "config[\"features\"] = [\n",
- " {\"features_dir\":\"generated_augmented_features\",\"sampling_weight\":2.0,\"penalty_weight\":1.0,\"truth\":True,\"truncation_strategy\":\"truncate_start\",\"type\":\"mmap\"},\n",
- " {\"features_dir\":\"negative_datasets/speech\",\"sampling_weight\":12.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n",
- " {\"features_dir\":\"negative_datasets/dinner_party\",\"sampling_weight\":12.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n",
- " {\"features_dir\":\"negative_datasets/no_speech\",\"sampling_weight\":5.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n",
- " {\"features_dir\":\"negative_datasets/dinner_party_eval\",\"sampling_weight\":0.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"split\",\"type\":\"mmap\"},\n",
- "]\n",
- "\n",
- "# Add personal features if they exist\n",
- "if os.path.exists(\"personal_augmented_features/training\"):\n",
- " config[\"features\"].insert(1, {\"features_dir\": \"personal_augmented_features\", \"sampling_weight\": 3.0, \"penalty_weight\": 1.0, \"truth\": True, \"truncation_strategy\": \"truncate_start\", \"type\": \"mmap\"})\n",
- " print(\"ā Added personal features with higher weight (3.0)\")\n",
- "\n",
- "config[\"training_steps\"] = [40000]\n",
- "config[\"positive_class_weight\"] = [1]\n",
- "config[\"negative_class_weight\"] = [20]\n",
- "config[\"learning_rates\"] = [0.001]\n",
- "\n",
- "# Smaller batch to avoid GPU copy/alloc failures on 3070 laptop VRAM\n",
- "config[\"batch_size\"] = 16\n",
- "\n",
- "# SpecAugment off (as before)\n",
- "config[\"time_mask_max_size\"] = [0]\n",
- "config[\"time_mask_count\"] = [0]\n",
- "config[\"freq_mask_max_size\"] = [0]\n",
- "config[\"freq_mask_count\"] = [0]\n",
- "\n",
- "config[\"eval_step_interval\"] = 500\n",
- "config[\"clip_duration_ms\"] = 1500\n",
- "config[\"target_minimization\"] = 0.9\n",
- "config[\"minimization_metric\"] = None\n",
- "config[\"maximization_metric\"] = \"average_viable_recall\"\n",
- "\n",
- "with open(\"training_parameters.yaml\", \"w\") as f:\n",
- " yaml.dump(config, f)\n",
- "\n",
- "print(\"ā Wrote training_parameters.yaml (batch_size=16)\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "WoEXJBaiC9mf"
- },
- "outputs": [],
- "source": [
- "# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n",
- "# (LIVE streaming output + full log capture for error detection)\n",
- "# NOTE: Suppress ONLY the noisy \"Validation Batch #...\" lines (everything else still streams)\n",
- "import os, sys, subprocess, textwrap\n",
- "\n",
- "# ---- Common TF env (applies to BOTH attempts) ----\n",
- "base_env = os.environ.copy()\n",
- "base_env.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\")\n",
- "base_env.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\") # disable XLA JIT (more stable mem)\n",
- "base_env.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\") # allow TF32 (perf/VRAM win on Ampere+)\n",
- "\n",
- "# These only matter when a GPU is visible:\n",
- "base_env.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\")\n",
- "base_env.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n",
- "# Optional (uncomment if you want a smaller cuDNN workspace):\n",
- "# base_env.setdefault(\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\", \"256\")\n",
- "\n",
- "# ---- Training argv (same as your runpy args) ----\n",
- "train_args = [\n",
- " \"-m\", \"microwakeword.model_train_eval\",\n",
- " \"--training_config\", \"training_parameters.yaml\",\n",
- " \"--train\", \"1\",\n",
- " \"--restore_checkpoint\", \"1\",\n",
- " \"--test_tf_nonstreaming\", \"0\",\n",
- " \"--test_tflite_nonstreaming\", \"0\",\n",
- " \"--test_tflite_nonstreaming_quantized\", \"0\",\n",
- " \"--test_tflite_streaming\", \"0\",\n",
- " \"--test_tflite_streaming_quantized\", \"1\",\n",
- " \"--use_weights\", \"best_weights\",\n",
- " \"mixednet\",\n",
- " \"--pointwise_filters\", \"64,64,64,64\",\n",
- " \"--repeat_in_block\", \"1,1,1,1\",\n",
- " \"--mixconv_kernel_sizes\", \"[5], [7,11], [9,15], [23]\",\n",
- " \"--residual_connection\", \"0,0,0,0\",\n",
- " \"--first_conv_filters\", \"32\",\n",
- " \"--first_conv_kernel_size\", \"5\",\n",
- " \"--stride\", \"2\",\n",
- "]\n",
- "\n",
- "OOM_MARKERS = (\n",
- " \"resourceexhaustederror\",\n",
- " \"resource exhausted\",\n",
- " \"oom\",\n",
- " \"out of memory\",\n",
- " \"cuda_error_out_of_memory\",\n",
- " \"cudnn\",\n",
- " \"failed to allocate\",\n",
- " \"blas xgemm\",\n",
- " \"cublas\",\n",
- " \"internalerror: cuda\",\n",
- " \"failed call to cuinit\",\n",
- ")\n",
- "\n",
- "class RunResult:\n",
- " def __init__(self, returncode: int, stdout: str):\n",
- " self.returncode = returncode\n",
- " self.stdout = stdout\n",
- "\n",
- "def run_training(label: str, extra_env: dict) -> RunResult:\n",
- " env = base_env.copy()\n",
- " env.update(extra_env or {})\n",
- "\n",
- " print(f\"\\nš {label}\")\n",
- " print(\"ā\", \" \".join([sys.executable] + train_args))\n",
- "\n",
- " proc = subprocess.Popen(\n",
- " [sys.executable] + train_args,\n",
- " env=env,\n",
- " text=True,\n",
- " stdout=subprocess.PIPE,\n",
- " stderr=subprocess.STDOUT,\n",
- " bufsize=1, # line-buffered (best effort)\n",
- " universal_newlines=True,\n",
- " )\n",
- "\n",
- " full_log = []\n",
- " try:\n",
- " # Stream lines live AND capture them for OOM detection / error messages\n",
- " assert proc.stdout is not None\n",
- " for line in proc.stdout:\n",
- " full_log.append(line)\n",
- "\n",
- " # Hide ONLY the per-minibatch validation spam\n",
- " if line.startswith(\"Validation Batch #\"):\n",
- " continue\n",
- "\n",
- " # Everything else streams live\n",
- " print(line, end=\"\")\n",
- " finally:\n",
- " returncode = proc.wait()\n",
- "\n",
- " return RunResult(returncode, \"\".join(full_log))\n",
- "\n",
- "# Attempt 1: GPU (normal visibility)\n",
- "cp = run_training(\n",
- " \"Attempt 1/2: GPU training (with allow_growth + cuda_malloc_async)\",\n",
- " extra_env={}, # no override\n",
- ")\n",
- "\n",
- "if cp.returncode == 0:\n",
- " print(\"ā Training and testing complete (GPU path).\")\n",
- "else:\n",
- " out_l = (cp.stdout or \"\").lower()\n",
- " looks_like_gpu_oom = any(m in out_l for m in OOM_MARKERS)\n",
- "\n",
- " if looks_like_gpu_oom:\n",
- " # Attempt 2: CPU fallback (hide GPUs completely)\n",
- " cp2 = run_training(\n",
- " \"Attempt 2/2: CPU fallback (GPU hidden via CUDA_VISIBLE_DEVICES='')\",\n",
- " extra_env={\n",
- " \"CUDA_VISIBLE_DEVICES\": \"\", # hard-disable GPU\n",
- " # (Optional) makes TF less chatty about GPU init on some builds:\n",
- " \"TF_CPP_MIN_LOG_LEVEL\": \"2\",\n",
- " },\n",
- " )\n",
- " if cp2.returncode == 0:\n",
- " print(\"ā Training and testing complete (CPU fallback).\")\n",
- " else:\n",
- " raise RuntimeError(\n",
- " \"Training failed on BOTH GPU and CPU.\\n\\n\"\n",
- " + textwrap.indent(cp2.stdout or \"(no output)\", prefix=\" \")\n",
- " )\n",
- " else:\n",
- " # Not an OOM-style failure: surface the original error\n",
- " raise RuntimeError(\n",
- " \"Training failed (does not look like a VRAM/OOM issue).\\n\\n\"\n",
- " + textwrap.indent(cp.stdout or \"(no output)\", prefix=\" \")\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ex_UIWvwtjAN"
- },
- "outputs": [],
- "source": [
- "import shutil\n",
- "import json\n",
- "from IPython.display import display, HTML\n",
- "\n",
- "# Use the wake word from Cell 3\n",
- "wake_word = TARGET_WORD\n",
- "\n",
- "# --- Copy TFLite file to working dir with wake word name ---\n",
- "source_path = \"trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite\"\n",
- "tflite_filename = f\"{wake_word}.tflite\"\n",
- "tflite_path = f\"./{tflite_filename}\"\n",
- "shutil.copy(source_path, tflite_path)\n",
- "\n",
- "# --- Write JSON metadata file with matching model name ---\n",
- "json_data = {\n",
- " \"type\": \"micro\",\n",
- " \"wake_word\": wake_word,\n",
- " \"author\": \"Tater Totterson\",\n",
- " \"website\": \"https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git\",\n",
- " \"model\": tflite_filename,\n",
- " \"trained_languages\": [\"en\"],\n",
- " \"version\": 2,\n",
- " \"micro\": {\n",
- " \"probability_cutoff\": 0.97,\n",
- " \"sliding_window_size\": 5,\n",
- " \"feature_step_size\": 10,\n",
- " \"tensor_arena_size\": 30000,\n",
- " \"minimum_esphome_version\": \"2024.7.0\"\n",
- " }\n",
- "}\n",
- "json_filename = f\"{wake_word}.json\"\n",
- "json_path = f\"./{json_filename}\"\n",
- "with open(json_path, \"w\") as json_file:\n",
- " json.dump(json_data, json_file, indent=2)\n",
- "\n",
- "# --- Display nice download links ---\n",
- "html = f\"\"\"\n",
- "