diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..545370a Binary files /dev/null and b/.DS_Store differ diff --git a/cli/.bashrc b/.bashrc similarity index 100% rename from cli/.bashrc rename to .bashrc diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/README.md b/README.md index 75eefd8..359d73d 100644 --- a/README.md +++ b/README.md @@ -1,123 +1,507 @@ -
- MicroWakeWord Trainer Logo -

microWakeWord Trainer Docker

-
+# Run training from the command line -# 🥔 MicroWakeWord Trainer – Tater Approved +## Overview -**✅ Tater Totterson tested & working on an NVIDIA RTX 3070 Laptop GPU (8 GB VRAM).** -Easily train microWakeWord detection models with this pre-built Docker image and JupyterLab notebook. +With these scripts and Dockerfile, you can train new wake words from the +command line without using a Jupyter notebook. ---- +Differences between this Docker image and the Jupyter notebook image: -## 🚀 Quick Start +* The Python training environment isn't included in the image. Instead, a + "virtual environment" (venv) is created in the `/data` directory which you + will have mounted to a host directory. This cuts about 7gb from the image + and allows the virtualenv to persist across container instances. -Follow these steps to get up and running: +* The logic from the Jupyter notebook is contained in individual Python + and shell scripts. -### 1️⃣ Pull the Pre-Built Docker Image +* No ports need to be exposed since the Jupyter notebook server isn't being + run. -```bash -docker pull ghcr.io/tatertotterson/microwakeword:latest +## TL;DR + +For the impatient among you... + +```shell +$ mkdir /some/work/directory # On a device with more than 150GB free space +$ docker build -t microwakeword-cli:latest . +$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest +root@mww-cli:/# cd /data +root@mww-cli:/data# setup_python_venv +##### You have about 4 minutes to drink coffee + +root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files +##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection) + +root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word" +##### You have about 30-45 minutes for a nap depending on available system resources. +##### You'll be informed of where to find your trained model. 
``` ---- +Load the trained model on your device and give it a try but don't be surprised +if you get a lot of missed or false activations. Read on to find out why. -### 2️⃣ Run the Docker Container +## Get Started -```bash -docker run --rm -it \ - --gpus all \ - -p 8888:8888 \ - -v $(pwd):/data \ - ghcr.io/tatertotterson/microwakeword:latest +Good, you stuck around! Now read the rest of the document before doing +anything. + +### Using a GPU + +Having an Nvidia GPU available can cut the training time by up to half. The +open-source nouveau driver shipped with Linux kernels doesn't support CUDA +however so if you have an Nvidia GPU and want to use it for training, you'll +need to install the official Nvidia driver from +https://www.nvidia.com/en-in/drivers/unix/ + +### Build the image + +You can use either Docker or Podman as your container management tool. +`docker` is used in the examples but if you have podman, just substitute +the command. + +Start by navigating to the directory that contains this README file and +the accompanying Dockerfile. Then... + + +```shell +docker build -t microwakeword-cli:latest . ``` -**What these flags do:** -- `--gpus all` → Enables GPU acceleration -- `-p 8888:8888` → Exposes JupyterLab on port 8888 -- `-v $(pwd):/data` → Saves your work in the current folder +This should be fairly quick and result in an image that's about 320mb in size +as it's basically a standard Ubuntu 24.04 image with a few added tools. ---- +So why isn't a pre-built image available for download? Because it'll probably +take longer to download a pre-built image than for you to create it locally. +GitHub's container registry is notoriously erratic when it comes to download +throughput. -### 3️⃣ Open JupyterLab -Visit [http://localhost:8888](http://localhost:8888) in your browser — the notebook UI will open. 
+This directory will contain the Python virtual environment plus all of the +downloaded and generated data needed for training and the final trained +models. A full environment will need about 150gb of free space but read +further to see how to reduce this. ---- +Your `` will be mounted inside the container as `/data`. -### 4ļøāƒ£ Set Your Wake Word +The training container will start a Bash shell so if you have Bash +aliases or Bashy things you like, create a `.bashrc` file in your +`` and put them in there. It'll automatically be included +any time you enter the container. -At the **top of the notebook**, find this line: +### Create and start the container -```bash -TARGET_WORD = "hey_tater" # Change this to your desired wake word +There are lots of options that control container creation. The simplest example +will create the container and give you an interactive shell. When you exit the +shell, the container will be stopped and removed leaving your `` +intact. + +```shell +$ docker run -it --rm --gpus=all -v :/data microwakeword-cli:latest ``` -Change `"hey_tater"` to your desired wake word (phonetic spellings often work best). +Options: ---- +* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it. +* Remove the `--rm` and add a `--name=mww-cli` option to keep the container + around and give it a name for training more than one wake word. You + can stop and remove it when you're ready. +* Add a `-d` option to start the container in the background and use `docker + attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it. -### 5ļøāƒ£ Run the Notebook +When the container starts, you'll see: -Run all cells in the notebook. This process will: -- Generate wake word samples -- Train a detection model -- Output a quantized `.tflite` model ready for on-device use +```text +======================================================= +WARNING: A python virtual environment wasn't found +at /data/.venv. 
You'll need to run setup_python_venv +before you'll be able to use this container for +training. +======================================================= +root@mww-cli:/# +``` ---- +Don't worry about the python WARNING right now. You'll be creating the +virtualenv in the next step. -### 6️⃣ Retrieve the Trained Model & JSON +If you've forgotten to create and/or mount your host data directory, you'll +see an additional warning: -When training finishes, download links for both the `.tflite` model and its `.json` manifest will be displayed in the last cell. +```text +======================================================= +WARNING: The /data directory is NOT mounted. +Running the training process without /data mounted +could add over 140Gb of python packages and training +files to this container's storage which is probably +NOT what you want. ---- +You should remove this container and re-create it with +a 'docker run' option like '-v :/data' +making sure the host directory is on a device that has +enough free space. +======================================================= +``` -## 🔄 Resetting to a Clean State +You can certainly continue but it's a "really bad idea"™ because your +container storage could grow from a few hundred mb to over 140gb. -If you need to start fresh: +At this point, you're in a Bash shell. -1. Delete the `data` folder that was mapped to your Docker container. -2. Restart the container using the steps above. -3. A fresh copy of the notebook will be placed into the `data` directory. +### Create the Python virtual environment ---- +The Python virtual environment will contain all the software needed to train. +It gets created as `/data/.venv` and will take up about 11gb of disk space. 
-## šŸŽ¤ Optional: Personal Voice Samples +The scripts that do all the work will be in the container's PATH so to setup +the virtual environment and install all of the packages, just run: -In addition to synthetic TTS samples, the trainer can optionally use your own real voice recordings to significantly improve accuracy for your voice and environment. +```text +setup_python_venv [ --verbose ] -### How it works -- If a folder named personal_samples/ exists and contains .wav files, the trainer will: - - Automatically extract features from those recordings - - Include them during training alongside the synthetic TTS data - - Up-weight your personal samples during training for better real-world performance +Options: -No extra flags or configuration are required — it is detected automatically. +--verbose: Print the detailed "pip install" output. -### How to use it -1. Create a folder in the repo root: - mkdir personal_samples +``` -2. Record yourself saying the wake word naturally and save the files as .wav: - personal_samples/ - hey_tater_01.wav - hey_tater_02.wav - hey_tater_03.wav - ... +When the installation is finished, a test of the major components will be +run. + +Once the process is done, you should change to the `/data` directory and +activate the virtual environment with: + +```shell +root@mww-cli:/# cd /data +root@mww-cli:/data# source .venv/bin/activate +(.venv) root@mww-cli:/data# +``` + +Technically, you don't need to do either of these since the scripts +are in the PATH and they know to use the `/data` directory for everything. +It's more of an "if you're interested" thing. + +At this point, you have a container with all software installed. + +## Get the reference data + +The training process itself relies on a significant amount of audio reference +data that creates a simulated "audio environment" that your wake word will be +trained in. 
These "training datasets" include things like varying amounts of +reverberation, background music, background conversations, background noise, +etc. All said and done, it amounts to about 30gb of audio but with the +downloaded archives and extracted intermediate files, you'll need about 85gb +of free space. Thankfully, you only need to download the files once no +matter how many wake words you want to train and since it's stored in +`/data`, you can even remove the docker container and recreate it without +losing any of it. There are 4 datasets that are required. + +This is a three-stage process... + +1. Download zipfiles or tarballs. (about 30gb) +2. Extract them. (about 50gb) +3. Convert them into the final form. (about 31gb) + +NOTE: The sizes add up to more than the 85gb stated earlier because one +of the datasets doesn't need to be converted and is counted in both +steps 2 and 3. You really do only need 85gb. + +To download the archives, unpack them, and convert the audio to what's needed +by the training process, run: + +```text +setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ] + +Options: +--cleanup-archives: Automatically delete the tarballs or zipfiles after + they've been extracted. + +--cleanup-intermediate-files: Automatically delete the intermediate files + after they've been converted. + +``` + +On a 1gb/sec Internet connection, this will take about 25 minutes. + +The script detects if the datasets have already been downloaded, extracted +and/or converted and skips those steps as appropriate so if you've run the +script without the cleanup options, you can just run it again with those +options to clean them up. + +Now you're ready to train a wake word. Almost. + +## Train a Wake Word + +Training is done in 3 stages. + +1. Generate thousands of samples of the wake word with various voices, +pitches, speeds, inflections, etc. +2. Augment the samples with the training datasets to add background noise, etc. +3. 
Run the Tensorflow training. + +### Generate a sample for verification + +Before you start the full process, you're going to want to generate a single +wake word sample and play it back to ensure it sounds right. The wake word +should be spelled phonetically to give the sample generator the best chance +of success. + +```text +root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster" +===== Generating 1 sample of 'hey buster' ===== + Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt + Successfully loaded the model + Batch 1/0 complete + Done +Sample available at /data/work/test_sample/hey_buster.wav +Play it from your host. +``` + +You should then play that file from your host. The reason I used "hey buster" +as the wake word is to demonstrate why it's important to generate and listen +to a sample. If you try that exact input and play it back, you'll notice +that the generator didn't capture the "er" at the end very well. To get it to +do so, I had to add a period on the end as a "spacer". +"hey buster." worked much better. + +When you're happy with the sample, you can run the full process. + +### Run the full training process + +```text +train_wake_word [ --samples= ] [ --batch-size= ] + [ --training-steps= ] [ --cleanup-work-dir ] + [ ] + +Options: +--samples: The number of samples to generate for the wake word. + Default: 20000 + +--batch-size: How many samples should be generated at a time. The more + samples, the more memory is needed. + Default: 100 + +--training-steps: Number of training steps. More training steps means better + detection and false positive rates but also more time to train. + Default: 25000 + +--cleanup-work-dir: Delete the /data/work directory after successful training. + Default: false + + The word to train spelled phonetically. + Required. + + An optional pretty name to save to the json metadata file. + Default: The wake word with individual words capitalized + and punctuation removed. 
+ +``` + +By default, the training process creates 20,000 samples of your wake word and +runs 25,000 training steps. See [Tensorboard Results](#examine-your-model-with-tensorboard) +in the [Extra Credit](#extra-credit) section below for +why these are the defaults. Depending on resources available, this could take +between 30 and 60 minutes. + +The resulting tflite model files and logs will be placed in the +`/data/output/---` directory +and will therefore be available from your host in the directory you mapped +`/data` to. File names will have non-filename-friendly characters in your +wake word changed to underscores to make things easier. You'll need both the +tflite and json files to load on your device. Exactly how you load them +depends on the device and is beyond the scope of this project. + +The only real measure of success is how well the resulting model works +on a real device. If you encounter too many missed or false activations, +increasing the number of samples would probably improve the results more +than increasing the number of training steps. See +[Tensorboard Results](#examine-your-model-with-tensorboard) in the [Extra Credit](#extra-credit) section below. + +The output from the last step is filtered some by the script but still quite +verbose. The full log will be available in the output directory as +`training.log` if you're interested. Interpreting the log is beyond the scope +of this project however. + +You can train additional wake words or change the number of samples and +training steps by simply running `train_wake_word` again. No need to repeat +any of the earlier setup steps. If you change the wake word or the number of +wake word samples, the work directory will be deleted and all 3 steps re-run. +If you only change the number of training steps, the data from the first two +steps is still valid and only the 3rd step is run. + +All of the intermediate data is stored in the `/data/work` directory which will +grow to about 17gb with 20,000 wake word samples. 
Once the tflite model is +successfully generated and you're happy with the results, you can delete the +`/data/work` directory. + +### Training more than one wake word + +Once you have a container running, you +can easily train multiple wake words from your host: + +```shell +for wp in "hey_alexa" "hey_jenkins" ; do + docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp" +done +``` + +### Training time examples + +Training times depend on lots of things. These are examples only. +Your Mileage May Vary!!! + +```text +=============================================================================== + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: N/A + + Generate 10000 samples, 100/batch Elapsed time: 0:06:17 + Augment 10000 samples Elapsed time: 0:04:05 + 10000 training steps Elapsed time: 0:15:04 + ================================================== + Total Elapsed time: 0:25:26 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb + + Generate 10000 samples, 100/batch Elapsed time: 0:00:29 + Augment 10000 samples Elapsed time: 0:03:40 + 10000 training steps Elapsed time: 0:08:00 + ====================================================== + Total Elapsed time: 0:12:09 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: N/A + + Generate 20000 samples, 100/batch Elapsed time: 0:10:38 + Augment 20000 samples Elapsed time: 0:07:04 + 25000 training steps Elapsed time: 0:25:21 + ====================================================== + 
Total Elapsed time: 0:43:03 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb + + Generate 20000 samples, 100/batch Elapsed time: 0:00:53 + Augment 20000 samples Elapsed time: 0:07:05 + 25000 training steps Elapsed time: 0:19:13 + ====================================================== + Total Elapsed time: 0:27:11 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: N/A + + Generate 50000 samples, 100/batch Elapsed time: 0:30:47 + Augment 50000 samples Elapsed time: 0:20:22 + 40000 training steps Elapsed time: 1:01:51 + ================================================== + Total Elapsed time: 1:53:00 +================================================================================ + +================================================================================ + Training Summary + +CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb +GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb + + Generate 50000 samples, 100/batch Elapsed time: 0:02:08 + Augment 50000 samples Elapsed time: 0:19:13 + 40000 training steps Elapsed time: 0:42:23 + ====================================================== + Total Elapsed time: 1:03:44 +================================================================================ + + +``` + +The sample generation process is really the only one that uses multiple CPUs so +having fewer CPU threads available will probably make little difference. 
+ +## Extra Credit + +### Training defaults + +If you plan on training multiple wake words, you can set your own default +training parameters by creating a `/data/.defaults.env` file with the +following contents: + +```shell +# Variable names follow the command line parameters converted to upper case +# and with the dashes ('-') converted to underscores ('_'). +export SAMPLES=10000 +export TRAINING_STEPS=10000 + +# Don't use the GPU for any operations. Stick with the CPU only. +##export CUDA_VISIBLE_DEVICES=-1 + +``` + +### Examine your model with Tensorboard + +Tensorboard is a web-based graphical model viewer. You can use it to get an +idea of how many training steps are needed before accuracy results stop +improving. To use it, you'll have to expose port 6006 by adding `-p +6006:6006` to your `docker run` command line. If you didn't, don't worry. +Remember, the /data directory is mapped to a directory on your host so you +can simply stop and delete the current container and recreate it with the new +`docker run` command. No need to re-run any of the setup or training steps. + +To start Tensorboard, run: + +```shell +root@mww-cli:/# cd /data +root@mww-cli:/data# source .venv/bin/activate +(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output +``` + +Now on your host, point your browser at `http://localhost:6006/`, +click "SCALARS" at the top and take a look at the various charts. You'll see +a "train" and "validation" item for each training run you've performed. It's +the "train" items you're interested in. + + + +You have to be a Tensorflow expert to decipher most of the charts but +the "Accuracy" chart for this particular wake word and 50,000 samples would +seem to indicate that there's very little improvement after about 20,000 +training steps. + +![Accuracy Chart, 50000 samples](tensorboard1.png) + +In contrast, with only 5,000 wake word samples, there's still improvement to be had after +20,000 training steps. 
+ +![Accuracy Chart, 5000 samples](tensorboard2.png) + +Given that it's faster to generate wake word samples than it is to train, +20,000 samples and 25,000 training steps seems like a good compromise. This +chart has a bit less smoothing to show a bit more detail and includes the +50,000 sample run as well. This run took only 27 minutes as opposed to the +63 minutes it took for the 50,000 sample run. Now you know why 20,000 and +25,000 are the defaults for these scripts. + +![Accuracy Chart, 25000 samples](tensorboard3.png) -3. Run the training script as normal: -If personal samples are found, you’ll see a message during training indicating they are being included. -### Recording tips -- 10–30 recordings is usually enough to see a noticeable improvement -- Vary distance, volume, and tone slightly -- Record in the same environment where the wake word will be used (room noise matters) -- Use 16-bit WAV files if possible (most recorders do this by default) ---- -## šŸ™Œ Credits -This project builds upon the excellent work of [kahrendt/microWakeWord](https://github.com/kahrendt/microWakeWord). -Huge thanks to the original authors for their contributions to the open-source community! diff --git a/cli/.DS_Store b/cli/.DS_Store new file mode 100644 index 0000000..81f16e0 Binary files /dev/null and b/cli/.DS_Store differ diff --git a/cli/Dockerfile b/cli/Dockerfile deleted file mode 100644 index c460d93..0000000 --- a/cli/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -# Since this is a pure python environment, we don't need to start -# with a huge CUDA image. A standard Ubuntu image will do. 
-FROM ubuntu:24.04 - -ENV DEBIAN_FRONTEND=noninteractive \ - PYTHONUNBUFFERED=1 \ - PIP_NO_CACHE_DIR=1 \ - PIP_ROOT_USER_ACTION=ignore \ - HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ - PATH="/root/mww-scripts:${PATH}" - -# System deps -RUN apt-get update && apt-get install -y --no-install-recommends \ - python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \ - git wget curl unzip ca-certificates nano less \ - && rm -rf /var/lib/apt/lists/* \ - && mkdir -p /data - -COPY --chown=root:root --chmod=0755 .bashrc /root/ -COPY --chown=root:root --chmod=0755 setup_* wake_word_sample* train_wake_word \ - test_python cudainfo system_summary shell.functions requirements.txt /root/mww-scripts/ - -# Docker and Podman send the CMD a SIGTERM when you "stop" the container. Unfortunately, bash -# normally doesn't exit when it recieves a SIGTERM so docker/podman has to wait for the "stop" -# to timeout then SIGKILL the container. -# This little scriptlet causes bash to exit immediately when it receives the SIGTERM. -CMD ["/usr/bin/bash", "-c", "exec /usr/bin/bash --rcfile <(echo '[ -f ~/.bashrc ] && source ~/.bashrc ; trap exit SIGTERM ;')" ] diff --git a/cli/README.md b/cli/README.md deleted file mode 100644 index 359d73d..0000000 --- a/cli/README.md +++ /dev/null @@ -1,507 +0,0 @@ -# Run training from the command line - -## Overview - -With these scripts and Dockerfile, you can train new wake words from the -command line without using a Jupyter notebook. - -Differences between this Docker image and the Jupyter notebook image: - -* The Python training environment isn't included in the image. Instead, a - "virtual environment" (venv) is created in the `/data` directory which you - will have mounted to a host directory. This cuts about 7gb from the image - and allows the virtualenv to persist across container instances. 
- -* The logic from the Jupyter notebook is contained in individual Python - and shell scripts - -* No ports need to be exposed since the Jupyter notebook server isn't being - run. - -## TL;DR - -For the impatient among you... - -```shell -$ mkdir /some/work/directory # On a device with more than 150GB free space -$ docker build -t microwakeword-cli:latest . -$ docker run -it --rm --gpus=all -v /some/work/directory:/data --name=mww-cli microwakeword-cli:latest -root@mww-cli:/# cd /data -root@mww-cli:/data# setup_python_venv -##### You have about 4 minutes to drink coffee - -root@mww-cli:/data# setup_training_datasets --cleanup-archives --cleanup-intermediate-files -##### You have about 25 minutes for a quick lunch (on a 1gb/sec internet connection) - -root@mww-cli:/data# train_wake_word --cleanup-work-dir "wake_word" "Wake Word" -##### You have about 30-45 minutes for a nap depending on available system resources. -##### You'll be informed of where to find your trained model. -``` - -Load the trained model on your device and give it a try but don't be surprized -if you get a lot of missed or false activations. Read on to find out why. - -## Get Started - -Good, you stuck around! Now read the rest of the document before doing -anything. - -### Using a GPU - -Having an Nvidia GPU available can cut the training time by up to half. The -open-source nouveau driver shipped with Linux kernels doesn't support CUDA -however so if you have an Nvidia GPU and want to use it for training, you'll -need to install the official Nvidia driver from -https://www.nvidia.com/en-in/drivers/unix/ - -### Build the image - -You can use either Docker or Podman as your container management tool. -`docker` is used in the examples but if you have podman, just substitute -the command. - -Start by navigating to the directory that contains this README file and -the accompanying Dockerfile. Then... - - -```shell -docker build -t microwakeword-cli:latest . 
-``` - -This should be fairly quick and result in an image that's about 320mb in size -as it's basically a standard Ubunbtu24.04 image with a few added tools. - -So why isn't a pre-built image available for download? Because it'll probably -take longer to download a pre-built image than for you to create it locally. -GitHub's container registry is notoriously erratic when it comes to download -throughput. - -### Create a host work directory - -This directory will contain the Python virtual environment plus all of the -downloaded and generated data needed for training and the final trained -models. A full environment will need about 150gb of free space but read -further to see how to reduce this. - -Your `` will be mounted inside the container as `/data`. - -The training container will start a Bash shell so if you have Bash -aliases or Bashy things you like, create a `.bashrc` file in your -`` and put them in there. It'll automatically be included -any time you enter the container. - -### Create and start the container - -There are lots of options that control container creation. The simplest example -will create the container and give you an interactive shell. When you exit the -shell, the container will be stopped and removed leaving your `` -intact. - -```shell -$ docker run -it --rm --gpus=all -v :/data microwakeword-cli:latest -``` - -Options: - -* Remove the `--gpus=all` option if you don't have an Nvidia GPU or don't want to use it. -* Remove the `--rm` and add a `--name=mww-cli` option to keep the container - around and give it a name for training more than one wake word. You - can stop and remove it when you're ready. -* Add a `-d` option to start the container in the background and use `docker - attach mww-cli` or `docker exec -it mww-cli /bin/bash` to connect to it. - -When the container starts, you'll see: - -```text -======================================================= -WARNING: A python virtual environment wasn't found -at /data/.venv. 
You'll need to run setup_python_venv -before you'll be able to use this container for -training. -======================================================= -root@mww-cli:/# -``` - -Don't worry about the python WARNING right now. You'll be creating the -virtualenv in the next step. - -If you've forgotton to create and/or mount your host data directory, you'll -see an additional warning: - -```text -======================================================= -WARNING: The /data directory is NOT mounted. -Running the training process without /data mounted -could add over 140Gb of python packages and training -files to this container's storage which is probably -NOT what you want. - -You should remove this container and re-create it with -a 'docker run' option like '-v :/data' -making sure the host directory is on a device that has -enough free space. -======================================================= -``` - -You can certainly continue but it's a "really bad idea"ā„¢ because your -container storage could grow from a few hundred mb to over 140gb. - -At this point, you're in a Bash shell. - -### Create the Python virtual environment - -The Python virtual environment will contain all the software needed to train. -It gets created as `/data/.venv` and will take up about 11gb of disk space. - -The scripts that do all the work will be in the container's PATH so to setup -the virtual environment and install all of the packages, just run: - -```text -setup_python_venv [ --verbose ] - -Options: - ---verbose: Print the detailed "pip install" output. - -``` - -When the installation is finished, a test of the major components will be -run. 
- -Once the process is done, you should change to the `/data` directory and -activate the virtual environment with: - -```shell -root@mww-cli:/# cd /data -root@mww-cli:/data# source .venv/bin/activate -(.venv) root@mww-cli:/data# -``` - -Technically, you don't need to do either of these since the scripts -are in the PATH and they know to use the `/data` directory for everything. -It's more of an "if you're interested" thing. - -At this point, you have a container with all software installed. - -## Get the reference data - -The training process itself relies on a significant amount of audio reference -data that creates a simulated "audio environment" that your wake word will be -trained in. These "training datasets" include things like varying amounts of -reverberation, background music, background conversations, background noise, -etc. All said and done, it amounts to about 30gb of audio but with the -downloaded archives and extracted intermediate files, you'll need about 85gb -of free space. Thankfully, you only need to download the files once no -matter how many wake words you want to train and since it's stored in -`/data`, you can even remove the docker container and recreate it without -losing any of it. There are 4 datasets that are required. - -This is a three stage process... - -1. Download zipfiles or tarballs. (about 30gb) -2. Extract them. (about 50gb) -3. Convert them into the final form. (about 31gb) - -NOTE: The sizes add up to more than the 85gb stated earlier because one -of the datasets doesn't need to be covnerted and is counted in both -steps 2 and 3. You really do only need 85gb. - -To download the archives, unpack them, and convert the audio to what's needed -by the training process, run: - -```text -setup_training_datasets [ --cleanup-archives ] [ --cleanup-intermediate-files ] - -Options: ---cleanup-archives: Automatically delete the tarballs or zipfiles after - they've been extracted. 
- ---cleanup-intermediate-files: Automatically delete the intermediate files - after they've been converted. - -``` - -On a 1gb/sec Internet connection, this will take about 25 minutes. - -The script detects if the datasets have already been downloaded, extracted -and/or converted and skips those steps as appropriate so if you've run the -script without the cleanup options, you can just run it again with those -options to clean them up. - -Now you're ready to train a wake word. Almost. - -## Train a Wake Word - -Training is done in 3 stages. - -1. Generate thousands of samples of the wake word with various voices, -pitches, speeds, inflections, etc. -2. Augment the samples with the training datasets to add background noise, etc. -3. Run the Tensorflow training. - -### Generate a sample for verification - -Before you start the full process, you're going to want to generate a single -wake word sample and play it back to ensure it sounds right. The wake word -should be spelled phonetically to give the sample generator the best chance -of success. - -```text -root@mww-cli:/# wake_word_sample_generator --samples=1 "hey buster" -===== Generating 1 sample of 'hey buster' ===== - Loading /data/tools/piper-sample-generator/models/en_US-libritts_r-medium.pt - Successfully loaded the model - Batch 1/0 complete - Done -Sample available at /data/work/test_sample/hey_buster.wav -Play it from your host. -``` - -You should then play that file from your host. The reason I used "hey buster" -as the wake word is to demonstrate why it's important to generate and listen -to a sample. If you try that exact input and play it back, you'll notice -that the generator didn't capture the "er" at the end very well. To get it to -do so, I had to add a period on the end as a "spacer". -"hey buster." worked much better. - -When you're happy with the sample, you can run the full process. 
- -### Run the full training process - -```text -train_wake_word [ --samples= ] [ --batch-size= ] - [ --training-steps= ] [ --cleanup-work-dir ] - [ ] - -Options: ---samples: The number of samples to generate for the wake word. - Default: 20000 - ---batch-size: How many samples should be generated at a time. The more - samples, the more memory is needed. - Default: 100 - ---training-steps: Number of training steps. More training steps means better - detection and false positive rates but also more time to train. - Default: 25000 - ---cleanup-work-dir: Delete the /data/work directory after successful training. - Default: false - - The word to train spelled phonetically. - Required. - - An optional pretty name to save to the json metadata file. - Default: The wake word with individual words capitalized - and punctuation removed. - -``` - -By default, the training process creates 20,000 samples of your wake word and -runs 25,000 training steps. See [Tensorboard Results](#tensorboard-results) -in the [Extra Credit](#extra-credit) section below for -why these are the defaults. Depending on resources available, this could take -between 30 and 60 minutes. - -The resulting tflite model files and logs will be placed in the -`/data/output/---` directory -and will therefore be available from your host in the directory you mapped -`/data` to. File names will have non-filename-friendly characters in your -wake word changed to underscores to make things easier. You'll need both the -tflite and json files to load on your device. Exactly how you load them -depends on the device and is beyond the scope of this project. - -The only real measure of success is how well the resulting model works -on a real device. If you encounter too many missed or false activations, -increasing the number of samples would probably improve the results more -than increasing the number of training steps. See -[Tensorboard Results](#tensorboard-results) in the [Extra Credit](#extra-credit) section below. 
- -The output from the last step is filtered some by the script but still quite -verbose. The full log will be available in the output directory as -`training.log` if you're interested. Intepreting the log is beyond the scope -of this project however. - -You can train additional wake words or change the number of samples and -training steps by simply running `train_wake_word` again. No need to repeat -any of the earlier setup steps. If you change the wake word or the number of -wake word samples, the work directory will be deleted and all 3 steps re-run. -If you only change the number of training steps, the data from the first two -steps is still valid and only the 3rd step is run. - -All of the intermediate data is stored in the `/data/work` directory which will -grow to about 17gb with 20,000 wake word samples. Once the tflite model is -successfully generated and you're happy with the results, you can delete the -`/data/work` directory. - -### Training more than one wake word - -Once you have a container running, you -can easily train multiple wake words from your host: - -```shell -for wp in "hey_alexa" "hey_jenkins" ; do - docker exec -it mww-cli train_wake_word --cleanup-work-dir "$wp" -done -``` - -### Training time examples - -Training times depend on lots of things. These are examples only. -Your Mileage May Vary!!! 
- -```text -=============================================================================== - Training Summary - -CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb -GPU: N/A - - Generate 10000 samples, 100/batch Elapsed time: 0:06:17 - Augment 10000 samples Elapsed time: 0:04:05 - 10000 training steps Elapsed time: 0:15:04 - ================================================== - Total Elapsed time: 0:25:26 -================================================================================ - -================================================================================ - Training Summary - -CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb -GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb - - Generate 10000 samples, 100/batch Elapsed time: 0:00:29 - Augment 10000 samples Elapsed time: 0:03:40 - 10000 training steps Elapsed time: 0:08:00 - ====================================================== - Total Elapsed time: 0:12:09 -================================================================================ - -================================================================================ - Training Summary - -CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb -GPU: N/A - - Generate 20000 samples, 100/batch Elapsed time: 0:10:38 - Augment 20000 samples Elapsed time: 0:07:04 - 25000 training steps Elapsed time: 0:25:21 - ====================================================== - Total Elapsed time: 0:43:03 -================================================================================ - -================================================================================ - Training Summary - -CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb -GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb - - Generate 20000 samples, 100/batch Elapsed time: 0:00:53 - Augment 20000 samples Elapsed time: 0:07:05 - 25000 training steps Elapsed time: 0:19:13 - 
====================================================== - Total Elapsed time: 0:27:11 -================================================================================ - -================================================================================ - Training Summary - -CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb -GPU: N/A - - Generate 50000 samples, 100/batch Elapsed time: 0:30:47 - Augment 50000 samples Elapsed time: 0:20:22 - 40000 training steps Elapsed time: 1:01:51 - ================================================== - Total Elapsed time: 1:53:00 -================================================================================ - -================================================================================ - Training Summary - -CPU: Intel(R) Core(TM) i7-6950X CPU @ 3.00GHz (20 cores) Memory: 64195 mb -GPU: NVIDIA GeForce RTX 3060 (3584 cores) Memory: 11909 mb - - Generate 50000 samples, 100/batch Elapsed time: 0:02:08 - Augment 50000 samples Elapsed time: 0:19:13 - 40000 training steps Elapsed time: 0:42:23 - ====================================================== - Total Elapsed time: 1:03:44 -================================================================================ - - -``` - -The sample generation process is really the only one that uses multiple CPUs so -having fewer CPU threads available will probably make little difference. - -## Extra Credit - -### Training defaults - -If you plan on training multiple wake words, you can set your own default -training parameters by creating a `/data/.defaults.env` file with the -following contents: - -```shell -# Variable names follow the command line parameters converted to upper case -# and with the dashes ('-') converted to underscores ('_'). -export SAMPLES=10000 -export TRAINING_STEPS=10000 - -# Don't use the GPU for any operations. Stick with the CPU only. 
-##export CUDA_VISIBLE_DEVICES=-1 - -``` - -### Examine your model with Tensorboard - -Tensorboard is a web-based graphical model viewer. You can use it to get an -idea of how many training steps are needed before accuracy results stop -improving. To use it, you'll have to expose port 6006 by adding `-p -6006:6006` to your `docker run` command line. If you didn't, don't worry. -Remember, the /data directory is mapped to a directory on your host so you -can simply stop and delete the current container and recreate it with the new -`docker run` command. No need to re-run any of the setup or training steps. - -To start Tensorboard, run: - -```shell -root@mww-cli:/# cd /data -root@mww-cli:/data# source .venv/bin/activate -(.venv) root@mww-cli:/data# tensorboard --bind_all --logdir ./output -``` - -Now on your host, point your browser at `http://localhost:6006/`, -click "SCALARS" at the top and take a look at the various charts. You'll see -a "train" and "validation" item for each training run you've performed. It's -the "train" items you're interested in. - - - -You have to be a Tensorflow expert to decipher most of the charts but -the "Accuracy" chart for this particular wake word and 50,000 samples would -seem to idicate that there's very little improvement after about 20,000 -training steps. - -![Accuracy Chart, 50000 samples](tensorboard1.png) - -In contrast, with only 5,000 wake word samples, there's still improvement to be had after -20,000 training steps. - -![Accuracy Chart, 5000 samples](tensorboard2.png) - -Given that it's faster to generate wake word samples than it is to train, -20,000 samples and 25,000 training steps seems like a good compromise. This -chart has a bit less smoothing to show a bit more detail and includes the -50,000 sample run as well. This run took only 27 minutes as opposed to the -63 minutes it took for the 50,000 sample run. Now you know why 20,000 and -25,000 are the defaults for these scripts. 
- -![Accuracy Chart, 25000 samples](tensorboard3.png) - - - - - - diff --git a/cli/requirements.txt b/cli/requirements.txt deleted file mode 100644 index a0e801b..0000000 --- a/cli/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -# --- Packages needed by our scripts --- - -numpy==1.26.4 -scipy==1.12.0 -librosa==0.10.2.post1 -soundfile==0.12.1 -tqdm==4.67.1 -scikit-learn==1.6.0 -numba==0.63.1 -PyYAML==6.0.3 diff --git a/cli/setup_python_venv b/cli/setup_python_venv index 153d43d..4a77557 100755 --- a/cli/setup_python_venv +++ b/cli/setup_python_venv @@ -1,5 +1,6 @@ #!/bin/bash -PROGDIR="$(dirname $(realpath $0))" +PROGDIR="$(dirname "$(realpath "$0")")" +ROOTDIR="$(dirname "${PROGDIR}")" KNOWN_ARGS=( data-dir python gpu no-gpu ) source "${PROGDIR}/shell.functions" @@ -27,7 +28,7 @@ EOF exit 1 fi -[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath ${DATA_DIR})" +[ -n "${DATA_DIR}" ] && DATA_DIR="$(realpath "${DATA_DIR}")" [ -d "${DATA_DIR}" ] || { echo "Data directory '${DATA_DIR}' doesn't exist." >&2 exit 1 @@ -52,7 +53,8 @@ if [ -n "${PYTHON}" ] ; then PYTHONS=( "${PYTHON}" ) unset PYTHON else - PYTHONS=( python3.12 python3.10 ) + # Add 3.11 as a common middle-ground (especially outside Ubuntu 24.04) + PYTHONS=( python3.12 python3.11 python3.10 ) fi for p in "${PYTHONS[@]}" ; do @@ -60,14 +62,14 @@ for p in "${PYTHONS[@]}" ; do done [ -n "${PYTHON}" ] || { - echo "A python 3.12 or 3.10 interpreter wasn't found. You 'll need to install one before proceeding." >&2 + echo "A python 3.12/3.11/3.10 interpreter wasn't found. You'll need to install one before proceeding." >&2 exit 1 } -if [ -d "${VENV}" ] ; then +if [ -d "${VENV}" ] ; then if [ -f "${DATA_DIR}/.mww-data-dir" ] ; then source "${VENV}/bin/activate" || { - echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." >&2 + echo "Unable to activate existing virtualenv '${VENV}'. You should delete it and try again." 
>&2 exit 1 } else @@ -82,24 +84,28 @@ if [ -z "$VIRTUAL_ENV" ] ; then else echo " ===== Updating virtualenv at '${VENV}' =====" fi + ${PYTHON} -m venv --upgrade-deps "${VENV}" source "${VENV}/bin/activate" set -euo pipefail -declare -a progfiles=( $(find ${PROGDIR} -mindepth 1 -maxdepth 1 -executable -type f) ) +# Symlink CLI scripts into .venv/bin +declare -a progfiles=( $(find "${PROGDIR}" -mindepth 1 -maxdepth 1 -executable -type f) ) progfiles+=( "${PROGDIR}/shell.functions" ) +# Also symlink the top-level entrypoint if present +[ -x "${ROOTDIR}/train_wake_word" ] && progfiles+=( "${ROOTDIR}/train_wake_word" ) + for f in "${progfiles[@]}" ; do - ln -sfr "${f}" ".venv/bin/$(basename ${f})" + ln -sfr "${f}" ".venv/bin/$(basename "${f}")" done # -# Pip doesn't process packages from requirements.txt in -# order but order is important because tensorflow, torch, -# onnxruntime and micro-wake-word all depend on CUDA packages -# at various versions. They need to be installed in this specific -# order or they may not be able to use the GPU. +# Pip doesn't process packages from requirements.txt in order but order is +# important because tensorflow, torch, onnxruntime and micro-wake-word all +# depend on CUDA packages at various versions. They need to be installed in +# this specific order or they may not be able to use the GPU. # export PIP_PROGRESS_BAR=off export PIP_NO_COLOR=1 @@ -117,7 +123,8 @@ pip_install() { START_TS=$EPOCHSECONDS echo " ===== Installing common requirements =====" -pip_install -r "${PROGDIR}/requirements.txt" +# requirements.txt lives in repo root now +pip_install -r "${ROOTDIR}/requirements.txt" ${GPU} && tfgpu='[and-cuda]' || tfgpu="" echo " ===== Installing Tensorflow${tfgpu} =====" @@ -140,7 +147,7 @@ pip_install -e "${MWW}" echo " ===== Checking piper-sample-generator =====" PSG="${DATA_DIR}/tools/piper-sample-generator" -if [ ! -d "${PSG}" ] || [ -n "$(git -C ${PSG} status --porcelain)" ] ; then +if [ ! 
-d "${PSG}" ] || [ -n "$(git -C "${PSG}" status --porcelain)" ] ; then rm -rf "${PSG}" || : echo " Cloning piper-sample-generator to ${DATA_DIR}/tools" git clone https://github.com/rhasspy/piper-sample-generator "${PSG}" &>/dev/null @@ -171,13 +178,11 @@ echo " ===== Installing keras =====" # keras 3.13 has "issues" so we need to back down to 3.12. pip_install "keras==3.12.0" -${PROGDIR}/test_python --data-dir="${DATA_DIR}" +"${PROGDIR}/test_python" --data-dir="${DATA_DIR}" touch .mww-data-dir END_TS=$EPOCHSECONDS echo "Run 'source ${VENV}/bin/activate' to activate the new virtualenv in the current shell." -print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete" - - +print_elapsed_time "${START_TS}" "${END_TS}" "Python package installation complete" \ No newline at end of file diff --git a/cli/setup_training_datasets b/cli/setup_training_datasets index fc6e280..c343e95 100755 --- a/cli/setup_training_datasets +++ b/cli/setup_training_datasets @@ -1,8 +1,9 @@ #!/bin/bash set -euo pipefail -PROGPATH=$(realpath "$0") -PROGDIR=$(dirname "${PROGPATH}") +PROGPATH="$(realpath "$0")" +PROGDIR="$(dirname "${PROGPATH}")" +ROOTDIR="$(dirname "${PROGDIR}")" # repo root (train_wake_word, requirements.txt, etc.) KNOWN_ARGS=( data-dir cleanup-archives cleanup-intermediate-files ) source "${PROGDIR}/shell.functions" @@ -27,22 +28,38 @@ EOF exit 1 fi +# Normalize + validate DATA_DIR (shell.functions typically sets a default, +# but this makes the script standalone-safe) +[ -n "${DATA_DIR:-}" ] && DATA_DIR="$(realpath "${DATA_DIR}")" +[ -d "${DATA_DIR}" ] || { + echo "Data directory '${DATA_DIR}' doesn't exist." 
>&2 + exit 1 +} + cd "${DATA_DIR}" START_TS=$EPOCHSECONDS echo -e "\n===== Setting up Training Datasets =====\n" -${PROGDIR}/setup_negative_datasets --cleanup-archives=${CLEANUP_ARCHIVES} \ - --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}" +"${PROGDIR}/setup_negative_datasets" \ + --cleanup-archives="${CLEANUP_ARCHIVES}" \ + --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \ + --data-dir="${DATA_DIR}" -${PROGDIR}/setup_mit_audio --cleanup-archives=${CLEANUP_ARCHIVES} \ - --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}" +"${PROGDIR}/setup_mit_audio" \ + --cleanup-archives="${CLEANUP_ARCHIVES}" \ + --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \ + --data-dir="${DATA_DIR}" -${PROGDIR}/setup_audioset --cleanup-archives=${CLEANUP_ARCHIVES} \ - --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}" +"${PROGDIR}/setup_audioset" \ + --cleanup-archives="${CLEANUP_ARCHIVES}" \ + --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \ + --data-dir="${DATA_DIR}" -${PROGDIR}/setup_fma --cleanup-archives=${CLEANUP_ARCHIVES} \ - --cleanup-intermediate-files=${CLEANUP_INTERMEDIATE_FILES} --data-dir="${DATA_DIR}" +"${PROGDIR}/setup_fma" \ + --cleanup-archives="${CLEANUP_ARCHIVES}" \ + --cleanup-intermediate-files="${CLEANUP_INTERMEDIATE_FILES}" \ + --data-dir="${DATA_DIR}" -END_TS=$(date +%s.%N) +END_TS=$EPOCHSECONDS print_elapsed_time "${START_TS}" "${END_TS}" "Training dataset setup" diff --git a/cli/tensorboard1.png b/cli/tensorboard1.png deleted file mode 100644 index a7741d9..0000000 Binary files a/cli/tensorboard1.png and /dev/null differ diff --git a/cli/tensorboard2.png b/cli/tensorboard2.png deleted file mode 100644 index 9042fdc..0000000 Binary files a/cli/tensorboard2.png and /dev/null differ diff --git a/cli/tensorboard3.png b/cli/tensorboard3.png deleted file mode 100644 index 6df0306..0000000 Binary files a/cli/tensorboard3.png and 
/dev/null differ diff --git a/cli/wake_word_sample_augmenter b/cli/wake_word_sample_augmenter old mode 100755 new mode 100644 diff --git a/cli/wake_word_sample_trainer b/cli/wake_word_sample_trainer old mode 100755 new mode 100644 diff --git a/dockerfile b/dockerfile index 52fa989..eb4694c 100644 --- a/dockerfile +++ b/dockerfile @@ -1,59 +1,37 @@ -# Standard Ubuntu base image. CUDA base images not needed. -FROM ubuntu:22.04 +# Base +FROM ubuntu:24.04 -ENV DEBIAN_FRONTEND=noninteractive \ - PYTHONUNBUFFERED=1 \ - PIP_NO_CACHE_DIR=1 \ - PIP_ROOT_USER_ACTION=ignore \ - HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ - XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/local/cuda" \ - PATH="/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV DEBIAN_FRONTEND=noninteractive -# System deps (+dev headers for building C/C++ extensions) +# System deps RUN apt-get update && apt-get install -y --no-install-recommends \ - python3.10 python3.10-venv python3.10-distutils python3.10-dev python3-pip \ - git wget curl unzip ca-certificates git-lfs \ - build-essential g++ cmake \ - libsndfile1 libsndfile1-dev libffi-dev \ - ffmpeg \ - && rm -rf /var/lib/apt/lists/* + python3.12 python3.12-venv python3.12-dev python3-pip python-is-python3 \ + git wget curl unzip ca-certificates nano less \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /data -# Use python3.10 everywhere -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \ - && update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 +# Recorder port +EXPOSE 8789 -# ---- No cuDNN repo meddling needed if using TF 2.17.x ---- +# Script root +WORKDIR /root/mww-scripts -# Python deps -# Order is important. onnxruntime, tensorflow and torch have -# to be installed in the order below or their cuda dependencies -# will conflict. 
-COPY requirements.txt /tmp/requirements.txt -RUN pip install --upgrade pip \ - && pip install "numpy==1.26.4" "cython>=0.29.36" \ - && pip install -r /tmp/requirements.txt \ - && pip install "onnxruntime-gpu[cuda]>=1.16.0" \ - && pip install "tensorflow[and-cuda]==2.18.0" \ - "tensorboard==2.18.0" \ - "tensorboard-data-server==0.7.2" \ - "tensorflow-io-gcs-filesystem==0.37.1" \ - && pip install \ - torch==2.7.1 \ - torchaudio==2.7.1 \ - --index-url https://download.pytorch.org/whl/cu128 +# Bash environment +COPY --chown=root:root --chmod=0755 .bashrc /root/ -# Workspace + notebook fallback -RUN mkdir -p /data -WORKDIR /data -COPY microWakeWord_training_notebook.ipynb /root/ +# Root-level entrypoints +COPY --chown=root:root --chmod=0755 \ + train_wake_word \ + run_recorder.sh \ + recorder_server.py \ + requirements.txt \ + /root/mww-scripts/ -# Startup script (copies default notebook if missing) -COPY startup.sh /usr/local/bin/startup.sh -RUN chmod +x /usr/local/bin/startup.sh +# CLI folder (THIS IS THE IMPORTANT CHANGE) +COPY --chown=root:root cli/ /root/mww-scripts/cli/ -EXPOSE 8888 +# Static UI for recorder +COPY --chown=root:root --chmod=0644 static/index.html /root/mww-scripts/static/index.html -CMD ["/bin/bash", "-lc", "/usr/local/bin/startup.sh && \ - exec jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root \ - --ServerApp.token='' --ServerApp.password='' --ServerApp.root_dir=/data"] +# recorder server +CMD ["/bin/bash", "-lc", "/root/mww-scripts/run_recorder.sh"] diff --git a/microWakeWord_training_notebook.ipynb b/microWakeWord_training_notebook.ipynb deleted file mode 100644 index 19b54dc..0000000 --- a/microWakeWord_training_notebook.ipynb +++ /dev/null @@ -1,1073 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# šŸ„” MicroWakeWord Trainer — Tater Totterson Edition\n", - "# ==================================================\n", - "# Welcome, friend! 
šŸ‘‹ This notebook will help you train your very own wake word model.\n", - "# Think of it like teaching Tater Totterson to recognize when you say a special word.\n", - "#\n", - "# By the end, you'll have:\n", - "# āœ… A trained TensorFlow Lite model ready for on-device detection.\n", - "# āœ… A matching JSON manifest you can drop straight into ESPHome.\n", - "#\n", - "# This flow is optimized for Python 3.10 and NVIDIA GPUs (but should work elsewhere too).\n", - "# You can customize the wake word, play with training parameters, and experiment with\n", - "# different datasets until you get something that feels just right. šŸ’Ŗ\n", - "#\n", - "# ⚔ Quick Tips:\n", - "# • Change TARGET_WORD below to whatever you want your wake word to be.\n", - "# • Rerun the notebook from the top if you change it (to regenerate everything).\n", - "# • Expect to experiment — tweaking hyperparameters is part of the fun!\n", - "#\n", - "# When you’re done, you’ll get two files:\n", - "# 1ļøāƒ£ .tflite — your trained model.\n", - "# 2ļøāƒ£ .json — a manifest for ESPHome integration.\n", - "#\n", - "# More info & examples:\n", - "# šŸ”— https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker\n", - "\n", - "# --- Set your wake word here ---\n", - "TARGET_WORD = \"tater\" # šŸ—£ļø Change this to whatever phrase you want!\n", - "print(f\"šŸ„” Tater Totterson is listening for: '{TARGET_WORD}'\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BFf6511E65ff" - }, - "outputs": [], - "source": [ - "import platform\n", - "import sys\n", - "import os\n", - "\n", - "# mac-only helper deps\n", - "if platform.system() == \"Darwin\":\n", - " !\"{sys.executable}\" -m pip install 'git+https://github.com/puddly/pymicro-features@puddly/minimum-cpp-version' --root-user-action=ignore\n", - "\n", - "!\"{sys.executable}\" -m pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f' 
--root-user-action=ignore\n", - "\n", - "# šŸ‘‡ use the actual location in the container\n", - "repo_path = \"/data/microWakeWord\"\n", - "\n", - "if not os.path.exists(repo_path):\n", - " print(\"ā¬‡ļø Cloning microWakeWord repository to /data…\")\n", - " !git clone https://github.com/TaterTotterson/micro-wake-word.git {repo_path}\n", - "\n", - "# optional: pin to a commit\n", - "# !cd /data/microWakeWord && git checkout ac6502bf48b5e372c47ed509f5f5ca181e6d50bb\n", - "\n", - "if os.path.exists(repo_path):\n", - " print(\"šŸ“¦ Installing microWakeWord...\")\n", - " !\"{sys.executable}\" -m pip install -e {repo_path} --root-user-action=ignore\n", - "else:\n", - " print(f\"āŒ Repository not found at {repo_path}. Clone might have failed.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BFf6511E65ff" - }, - "outputs": [], - "source": [ - "# --- GPU Check (Torch + ONNX Runtime) ---\n", - "\n", - "import torch\n", - "import onnxruntime as ort\n", - "\n", - "print(\"šŸ”§ Torch CUDA Available:\", torch.cuda.is_available())\n", - "if torch.cuda.is_available():\n", - " print(\" • Device count:\", torch.cuda.device_count())\n", - " print(\" • Current device:\", torch.cuda.current_device())\n", - " print(\" • Device name:\", torch.cuda.get_device_name(torch.cuda.current_device()))\n", - "else:\n", - " print(\"āš ļø Torch cannot see a GPU — check Docker runtime (--gpus all) and nvidia-container-toolkit\")\n", - "\n", - "print(\"\\nšŸ”§ ONNX Runtime Providers:\")\n", - "try:\n", - " providers = ort.get_available_providers()\n", - " print(\" •\", providers)\n", - " if \"CUDAExecutionProvider\" not in providers:\n", - " print(\"āš ļø CUDAExecutionProvider not available — ONNX will fall back to CPU.\")\n", - "except Exception as e:\n", - " print(\"āš ļø Could not query ONNX Runtime providers:\", e)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dEluu7nL7ywd" - }, - "outputs": [], - "source": 
[ - "# NVIDIA Linux Docker: generate 1 sample of the target word (robust + CUDA check)\n", - "\n", - "import os, sys, shutil, subprocess, time, platform\n", - "from pathlib import Path\n", - "from IPython.display import Audio, display\n", - "\n", - "REPO_URL = \"https://github.com/rhasspy/piper-sample-generator\"\n", - "REPO_DIR = Path.cwd() / \"piper-sample-generator\"\n", - "MODELS_DIR = REPO_DIR / \"models\"\n", - "MODEL_NAME = \"en_US-libritts_r-medium.pt\"\n", - "MODEL_URL = f\"https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/{MODEL_NAME}\"\n", - "AUDIO_OUT_DIR = Path.cwd() / \"generated_samples\"\n", - "AUDIO_PATH = AUDIO_OUT_DIR / \"0.wav\"\n", - "\n", - "def run(cmd, check=True):\n", - " print(\"→\", \" \".join(cmd))\n", - " proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)\n", - " for line in proc.stdout:\n", - " print(line, end=\"\")\n", - " rc = proc.wait()\n", - " if check and rc != 0:\n", - " raise RuntimeError(f\"Command failed with exit code {rc}: {' '.join(cmd)}\")\n", - " return rc\n", - "\n", - "def pip_install(*pkgs):\n", - " run([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pip\"], check=False)\n", - " run([sys.executable, \"-m\", \"pip\", \"install\", *pkgs])\n", - "\n", - "def safe_clone(repo_url, branch=None, dest=REPO_DIR, retries=2):\n", - " if dest.exists() and not (dest / \".git\").exists():\n", - " print(\"āš ļø Found partial clone. Removing…\")\n", - " shutil.rmtree(dest, ignore_errors=True)\n", - " if not dest.exists():\n", - " for i in range(retries + 1):\n", - " try:\n", - " cmd = [\"git\", \"clone\", \"--depth\", \"1\", repo_url, str(dest)]\n", - " if branch:\n", - " cmd = [\"git\", \"clone\", \"--depth\", \"1\", \"--branch\", branch, repo_url, str(dest)]\n", - " run(cmd)\n", - " break\n", - " except Exception as e:\n", - " if i == retries:\n", - " raise\n", - " print(f\"Clone failed ({i+1}/{retries+1}). 
Retrying in 2s… [{e}]\")\n", - " time.sleep(2)\n", - "\n", - "def ensure_model():\n", - " MODELS_DIR.mkdir(parents=True, exist_ok=True)\n", - " mp = MODELS_DIR / MODEL_NAME\n", - " if not mp.exists() or mp.stat().st_size == 0:\n", - " import urllib.request\n", - " print(f\"Downloading model to {mp} …\")\n", - " with urllib.request.urlopen(MODEL_URL) as r, open(mp, \"wb\") as f:\n", - " shutil.copyfileobj(r, f)\n", - " if mp.stat().st_size < 100 * 1024:\n", - " raise RuntimeError(\"Downloaded model looks too small; download may have failed.\")\n", - " print(f\"āœ… Model ready: {mp}\")\n", - "\n", - "# 1) Clone main repo (Linux/NVIDIA)\n", - "print(\"Linux/NVIDIA detected — using main piper-sample-generator repo.\")\n", - "safe_clone(REPO_URL)\n", - "\n", - "# 2) Install deps\n", - "# - piper-tts provides the `piper` module (required by generate_samples.py)\n", - "# - piper-phonemize-cross does the phonemization\n", - "# - onnxruntime-gpu enables CUDA (container must have NVIDIA runtime)\n", - "deps = [\n", - " \"piper-tts>=1.2.0\",\n", - " \"piper-phonemize-cross==1.2.1\",\n", - " \"soundfile\",\n", - " \"numpy\",\n", - " \"onnxruntime-gpu>=1.16.0\",\n", - "]\n", - "pip_install(*deps)\n", - "\n", - "# 3) Verify CUDA provider is available\n", - "try:\n", - " import onnxruntime as ort\n", - " providers = ort.get_available_providers()\n", - " print(f\"ONNX Runtime providers: {providers}\")\n", - " if \"CUDAExecutionProvider\" not in providers:\n", - " print(\"āš ļø CUDAExecutionProvider not available. 
\"\n", - " \"The sample will still run on CPU, but check your NVIDIA container setup \"\n", - " \"(nvidia-container-toolkit, runtime, and driver).\")\n", - "except Exception as e:\n", - " print(\"āš ļø Could not import onnxruntime to verify providers:\", e)\n", - "\n", - "# 4) Ensure model present\n", - "ensure_model()\n", - "\n", - "# 5) Generate one sample\n", - "AUDIO_OUT_DIR.mkdir(parents=True, exist_ok=True)\n", - "gen_script = REPO_DIR / \"generate_samples.py\"\n", - "if not gen_script.exists():\n", - " raise FileNotFoundError(f\"Missing generator: {gen_script}\")\n", - "\n", - "cmd = [\n", - " sys.executable, str(gen_script),\n", - " TARGET_WORD,\n", - " \"--model\", str(MODELS_DIR / MODEL_NAME), # ← pass the generator .pt explicitly\n", - " \"--max-samples\", \"1\",\n", - " \"--batch-size\", \"1\",\n", - " \"--output-dir\", str(AUDIO_OUT_DIR),\n", - "]\n", - "run(cmd)\n", - "\n", - "# 6) Play the audio (if the notebook frontend supports it)\n", - "if AUDIO_PATH.exists():\n", - " print(f\"šŸŽ§ Playing {AUDIO_PATH}\")\n", - " display(Audio(str(AUDIO_PATH), autoplay=True))\n", - "else:\n", - " print(f\"Audio file not found at {AUDIO_PATH}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-SvGtCCM9akR" - }, - "outputs": [], - "source": [ - "# Generate a large number of wake word samples for training (with length-scale sweep)\n", - "import sys, subprocess\n", - "from pathlib import Path\n", - "\n", - "REPO_DIR = Path.cwd() / \"piper-sample-generator\"\n", - "MODELS_DIR = REPO_DIR / \"models\"\n", - "MODEL_NAME = \"en_US-libritts_r-medium.pt\"\n", - "\n", - "MAX_SAMPLES = 50000\n", - "BATCH_SIZE = 100\n", - "\n", - "# Piper \"speed\" control via piper-sample-generator is length_scale(s)\n", - "LENGTH_SCALES = [\"0.85\", \"0.95\", \"1.00\", \"1.05\", \"1.15\"]\n", - "\n", - "cmd = [\n", - " sys.executable,\n", - " str(REPO_DIR / \"generate_samples.py\"),\n", - " TARGET_WORD,\n", - " \"--model\", str(MODELS_DIR / 
MODEL_NAME),\n", - " \"--max-samples\", str(MAX_SAMPLES),\n", - " \"--batch-size\", str(BATCH_SIZE),\n", - " \"--output-dir\", \"generated_samples\",\n", - " \"--length-scales\", *LENGTH_SCALES,\n", - "]\n", - "\n", - "print(\"→\", \" \".join(cmd))\n", - "subprocess.run(cmd, check=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YJRG4Qvo9nXG" - }, - "outputs": [], - "source": [ - "# NVIDIA/Linux dataset prep to match the Apple behavior, but with pinned AudioSet\n", - "# MIT RIR -> resample to 16 kHz\n", - "# AudioSet -> fetch from a working HF revision, convert to 16 kHz mono, skip bad\n", - "# FMA -> resample to 16 kHz mono\n", - "\n", - "import os, sys, subprocess, scipy.io.wavfile, numpy as np\n", - "from pathlib import Path\n", - "from tqdm import tqdm\n", - "import soundfile as sf\n", - "import librosa\n", - "from datasets import load_dataset\n", - "\n", - "# -------------------------------------------------\n", - "# small shell helpers (for curl/tar probing)\n", - "# -------------------------------------------------\n", - "def sh(cmd: str) -> int:\n", - " return subprocess.call(cmd, shell=True)\n", - "\n", - "def curl(url: str, out: Path) -> int:\n", - " # -L follow, -s silent, --fail to get nonzero on 404\n", - " return subprocess.call(f\"curl -L -s --fail '{url}' -o '{out}'\", shell=True)\n", - "\n", - "def write_wav(dst: Path, data: np.ndarray, sr: int):\n", - " x = np.clip(data, -1.0, 1.0)\n", - " scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))\n", - "\n", - "# -----------------------------\n", - "# MIT RIR (resample to 16 kHz)\n", - "# -----------------------------\n", - "print(\"=== MIT RIR ===\")\n", - "rir_out = Path(\"mit_rirs\")\n", - "rir_out.mkdir(exist_ok=True)\n", - "if not any(rir_out.rglob(\"*.wav\")):\n", - " ok = 0\n", - " try:\n", - " # Avoid datasets.Audio to keep TorchCodec out:\n", - " # Use streaming=True + manual decode with librosa\n", - " print(\"ā¬‡ļø MIT RIR (streaming + 
manual decode)…\")\n", - " ds = load_dataset(\n", - " \"davidscripka/MIT_environmental_impulse_responses\",\n", - " split=\"train\",\n", - " streaming=True\n", - " )\n", - " for i, row in enumerate(tqdm(ds)):\n", - " try:\n", - " audio_path = row[\"audio\"][\"path\"]\n", - " y, sr = librosa.load(audio_path, sr=16000, mono=True)\n", - " write_wav(rir_out / f\"rir_{i:04d}.wav\", y, 16000)\n", - " ok += 1\n", - " except Exception:\n", - " pass\n", - " print(f\"āœ… MIT RIR saved: {ok} files\")\n", - " except Exception as e:\n", - " print(f\"āš ļø MIT RIR download failed: {e}\")\n", - " # Fallback ZIP route\n", - " try:\n", - " print(\"ā¬‡ļø MIT RIR (fallback ZIP)…\")\n", - " zip_url = \"https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip\"\n", - " zip_path = rir_out.parent / \"MIT_RIR_Audio.zip\"\n", - " if not zip_path.exists():\n", - " os.system(f\"wget -q -O '{zip_path}' '{zip_url}'\")\n", - " os.system(f'unzip -q -o \"{zip_path}\" -d \"{rir_out}\"')\n", - " # Normalize to 16k mono\n", - " for p in tqdm(list(rir_out.rglob(\"*.wav\")), desc=\"Normalize MIT RIR\"):\n", - " a, sr = sf.read(p, always_2d=False)\n", - " if a.ndim > 1:\n", - " a = a[:, 0]\n", - " if sr != 16000:\n", - " a, _ = librosa.load(p, sr=16000, mono=True)\n", - " write_wav(p, a, 16000)\n", - " print(\"āœ… MIT RIR fallback complete\")\n", - " except Exception as e2:\n", - " print(f\"āŒ MIT RIR fallback failed: {e2}\")\n", - "else:\n", - " print(\"āœ… mit_rirs exists; skipping.\")\n", - "\n", - "# ============================================================\n", - "# AudioSet (pinned FLAC .tar → 16k mono, skip bad files)\n", - "# ============================================================\n", - "print(\"\\n=== AudioSet subset (pinned FLAC .tar → 16k mono) ===\")\n", - "audioset_dir = Path(\"audioset\"); audioset_dir.mkdir(exist_ok=True)\n", - "audioset_out = Path(\"audioset_16k\"); audioset_out.mkdir(exist_ok=True)\n", - "\n", - "if any(audioset_out.rglob(\"*.wav\")):\n", - " print(\"āœ… 
audioset_16k exists; skipping.\")\n", - "else:\n", - " # commits / refs we know about — we’ll probe them\n", - " REV_CANDIDATES = [\n", - " \"6762f044d1c88619c7f2006486036192128fb07e\",\n", - " \"0049167e89f259a010c3f070fe3666d9e5242836\",\n", - " \"ceb9eaaa7844c9ad7351e659c84a572e376ad06d\",\n", - " \"main\", # last resort\n", - " ]\n", - " # possible folder layouts\n", - " TAR_PATTERNS = [\n", - " \"data/bal_train0{idx}.tar\",\n", - " \"data/bal_train/bal_train0{idx}.tar\",\n", - " ]\n", - "\n", - " def find_working_rev():\n", - " for rev in REV_CANDIDATES:\n", - " for pat in TAR_PATTERNS:\n", - " probe = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{pat.format(idx=0)}\"\n", - " rc = sh(f\"curl -I -L --fail -s '{probe}' > /dev/null\")\n", - " if rc == 0:\n", - " return rev, pat\n", - " return None, None\n", - "\n", - " rev, pattern = find_working_rev()\n", - " if rev is None:\n", - " raise RuntimeError(\"Could not locate an AudioSet revision with FLAC tarballs still present on HF.\")\n", - "\n", - " print(f\"šŸ“Œ Using AudioSet revision: {rev}\")\n", - " print(f\"šŸ—‚ļø Tar layout pattern: {pattern}\")\n", - "\n", - " # download + extract bal_train00..09\n", - " for i in range(10):\n", - " rel = pattern.format(idx=i)\n", - " url = f\"https://huggingface.co/datasets/agkphysics/AudioSet/resolve/{rev}/{rel}\"\n", - " fname = rel.split(\"/\")[-1]\n", - " out_tar = audioset_dir / fname\n", - " if not out_tar.exists():\n", - " print(f\"ā¬‡ļø {fname}\")\n", - " rc = curl(url, out_tar)\n", - " if rc != 0:\n", - " print(f\"āš ļø Could not fetch {fname} at rev {rev}; continuing.\")\n", - " continue\n", - " print(f\"šŸ“¦ Extract {fname}\")\n", - " rc = sh(f\"tar -xf '{out_tar}' -C '{audioset_dir}'\")\n", - " if rc != 0:\n", - " print(f\"āš ļø tar extract failed for {fname}; continuing.\")\n", - "\n", - " # convert FLAC → 16k mono WAV\n", - " flacs = list(audioset_dir.rglob(\"*.flac\"))\n", - " print(f\"šŸ”Ž FLAC files: {len(flacs)}\")\n", - " 
audioset_bad = []\n", - " ok = 0\n", - " for p in tqdm(flacs, desc=\"AudioSet→WAV (resample 16k mono)\"):\n", - " try:\n", - " y, _ = librosa.load(p, sr=16000, mono=True)\n", - " if y.size == 0:\n", - " raise ValueError(\"empty audio\")\n", - " write_wav(audioset_out / (p.stem + \".wav\"), y, 16000)\n", - " ok += 1\n", - " except Exception as e:\n", - " audioset_bad.append(f\"{p}:{e}\")\n", - "\n", - " if audioset_bad:\n", - " (audioset_out / \"audioset_corrupted_files.log\").write_text(\"\\n\".join(audioset_bad))\n", - " print(f\"āœ… AudioSet complete ({ok} ok, {len(audioset_bad)} failed)\")\n", - "\n", - "# -----------------------------\n", - "# FMA xsmall (resample to 16 kHz mono)\n", - "# -----------------------------\n", - "print(\"\\n=== FMA xsmall ===\")\n", - "fma_zip_dir = Path(\"fma\"); fma_zip_dir.mkdir(exist_ok=True)\n", - "fma_out = Path(\"fma_16k\"); fma_out.mkdir(exist_ok=True)\n", - "\n", - "zipname = \"fma_xs.zip\"\n", - "zipurl = f\"https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/{zipname}\"\n", - "zipout = fma_zip_dir / zipname\n", - "if not zipout.exists():\n", - " os.system(f\"wget -q -O '{zipout}' '{zipurl}'\")\n", - " os.system(f\"cd fma && unzip -q '{zipname}'\")\n", - "\n", - "mp3s = list(Path(\"fma/fma_small\").rglob(\"*.mp3\"))\n", - "print(f\"šŸŽµ FMA mp3 count: {len(mp3s)}\")\n", - "corrupt = []\n", - "for p in tqdm(mp3s, desc=\"FMA→16k WAV\"):\n", - " try:\n", - " y, sr = librosa.load(p, sr=16000, mono=True)\n", - " if y.size == 0:\n", - " raise ValueError(\"empty audio\")\n", - " write_wav(fma_out / (p.stem + \".wav\"), y, 16000)\n", - " except Exception as e:\n", - " corrupt.append(f\"{p}:{e}\")\n", - "if corrupt:\n", - " Path(\"fma_corrupted_files.log\").write_text(\"\\n\".join(corrupt))\n", - "\n", - "print(\"\\nāœ… Dataset prep complete!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XW3bmbI5-JAz" - }, - "outputs": [], - "source": [ - "# Sets up the augmentations.\n", - 
"# To improve your model, experiment with these settings and use more sources of\n", - "# background clips.\n", - "import sys, os\n", - "from pathlib import Path\n", - "\n", - "# try the common places we’ve used\n", - "candidates = [\n", - " \"/data/microWakeWord\", # what the last install log showed\n", - " \"/data/microwakeword\", # lowercase variant\n", - " \"./microwakeword\", # local clone\n", - " \"./microWakeWord\", # camel case\n", - "]\n", - "\n", - "for base in candidates:\n", - " if os.path.isdir(base):\n", - " # add the repo root\n", - " sys.path.insert(0, base)\n", - " # add the actual package dir inside the repo\n", - " if os.path.isdir(os.path.join(base, \"microwakeword\")):\n", - " sys.path.insert(0, os.path.join(base, \"microwakeword\"))\n", - " break\n", - "from microwakeword.audio.augmentation import Augmentation\n", - "from microwakeword.audio.clips import Clips\n", - "from microwakeword.audio.spectrograms import SpectrogramGeneration\n", - "\n", - "def validate_directories(paths):\n", - " for path in paths:\n", - " if not os.path.exists(path):\n", - " print(f\"Error: Directory {path} does not exist. 
Please ensure preprocessing is complete.\")\n", - " return False\n", - " return True\n", - "\n", - "# Paths to augmented data\n", - "impulse_paths = ['mit_rirs']\n", - "background_paths = ['fma_16k', 'audioset_16k']\n", - "\n", - "if not validate_directories(impulse_paths + background_paths):\n", - " raise ValueError(\"One or more required directories are missing.\")\n", - "\n", - "# Process TTS generated samples (default)\n", - "clips_tts = Clips(\n", - " input_directory='./generated_samples',\n", - " file_pattern='*.wav',\n", - " max_clip_duration_s=5,\n", - " remove_silence=True,\n", - " random_split_seed=10,\n", - " split_count=0.1,\n", - ")\n", - "\n", - "# Process personal recordings if available (optional)\n", - "clips_personal = None\n", - "if os.path.exists(\"./personal_samples\") and any(Path(\"./personal_samples\").glob(\"*.wav\")):\n", - " clips_personal = Clips(\n", - " input_directory=\"./personal_samples\",\n", - " file_pattern=\"*.wav\",\n", - " max_clip_duration_s=5,\n", - " remove_silence=True,\n", - " random_split_seed=10,\n", - " split_count=0.1,\n", - " )\n", - " print(\"āœ… Found personal samples, will create separate feature set\")\n", - "\n", - "augmenter = Augmentation(\n", - " augmentation_duration_s=3.2,\n", - " augmentation_probabilities={\n", - " \"SevenBandParametricEQ\": 0.1,\n", - " \"TanhDistortion\": 0.05,\n", - " \"PitchShift\": 0.15,\n", - " \"BandStopFilter\": 0.1,\n", - " \"AddColorNoise\": 0.1,\n", - " \"AddBackgroundNoise\": 0.7,\n", - " \"Gain\": 0.8,\n", - " \"RIR\": 0.7,\n", - " },\n", - " impulse_paths=impulse_paths,\n", - " background_paths=background_paths,\n", - " background_min_snr_db=5,\n", - " background_max_snr_db=10,\n", - " min_jitter_s=0.2,\n", - " max_jitter_s=0.3,\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "V5UsJfKKD1k9" - }, - "outputs": [], - "source": [ - "# Augment a random generated-sample WAV and play it back (pass ndarray to augmenter)\n", - "from 
pathlib import Path\n", - "from IPython.display import Audio, display\n", - "import numpy as np\n", - "import soundfile as sf\n", - "import librosa, random, glob\n", - "\n", - "output_dir = Path(\"./augmented_clips\")\n", - "output_dir.mkdir(exist_ok=True)\n", - "\n", - "# 1) Pick a random WAV from the Piper outputs\n", - "candidates = glob.glob(\"generated_samples/*.wav\")\n", - "if not candidates:\n", - " raise SystemExit(\"No files in generated_samples/. Run the TTS sample cell first.\")\n", - "src_path = random.choice(candidates)\n", - "\n", - "# 2) Load as 16 kHz mono float32\n", - "y, sr = librosa.load(src_path, sr=16000, mono=True)\n", - "y = y.astype(np.float32, copy=False)\n", - "\n", - "# 3) Augment — microwakeword Augmentation expects a 1-D numpy array\n", - "try:\n", - " y_aug = augmenter.augment_clip(y)\n", - "except Exception as e:\n", - " # some versions accept (samples, sr) — try that as a fallback\n", - " try:\n", - " y_aug = augmenter.augment_clip((y, sr))\n", - " except Exception:\n", - " raise\n", - "\n", - "# 4) Save and play\n", - "out_path = output_dir / \"augmented_clip.wav\"\n", - "sf.write(str(out_path), y_aug.astype(np.float32, copy=False), sr, subtype=\"PCM_16\")\n", - "print(f\"Augmented clip saved to {out_path}\")\n", - "display(Audio(str(out_path), autoplay=True))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "D7BHcY1mEGbK" - }, - "outputs": [], - "source": [ - "# Augment samples and save the training, validation, and testing sets.\n", - "# This version avoids datasets.Audio entirely by driving Clips from local WAVs.\n", - "\n", - "import os, glob, random\n", - "from pathlib import Path\n", - "import types\n", - "import numpy as np\n", - "import librosa\n", - "from mmap_ninja.ragged import RaggedMmap\n", - "from microwakeword.audio.spectrograms import SpectrogramGeneration\n", - "\n", - "# ---- Patch: drive clips from generated_samples/*.wav (no datasets.Audio, no torchcodec) ----\n", - "def 
audio_generator_from_wavs(self, split=\"train\", repeat=1, source_dir=\"generated_samples\"):\n", - " \"\"\"\n", - " Yield 1-D float32 arrays loaded via librosa from source_dir/*.wav.\n", - " Deterministic 80/10/10 split with seed 10 to mirror original Clips behavior.\n", - " \"\"\"\n", - " files = sorted(glob.glob(f\"{source_dir}/*.wav\"))\n", - " if not files:\n", - " raise SystemExit(f\"āŒ No WAVs in {source_dir}/. Generate samples first.\")\n", - "\n", - " rng = random.Random(10) # deterministic shuffling like Clips(random_split_seed=10)\n", - " files_shuf = files[:]\n", - " rng.shuffle(files_shuf)\n", - "\n", - " n = len(files_shuf)\n", - " n_val = max(1, int(0.10 * n))\n", - " n_test = max(1, int(0.10 * n))\n", - " n_train = max(0, n - n_val - n_test)\n", - " splits = {\n", - " \"train\": files_shuf[:n_train],\n", - " \"validation\": files_shuf[n_train:n_train + n_val],\n", - " \"test\": files_shuf[n_train + n_val:],\n", - " }\n", - " file_list = splits.get(split, [])\n", - " if not file_list:\n", - " return # nothing to yield\n", - "\n", - " for _ in range(max(1, int(repeat))):\n", - " for p in file_list:\n", - " y, sr = librosa.load(p, sr=16000, mono=True)\n", - " yield y.astype(np.float32, copy=False)\n", - "\n", - "# Bind the patched generator to clips_tts instance\n", - "def audio_generator_tts(self, split=\"train\", repeat=1):\n", - " return audio_generator_from_wavs(self, split, repeat, \"generated_samples\")\n", - "\n", - "clips_tts.audio_generator = types.MethodType(audio_generator_tts, clips_tts)\n", - "print(\"āœ… Patched clips_tts.audio_generator to stream from generated_samples/*.wav (no torchcodec).\")\n", - "\n", - "# Bind the patched generator to clips_personal if it exists\n", - "if clips_personal is not None:\n", - " def audio_generator_personal(self, split=\"train\", repeat=1):\n", - " return audio_generator_from_wavs(self, split, repeat, \"personal_samples\")\n", - " clips_personal.audio_generator = 
types.MethodType(audio_generator_personal, clips_personal)\n", - " print(\"āœ… Patched clips_personal.audio_generator to stream from personal_samples/*.wav (no torchcodec).\")\n", - "\n", - "# ---- Validate augmentation asset folders exist ----\n", - "def validate(paths):\n", - " for p in paths:\n", - " if not Path(p).exists():\n", - " raise SystemExit(f\"āŒ Missing directory: {p}. Run dataset prep first.\")\n", - "\n", - "impulse_paths = [\"mit_rirs\"]\n", - "background_paths = [\"fma_16k\", \"audioset_16k\"]\n", - "validate(impulse_paths + background_paths)\n", - "\n", - "# ---- Output root ----\n", - "out_root = Path(\"generated_augmented_features\")\n", - "out_root.mkdir(exist_ok=True)\n", - "\n", - "# ---- Split config (same as before) ----\n", - "split_cfg = {\n", - " \"training\": {\"name\": \"train\", \"repetition\": 2, \"slide_frames\": 10},\n", - " \"validation\": {\"name\": \"validation\", \"repetition\": 1, \"slide_frames\": 10},\n", - " \"testing\": {\"name\": \"test\", \"repetition\": 1, \"slide_frames\": 1},\n", - "}\n", - "\n", - "# ---- Generate features for TTS samples ----\n", - "for split, cfg in split_cfg.items():\n", - " out_dir = out_root / split\n", - " out_dir.mkdir(parents=True, exist_ok=True)\n", - " print(f\"🧪 Processing {split} (TTS) …\")\n", - "\n", - " spectros = SpectrogramGeneration(\n", - " clips=clips_tts, # now backed by our WAV loader\n", - " augmenter=augmenter, # your existing augmenter\n", - " slide_frames=cfg[\"slide_frames\"],\n", - " step_ms=10,\n", - " )\n", - "\n", - " RaggedMmap.from_generator(\n", - " out_dir=str(out_dir / \"wakeword_mmap\"),\n", - " sample_generator=spectros.spectrogram_generator(\n", - " split=cfg[\"name\"], repeat=cfg[\"repetition\"]\n", - " ),\n", - " batch_size=100,\n", - " verbose=True,\n", - " )\n", - "\n", - "# ---- Generate features for personal samples if available ----\n", - "if clips_personal is not None:\n", - " out_root_personal = Path(\"personal_augmented_features\")\n", - " 
out_root_personal.mkdir(exist_ok=True)\n", - " for split, cfg in split_cfg.items():\n", - " out_dir = out_root_personal / split\n", - " out_dir.mkdir(parents=True, exist_ok=True)\n", - " print(f\"🧪 Processing {split} (personal) …\")\n", - " spectros = SpectrogramGeneration(\n", - " clips=clips_personal,\n", - " augmenter=augmenter,\n", - " slide_frames=cfg[\"slide_frames\"],\n", - " step_ms=10,\n", - " )\n", - " RaggedMmap.from_generator(\n", - " out_dir=str(out_dir / \"wakeword_mmap\"),\n", - " sample_generator=spectros.spectrogram_generator(split=cfg[\"name\"], repeat=cfg[\"repetition\"]),\n", - " batch_size=100,\n", - " verbose=True,\n", - " )\n", - "\n", - "print(\"āœ… Features ready (generated_augmented_features/*/wakeword_mmap)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1pGuJDPyp3ax" - }, - "outputs": [], - "source": [ - "# Downloads pre-generated spectrogram features (made for microWakeWord in\n", - "# particular) for various negative datasets. 
This can be slow!\n", - "\n", - "import os\n", - "import requests\n", - "import zipfile\n", - "from pathlib import Path\n", - "from tqdm import tqdm\n", - "\n", - "# Function to download a file with progress bar\n", - "def download_file(url, output_path):\n", - " response = requests.get(url, stream=True)\n", - " total_size = int(response.headers.get('content-length', 0))\n", - " with open(output_path, \"wb\") as f, tqdm(\n", - " desc=f\"Downloading {output_path.name}\",\n", - " total=total_size,\n", - " unit=\"B\",\n", - " unit_scale=True,\n", - " unit_divisor=1024,\n", - " ) as bar:\n", - " for chunk in response.iter_content(chunk_size=1024):\n", - " f.write(chunk)\n", - " bar.update(len(chunk))\n", - " print(f\"Downloaded: {output_path}\")\n", - "\n", - "# Function to extract ZIP files\n", - "def extract_zip(zip_path, extract_to):\n", - " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n", - " zip_ref.extractall(extract_to)\n", - " print(f\"Extracted: {zip_path} to {extract_to}\")\n", - "\n", - "# Directory for negative datasets\n", - "output_dir = Path('./negative_datasets')\n", - "output_dir.mkdir(exist_ok=True)\n", - "\n", - "# Negative dataset URLs\n", - "link_root = \"https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main/\"\n", - "filenames = ['dinner_party.zip', 'dinner_party_eval.zip', 'no_speech.zip', 'speech.zip']\n", - "\n", - "# Download and extract files\n", - "for fname in filenames:\n", - " link = link_root + fname\n", - " zip_path = output_dir / fname\n", - "\n", - " # Download only if the file doesn't already exist\n", - " if not zip_path.exists():\n", - " try:\n", - " download_file(link, zip_path)\n", - " except Exception as e:\n", - " print(f\"Error downloading {fname}: {e}\")\n", - " continue\n", - "\n", - " # Extract the ZIP file\n", - " try:\n", - " extract_zip(zip_path, output_dir)\n", - " except Exception as e:\n", - " print(f\"Error extracting {fname}: {e}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "id": "Ii1A14GsGVQT" - }, - "outputs": [], - "source": [ - "# --- Save a yaml config that controls the training process ---\n", - "\n", - "import os, sys, yaml\n", - "from pathlib import Path\n", - "\n", - "config = {}\n", - "\n", - "config[\"window_step_ms\"] = 10\n", - "config[\"train_dir\"] = \"trained_models/wakeword\"\n", - "\n", - "config[\"features\"] = [\n", - " {\"features_dir\":\"generated_augmented_features\",\"sampling_weight\":2.0,\"penalty_weight\":1.0,\"truth\":True,\"truncation_strategy\":\"truncate_start\",\"type\":\"mmap\"},\n", - " {\"features_dir\":\"negative_datasets/speech\",\"sampling_weight\":12.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n", - " {\"features_dir\":\"negative_datasets/dinner_party\",\"sampling_weight\":12.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n", - " {\"features_dir\":\"negative_datasets/no_speech\",\"sampling_weight\":5.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"random\",\"type\":\"mmap\"},\n", - " {\"features_dir\":\"negative_datasets/dinner_party_eval\",\"sampling_weight\":0.0,\"penalty_weight\":1.0,\"truth\":False,\"truncation_strategy\":\"split\",\"type\":\"mmap\"},\n", - "]\n", - "\n", - "# Add personal features if they exist\n", - "if os.path.exists(\"personal_augmented_features/training\"):\n", - " config[\"features\"].insert(1, {\"features_dir\": \"personal_augmented_features\", \"sampling_weight\": 3.0, \"penalty_weight\": 1.0, \"truth\": True, \"truncation_strategy\": \"truncate_start\", \"type\": \"mmap\"})\n", - " print(\"āœ… Added personal features with higher weight (3.0)\")\n", - "\n", - "config[\"training_steps\"] = [40000]\n", - "config[\"positive_class_weight\"] = [1]\n", - "config[\"negative_class_weight\"] = [20]\n", - "config[\"learning_rates\"] = [0.001]\n", - "\n", - "# Smaller batch to avoid GPU copy/alloc failures on 3070 laptop VRAM\n", - 
"config[\"batch_size\"] = 16\n", - "\n", - "# SpecAugment off (as before)\n", - "config[\"time_mask_max_size\"] = [0]\n", - "config[\"time_mask_count\"] = [0]\n", - "config[\"freq_mask_max_size\"] = [0]\n", - "config[\"freq_mask_count\"] = [0]\n", - "\n", - "config[\"eval_step_interval\"] = 500\n", - "config[\"clip_duration_ms\"] = 1500\n", - "config[\"target_minimization\"] = 0.9\n", - "config[\"minimization_metric\"] = None\n", - "config[\"maximization_metric\"] = \"average_viable_recall\"\n", - "\n", - "with open(\"training_parameters.yaml\", \"w\") as f:\n", - " yaml.dump(config, f)\n", - "\n", - "print(\"āœ… Wrote training_parameters.yaml (batch_size=16)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WoEXJBaiC9mf" - }, - "outputs": [], - "source": [ - "# Train + export with GPU first, then automatic CPU fallback on GPU/VRAM errors\n", - "# (LIVE streaming output + full log capture for error detection)\n", - "# NOTE: Suppress ONLY the noisy \"Validation Batch #...\" lines (everything else still streams)\n", - "import os, sys, subprocess, textwrap\n", - "\n", - "# ---- Common TF env (applies to BOTH attempts) ----\n", - "base_env = os.environ.copy()\n", - "base_env.setdefault(\"TF_CPP_MIN_LOG_LEVEL\", \"2\")\n", - "base_env.setdefault(\"TF_XLA_FLAGS\", \"--tf_xla_auto_jit=0\") # disable XLA JIT (more stable mem)\n", - "base_env.setdefault(\"NVIDIA_TF32_OVERRIDE\", \"1\") # allow TF32 (perf/VRAM win on Ampere+)\n", - "\n", - "# These only matter when a GPU is visible:\n", - "base_env.setdefault(\"TF_FORCE_GPU_ALLOW_GROWTH\", \"true\")\n", - "base_env.setdefault(\"TF_GPU_ALLOCATOR\", \"cuda_malloc_async\")\n", - "# Optional (uncomment if you want a smaller cuDNN workspace):\n", - "# base_env.setdefault(\"TF_CUDNN_WORKSPACE_LIMIT_IN_MB\", \"256\")\n", - "\n", - "# ---- Training argv (same as your runpy args) ----\n", - "train_args = [\n", - " \"-m\", \"microwakeword.model_train_eval\",\n", - " \"--training_config\", 
\"training_parameters.yaml\",\n", - " \"--train\", \"1\",\n", - " \"--restore_checkpoint\", \"1\",\n", - " \"--test_tf_nonstreaming\", \"0\",\n", - " \"--test_tflite_nonstreaming\", \"0\",\n", - " \"--test_tflite_nonstreaming_quantized\", \"0\",\n", - " \"--test_tflite_streaming\", \"0\",\n", - " \"--test_tflite_streaming_quantized\", \"1\",\n", - " \"--use_weights\", \"best_weights\",\n", - " \"mixednet\",\n", - " \"--pointwise_filters\", \"64,64,64,64\",\n", - " \"--repeat_in_block\", \"1,1,1,1\",\n", - " \"--mixconv_kernel_sizes\", \"[5], [7,11], [9,15], [23]\",\n", - " \"--residual_connection\", \"0,0,0,0\",\n", - " \"--first_conv_filters\", \"32\",\n", - " \"--first_conv_kernel_size\", \"5\",\n", - " \"--stride\", \"2\",\n", - "]\n", - "\n", - "OOM_MARKERS = (\n", - " \"resourceexhaustederror\",\n", - " \"resource exhausted\",\n", - " \"oom\",\n", - " \"out of memory\",\n", - " \"cuda_error_out_of_memory\",\n", - " \"cudnn\",\n", - " \"failed to allocate\",\n", - " \"blas xgemm\",\n", - " \"cublas\",\n", - " \"internalerror: cuda\",\n", - " \"failed call to cuinit\",\n", - ")\n", - "\n", - "class RunResult:\n", - " def __init__(self, returncode: int, stdout: str):\n", - " self.returncode = returncode\n", - " self.stdout = stdout\n", - "\n", - "def run_training(label: str, extra_env: dict) -> RunResult:\n", - " env = base_env.copy()\n", - " env.update(extra_env or {})\n", - "\n", - " print(f\"\\nšŸš€ {label}\")\n", - " print(\"→\", \" \".join([sys.executable] + train_args))\n", - "\n", - " proc = subprocess.Popen(\n", - " [sys.executable] + train_args,\n", - " env=env,\n", - " text=True,\n", - " stdout=subprocess.PIPE,\n", - " stderr=subprocess.STDOUT,\n", - " bufsize=1, # line-buffered (best effort)\n", - " universal_newlines=True,\n", - " )\n", - "\n", - " full_log = []\n", - " try:\n", - " # Stream lines live AND capture them for OOM detection / error messages\n", - " assert proc.stdout is not None\n", - " for line in proc.stdout:\n", - " 
full_log.append(line)\n", - "\n", - " # Hide ONLY the per-minibatch validation spam\n", - " if line.startswith(\"Validation Batch #\"):\n", - " continue\n", - "\n", - " # Everything else streams live\n", - " print(line, end=\"\")\n", - " finally:\n", - " returncode = proc.wait()\n", - "\n", - " return RunResult(returncode, \"\".join(full_log))\n", - "\n", - "# Attempt 1: GPU (normal visibility)\n", - "cp = run_training(\n", - " \"Attempt 1/2: GPU training (with allow_growth + cuda_malloc_async)\",\n", - " extra_env={}, # no override\n", - ")\n", - "\n", - "if cp.returncode == 0:\n", - " print(\"āœ… Training and testing complete (GPU path).\")\n", - "else:\n", - " out_l = (cp.stdout or \"\").lower()\n", - " looks_like_gpu_oom = any(m in out_l for m in OOM_MARKERS)\n", - "\n", - " if looks_like_gpu_oom:\n", - " # Attempt 2: CPU fallback (hide GPUs completely)\n", - " cp2 = run_training(\n", - " \"Attempt 2/2: CPU fallback (GPU hidden via CUDA_VISIBLE_DEVICES='')\",\n", - " extra_env={\n", - " \"CUDA_VISIBLE_DEVICES\": \"\", # hard-disable GPU\n", - " # (Optional) makes TF less chatty about GPU init on some builds:\n", - " \"TF_CPP_MIN_LOG_LEVEL\": \"2\",\n", - " },\n", - " )\n", - " if cp2.returncode == 0:\n", - " print(\"āœ… Training and testing complete (CPU fallback).\")\n", - " else:\n", - " raise RuntimeError(\n", - " \"Training failed on BOTH GPU and CPU.\\n\\n\"\n", - " + textwrap.indent(cp2.stdout or \"(no output)\", prefix=\" \")\n", - " )\n", - " else:\n", - " # Not an OOM-style failure: surface the original error\n", - " raise RuntimeError(\n", - " \"Training failed (does not look like a VRAM/OOM issue).\\n\\n\"\n", - " + textwrap.indent(cp.stdout or \"(no output)\", prefix=\" \")\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ex_UIWvwtjAN" - }, - "outputs": [], - "source": [ - "import shutil\n", - "import json\n", - "from IPython.display import display, HTML\n", - "\n", - "# Use the wake word from Cell 
3\n", - "wake_word = TARGET_WORD\n", - "\n", - "# --- Copy TFLite file to working dir with wake word name ---\n", - "source_path = \"trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite\"\n", - "tflite_filename = f\"{wake_word}.tflite\"\n", - "tflite_path = f\"./{tflite_filename}\"\n", - "shutil.copy(source_path, tflite_path)\n", - "\n", - "# --- Write JSON metadata file with matching model name ---\n", - "json_data = {\n", - " \"type\": \"micro\",\n", - " \"wake_word\": wake_word,\n", - " \"author\": \"Tater Totterson\",\n", - " \"website\": \"https://github.com/TaterTotterson/microWakeWord-Trainer-Nvidia-Docker.git\",\n", - " \"model\": tflite_filename,\n", - " \"trained_languages\": [\"en\"],\n", - " \"version\": 2,\n", - " \"micro\": {\n", - " \"probability_cutoff\": 0.97,\n", - " \"sliding_window_size\": 5,\n", - " \"feature_step_size\": 10,\n", - " \"tensor_arena_size\": 30000,\n", - " \"minimum_esphome_version\": \"2024.7.0\"\n", - " }\n", - "}\n", - "json_filename = f\"{wake_word}.json\"\n", - "json_path = f\"./{json_filename}\"\n", - "with open(json_path, \"w\") as json_file:\n", - " json.dump(json_data, json_file, indent=2)\n", - "\n", - "# --- Display nice download links ---\n", - "html = f\"\"\"\n", - "

Download your files:

\n", - "\n", - "\"\"\"\n", - "display(HTML(html))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/mmw.png b/mmw.png deleted file mode 100644 index ba38f86..0000000 Binary files a/mmw.png and /dev/null differ diff --git a/recorder_server.py b/recorder_server.py new file mode 100644 index 0000000..e469104 --- /dev/null +++ b/recorder_server.py @@ -0,0 +1,593 @@ +# recorder_server.py +import os +import re +import subprocess +import threading +from pathlib import Path +from typing import Dict, Any, List, Optional, Tuple + +from fastapi import FastAPI, UploadFile, File, Form, Query +from fastapi.responses import HTMLResponse, JSONResponse +from fastapi.staticfiles import StaticFiles + +ROOT_DIR = Path(__file__).resolve().parent + +# In Docker CLI world, DATA_DIR should be /data +DATA_DIR = Path(os.environ.get("DATA_DIR", "/data")).resolve() + +# UI files live next to this script by default +STATIC_DIR = Path(os.environ.get("STATIC_DIR", str(ROOT_DIR / "static"))).resolve() + +# Personal samples MUST land in /data/personal_samples for your CLI pipeline +PERSONAL_DIR = Path(os.environ.get("PERSONAL_DIR", str(DATA_DIR / "personal_samples"))).resolve() + +# CLI folder inside repo +CLI_DIR = Path(os.environ.get("CLI_DIR", str(ROOT_DIR / "cli"))).resolve() + +# If you want cleanup defaults for auto dataset setup, set these env vars: +# REC_DATASET_CLEANUP_ARCHIVES=true/false +# 
REC_DATASET_CLEANUP_INTERMEDIATE_FILES=true/false +DATASET_CLEANUP_ARCHIVES = os.environ.get("REC_DATASET_CLEANUP_ARCHIVES", "false").lower() in ("1", "true", "yes", "y") +DATASET_CLEANUP_INTERMEDIATE = os.environ.get("REC_DATASET_CLEANUP_INTERMEDIATE_FILES", "false").lower() in ("1", "true", "yes", "y") + +# We want "Start training" to trigger your CLI entrypoint, using the existing venv +# (train_wake_word should be in /data/.venv/bin via setup_python_venv) +TRAIN_CMD = os.environ.get( + "TRAIN_CMD", + f"source '{DATA_DIR}/.venv/bin/activate' && train_wake_word --data-dir '{DATA_DIR}'" +) + +TAKES_PER_SPEAKER_DEFAULT = int(os.environ.get("REC_TAKES_PER_SPEAKER", "10")) +SPEAKERS_TOTAL_DEFAULT = int(os.environ.get("REC_SPEAKERS_TOTAL", "1")) + +# How many lines to show in WebUI (tail) +TRAIN_LOG_TAIL_LINES = int(os.environ.get("REC_TRAIN_LOG_TAIL_LINES", "400")) +# If you prefer bytes-based tailing (fast), keep this non-zero. +TRAIN_LOG_MAX_BYTES = int(os.environ.get("REC_TRAIN_LOG_MAX_BYTES", str(512 * 1024))) # 512KB + +app = FastAPI(title="microWakeWord Personal Recorder") + +# Serve static UI +STATIC_DIR.mkdir(parents=True, exist_ok=True) +app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static") + + +def safe_name(raw: str) -> str: + s = (raw or "").strip().lower() + s = re.sub(r"\s+", "_", s) + s = re.sub(r"[^a-z0-9_]+", "", s) + s = re.sub(r"^_+|_+$", "", s) + return s or "wakeword" + + +# -------------------- In-memory session state -------------------- +STATE: Dict[str, Any] = { + "raw_phrase": None, + "safe_word": None, + + "speakers_total": SPEAKERS_TOTAL_DEFAULT, + "takes_per_speaker": TAKES_PER_SPEAKER_DEFAULT, + + "takes_received": 0, + "takes": [], + + "training": { + "running": False, + "exit_code": None, + "log_lines": [], # legacy in-memory tail (still maintained) + "log_path": None, # path to recorder_training.log + "safe_word": None, + + # NEW: byte offset for efficient log tailing + "log_offset": 0, + }, +} + +STATE_LOCK = 
threading.Lock() + + +def _reset_personal_samples_dir(): + PERSONAL_DIR.mkdir(parents=True, exist_ok=True) + for p in PERSONAL_DIR.glob("*.wav"): + try: + p.unlink() + except Exception: + pass + + +def _append_train_log(line: str): + line = (line or "").rstrip("\n") + with STATE_LOCK: + buf: List[str] = STATE["training"]["log_lines"] + buf.append(line) + if len(buf) > 250: + del buf[: (len(buf) - 250)] + + +def _title_from_phrase(raw_phrase: str) -> str: + # Keep it human-friendly for the optional argument + s = re.sub(r"[^a-zA-Z0-9 ]+", " ", raw_phrase or "").strip() + s = re.sub(r"\s+", " ", s) + return s.title() if s else "" + + +def _run_streamed( + cmd: List[str], + cwd: Path, + log_path: Path, + header: Optional[str] = None, + env: Optional[Dict[str, str]] = None, +) -> int: + """ + Run a command streaming stdout/stderr to both: + - recorder_training.log (disk) + - STATE["training"]["log_lines"] (UI) [best-effort] + Returns process exit code. + """ + if header: + _append_train_log(header) + + _append_train_log("→ " + " ".join(cmd)) + + with open(log_path, "a", encoding="utf-8") as lf: + lf.write("\n" + ("=" * 80) + "\n") + if header: + lf.write(header + "\n") + lf.write("→ " + " ".join(cmd) + "\n") + lf.flush() + + proc = subprocess.Popen( + cmd, + cwd=str(cwd), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + ) + + assert proc.stdout is not None + for line in proc.stdout: + lf.write(line) + lf.flush() + _append_train_log(line) + + return proc.wait() + + +def _ensure_training_venv(log_path: Path) -> None: + """ + Ensure /data/.venv exists by running cli/setup_python_venv if needed. 
+ """ + activate = DATA_DIR / ".venv" / "bin" / "activate" + if activate.exists(): + _append_train_log("āœ… Training venv found (skipping setup_python_venv)") + return + + setup = CLI_DIR / "setup_python_venv" + if not setup.exists(): + raise RuntimeError(f"Missing setup_python_venv at: {setup}") + + rc = _run_streamed( + ["bash", "-lc", f"cd '{DATA_DIR}' && '{setup}' --data-dir='{DATA_DIR}'"], + cwd=DATA_DIR, + log_path=log_path, + header="===== Ensuring Python venv (/data/.venv) =====", + ) + + if rc != 0: + raise RuntimeError(f"setup_python_venv failed (exit_code={rc})") + + if not activate.exists(): + raise RuntimeError(f"setup_python_venv finished, but {activate} is still missing") + + +def _ensure_training_datasets(log_path: Path) -> None: + """ + Always run setup_training_datasets before training. + The underlying scripts should skip work when already done. + """ + setup = CLI_DIR / "setup_training_datasets" + if not setup.exists(): + raise RuntimeError(f"Missing setup_training_datasets at: {setup}") + + cleanup_arch = "true" if DATASET_CLEANUP_ARCHIVES else "false" + cleanup_inter = "true" if DATASET_CLEANUP_INTERMEDIATE else "false" + + cmd = [ + "bash", + "-lc", + ( + f"cd '{DATA_DIR}' && " + f"'{setup}' " + f"--cleanup-archives='{cleanup_arch}' " + f"--cleanup-intermediate-files='{cleanup_inter}' " + f"--data-dir='{DATA_DIR}'" + ), + ] + + rc = _run_streamed( + cmd, + cwd=DATA_DIR, + log_path=log_path, + header="===== Ensuring training datasets (setup_training_datasets) =====", + ) + + if rc != 0: + raise RuntimeError(f"setup_training_datasets failed (exit_code={rc})") + + +def _read_log_tail_by_bytes(log_path: Path, max_bytes: int) -> str: + """ + Read up to the last max_bytes from a file (UTF-8 best effort). 
+ """ + if not log_path.exists(): + return "" + + try: + size = log_path.stat().st_size + start = max(0, size - max_bytes) + with open(log_path, "rb") as f: + f.seek(start) + data = f.read() + # If we started in the middle of a line, it's ok; UI will show partial. + return data.decode("utf-8", errors="replace") + except Exception: + return "" + + +def _read_log_tail_by_lines(log_path: Path, max_lines: int) -> str: + """ + Read last N lines of a file (simple, may be slower on huge files). + """ + if not log_path.exists(): + return "" + try: + # Read by bytes limited first, then line-tail + raw = _read_log_tail_by_bytes(log_path, TRAIN_LOG_MAX_BYTES) + if not raw: + return "" + lines = raw.splitlines() + if len(lines) <= max_lines: + return "\n".join(lines) + return "\n".join(lines[-max_lines:]) + except Exception: + return "" + + +def _read_log_since_offset(log_path: Path, offset: int, max_bytes: int = 256 * 1024) -> Tuple[str, int]: + """ + Read log file incrementally starting from `offset`. + Returns (new_text, new_offset). Caps bytes read per call. 
+ """ + if not log_path.exists(): + return ("", offset) + + try: + size = log_path.stat().st_size + # If file rotated/truncated, reset offset + if offset > size: + offset = 0 + + with open(log_path, "rb") as f: + f.seek(offset) + data = f.read(max_bytes) + + new_offset = offset + len(data) + text = data.decode("utf-8", errors="replace") + return (text, new_offset) + except Exception: + return ("", offset) + + +def _run_training_background(safe_word: str, allow_no_personal: bool): + with STATE_LOCK: + raw_phrase = STATE.get("raw_phrase") or "" + + wake_word_title = _title_from_phrase(raw_phrase) + + with STATE_LOCK: + STATE["training"]["running"] = True + STATE["training"]["exit_code"] = None + STATE["training"]["log_lines"] = [] + STATE["training"]["safe_word"] = safe_word + log_path = Path(str(DATA_DIR / "recorder_training.log")) + STATE["training"]["log_path"] = str(log_path) + STATE["training"]["log_offset"] = 0 + + # fresh header at the start of a run + _append_train_log("================================================================================") + _append_train_log("===== Recorder Training Run =====") + _append_train_log("================================================================================") + + # Ensure the log exists and starts cleanly with a header separator for this run + try: + with open(log_path, "a", encoding="utf-8") as lf: + lf.write("\n" + ("=" * 80) + "\n") + lf.write("===== Recorder Training Run =====\n") + lf.write(("=" * 80) + "\n") + lf.flush() + except Exception: + pass + + try: + # 1) Ensure venv (auto-installs) + _ensure_training_venv(log_path) + + # 2) Ensure datasets (auto-installs / skips if already present) + _ensure_training_datasets(log_path) + + # 3) Run training + if wake_word_title: + cmd_str = f"{TRAIN_CMD} '{safe_word}' '{wake_word_title}'" + else: + cmd_str = f"{TRAIN_CMD} '{safe_word}'" + + env = os.environ.copy() + env["MWW_ALLOW_NO_PERSONAL"] = "true" if allow_no_personal else "false" + + 
_append_train_log("===== Training (train_wake_word) =====") + _append_train_log(f"→ Running: {cmd_str}") + + with open(log_path, "a", encoding="utf-8") as lf: + proc = subprocess.Popen( + ["bash", "-lc", cmd_str], + cwd=str(DATA_DIR), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + ) + assert proc.stdout is not None + for line in proc.stdout: + lf.write(line) + lf.flush() + _append_train_log(line) + + rc = proc.wait() + + _append_train_log(f"āœ“ Training finished (exit_code={rc})") + with STATE_LOCK: + STATE["training"]["exit_code"] = rc + + except Exception as e: + _append_train_log(f"āœ— Training crashed: {e!r}") + with STATE_LOCK: + STATE["training"]["exit_code"] = 999 + + finally: + with STATE_LOCK: + STATE["training"]["running"] = False + + +# -------------------- Routes -------------------- +@app.get("/", response_class=HTMLResponse) +def index(): + html_path = STATIC_DIR / "index.html" + if not html_path.exists(): + return HTMLResponse( + "

Missing UI

Create static/index.html.

", + status_code=500, + ) + return HTMLResponse(html_path.read_text(encoding="utf-8")) + + +@app.post("/api/start_session") +def start_session(payload: Dict[str, Any]): + raw = (payload.get("phrase") or "").strip() + if not raw: + return JSONResponse({"ok": False, "error": "phrase is required"}, status_code=400) + + safe = safe_name(raw) + + speakers_total = int(payload.get("speakers_total") or SPEAKERS_TOTAL_DEFAULT) + takes_per_speaker = int(payload.get("takes_per_speaker") or TAKES_PER_SPEAKER_DEFAULT) + + speakers_total = max(1, min(10, speakers_total)) + takes_per_speaker = max(1, min(50, takes_per_speaker)) + + with STATE_LOCK: + STATE["raw_phrase"] = raw + STATE["safe_word"] = safe + STATE["speakers_total"] = speakers_total + STATE["takes_per_speaker"] = takes_per_speaker + STATE["takes_received"] = 0 + STATE["takes"] = [] + # do not interrupt training if running + + _reset_personal_samples_dir() + + return { + "ok": True, + "raw_phrase": raw, + "safe_word": safe, + "speakers_total": speakers_total, + "takes_per_speaker": takes_per_speaker, + "takes_total": speakers_total * takes_per_speaker, + "personal_dir": str(PERSONAL_DIR), + "data_dir": str(DATA_DIR), + } + + +@app.get("/api/session") +def get_session(): + with STATE_LOCK: + return { + "ok": True, + "raw_phrase": STATE["raw_phrase"], + "safe_word": STATE["safe_word"], + "speakers_total": STATE["speakers_total"], + "takes_per_speaker": STATE["takes_per_speaker"], + "takes_received": STATE["takes_received"], + "takes": list(STATE["takes"]), + "training": dict(STATE["training"]), + } + + +@app.post("/api/upload_take") +async def upload_take( + speaker_index: int = Form(...), + take_index: int = Form(...), + file: UploadFile = File(...), +): + with STATE_LOCK: + safe_word = STATE["safe_word"] + speakers_total = int(STATE["speakers_total"]) + takes_per_speaker = int(STATE["takes_per_speaker"]) + + if not safe_word: + return JSONResponse({"ok": False, "error": "No active session. 
Call /api/start_session first."}, status_code=400) + + if speaker_index < 1 or speaker_index > speakers_total: + return JSONResponse({"ok": False, "error": f"speaker_index must be 1..{speakers_total}"}, status_code=400) + + if take_index < 1 or take_index > takes_per_speaker: + return JSONResponse({"ok": False, "error": f"take_index must be 1..{takes_per_speaker}"}, status_code=400) + + PERSONAL_DIR.mkdir(parents=True, exist_ok=True) + + out_name = f"speaker{speaker_index:02d}_take{take_index:02d}.wav" + out_path = PERSONAL_DIR / out_name + + data = await file.read() + if not data or len(data) < 44: + return JSONResponse({"ok": False, "error": "Empty/invalid file"}, status_code=400) + + out_path.write_bytes(data) + + with STATE_LOCK: + if out_name not in STATE["takes"]: + STATE["takes"].append(out_name) + STATE["takes_received"] = len(STATE["takes"]) + + return {"ok": True, "saved_as": out_name, "takes_received": STATE["takes_received"]} + + +@app.post("/api/train") +def train_now(payload: Dict[str, Any] = None): + payload = payload or {} + allow_no_personal = bool(payload.get("allow_no_personal", False)) + + with STATE_LOCK: + safe_word = STATE["safe_word"] + takes_received = int(STATE["takes_received"]) + speakers_total = int(STATE["speakers_total"]) + takes_per_speaker = int(STATE["takes_per_speaker"]) + training_running = bool(STATE["training"]["running"]) + + takes_total = speakers_total * takes_per_speaker + + if training_running: + return JSONResponse({"ok": False, "error": "Training already running"}, status_code=400) + + if not safe_word: + return JSONResponse({"ok": False, "error": "No active session"}, status_code=400) + + min_required = max(1, min(3, takes_total)) + + if takes_received == 0 and not allow_no_personal: + return JSONResponse( + { + "ok": False, + "error": f"No personal voice samples recorded (0/{takes_total}).", + "code": "NO_PERSONAL_SAMPLES", + "message": "You can train without personal voices, or record samples first.", + "takes_total": 
takes_total, + }, + status_code=400, + ) + + if 0 < takes_received < min_required: + return JSONResponse( + { + "ok": False, + "error": f"Not enough takes yet ({takes_received}/{takes_total}).", + "code": "NOT_ENOUGH_TAKES", + "min_required": min_required, + "takes_total": takes_total, + }, + status_code=400, + ) + + t = threading.Thread(target=_run_training_background, args=(safe_word, allow_no_personal), daemon=True) + t.start() + + return { + "ok": True, + "started": True, + "safe_word": safe_word, + "personal_samples_used": takes_received >= min_required, + "allow_no_personal": allow_no_personal, + } + + +@app.get("/api/train_status") +def train_status( + offset: int = Query(0, ge=0), + max_bytes: int = Query(65536, ge=1024, le=262144), + last_size: int = Query(0, ge=0), + last_mtime: float = Query(0.0, ge=0.0), +): + """ + Stream training output from the log file on disk. + + Robust to log overwrite/truncation: + - UI passes offset + last_size + last_mtime + - If file shrinks or mtime goes backwards/changes weirdly, reset offset to 0 + """ + with STATE_LOCK: + tr = dict(STATE["training"]) + log_path_str = tr.get("log_path") + + log_text = "" + next_offset = offset + log_size = 0 + log_mtime = 0.0 + + if log_path_str: + p = Path(log_path_str) + if p.exists(): + try: + st = p.stat() + log_size = int(st.st_size) + log_mtime = float(st.st_mtime) + + # Detect overwrite/truncate/reset: + # - file shrank + # - file mtime moved "backwards" (rare) or changed while size reset + # If anything indicates a reset, restart from beginning. 
+ if (log_size < last_size) or (last_mtime and log_mtime < last_mtime): + offset = 0 + + # Clamp offset to current file size + if offset > log_size: + offset = log_size + + # Read incrementally from the file + with p.open("rb") as f: + f.seek(offset) + chunk = f.read(max_bytes) + + log_text = chunk.decode("utf-8", errors="replace") + next_offset = offset + len(chunk) + + except Exception as e: + log_text = f"\n[log read error: {e!r}]\n" + next_offset = offset + + tr["log_text"] = log_text + tr["next_offset"] = next_offset + tr["log_size"] = log_size + tr["log_mtime"] = log_mtime + + return {"ok": True, "training": tr} + + +@app.post("/api/reset_recordings") +def reset_recordings(): + _reset_personal_samples_dir() + with STATE_LOCK: + STATE["takes_received"] = 0 + STATE["takes"] = [] + return {"ok": True} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 438e424..a0e801b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,10 @@ -# --- Core training (Microwakeword) --- +# --- Packages needed by our scripts --- numpy==1.26.4 scipy==1.12.0 librosa==0.10.2.post1 soundfile==0.12.1 -soxr==0.5.0.post1 -audiomentations==0.38.0 -webrtcvad==2.0.10 tqdm==4.67.1 scikit-learn==1.6.0 -numba==0.60.0 -joblib==1.4.2 -pandas==2.2.3 -pymicro_features @ git+https://github.com/puddly/pymicro-features@e1d3f88183e12bb8af2df9e399ea157af7393762 -audio-metadata @ git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f -bitstruct==8.19.0 - -# --- Piper sample generation --- -piper-tts>=1.2.0 -piper-phonemize-cross==1.2.1 - -# --- Notebook / tooling --- -ipykernel==6.29.5 -jupyterlab==4.3.4 -ipywidgets==8.1.5 -matplotlib-inline==0.1.7 -rich==13.9.4 +numba==0.63.1 +PyYAML==6.0.3 diff --git a/run_recorder.sh b/run_recorder.sh new file mode 100644 index 0000000..9ac94c5 --- /dev/null +++ b/run_recorder.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOTDIR="$(dirname "$(realpath "$0")")" + +# 
Training convention +DATA_DIR="${DATA_DIR:-/data}" +HOST="${REC_HOST:-0.0.0.0}" +PORT="${REC_PORT:-8888}" + +# Keep recorder deps separate from training venv +VENV_DIR="${DATA_DIR}/.recorder-venv" +PY="${VENV_DIR}/bin/python" +PIP="${PY} -m pip" +PIN_FILE="${VENV_DIR}/.pinned_installed" + +FASTAPI_VERSION="${REC_FASTAPI_VERSION:-0.115.6}" +UVICORN_VERSION="${REC_UVICORN_VERSION:-0.30.6}" +PY_MULTIPART_VERSION="${REC_PY_MULTIPART_VERSION:-0.0.9}" + +echo "microWakeWord Recorder (Docker)" +echo "-> ROOTDIR: ${ROOTDIR}" +echo "-> DATA_DIR: ${DATA_DIR}" +echo "-> URL: http://localhost:${PORT}/" + +mkdir -p "${DATA_DIR}" + +# ----------------------------- +# Recorder venv (separate) +# ----------------------------- +if [[ ! -x "${PY}" ]]; then + echo "Creating recorder venv: ${VENV_DIR}" + python3 -m venv "${VENV_DIR}" +fi + +# shellcheck disable=SC1091 +source "${VENV_DIR}/bin/activate" + +if [[ ! -f "${PIN_FILE}" ]]; then + echo "Installing pinned recorder deps" + ${PIP} install -U pip setuptools wheel + ${PIP} install \ + "fastapi==${FASTAPI_VERSION}" \ + "uvicorn[standard]==${UVICORN_VERSION}" \ + "python-multipart==${PY_MULTIPART_VERSION}" + touch "${PIN_FILE}" +else + echo "Reusing existing recorder venv (no upgrades)" +fi + +# ----------------------------- +# Recorder server env +# ----------------------------- +export DATA_DIR="${DATA_DIR}" +export STATIC_DIR="${ROOTDIR}/static" +export PERSONAL_DIR="${DATA_DIR}/personal_samples" + +# IMPORTANT: leave training venv creation to /api/train inside recorder_server.py +# but still set TRAIN_CMD so the server knows how to invoke training once ready +export TRAIN_CMD="source '${DATA_DIR}/.venv/bin/activate' && train_wake_word --data-dir='${DATA_DIR}'" + +echo "Launching uvicorn on ${HOST}:${PORT}" +cd "${ROOTDIR}" +exec "${VENV_DIR}/bin/uvicorn" recorder_server:app --host "${HOST}" --port "${PORT}" \ No newline at end of file diff --git a/startup.sh b/startup.sh deleted file mode 100644 index bb4f3e2..0000000 --- 
a/startup.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -: "${NB_UID:=0}" -: "${NB_GID:=0}" -umask 002 - -NOTEBOOK_SRC="/root/microWakeWord_training_notebook.ipynb" -NOTEBOOK_DST="/data/microWakeWord_training_notebook.ipynb" - -mkdir -p /data /data/generated_samples /data/personal_samples - -if [[ ! -f "$NOTEBOOK_DST" ]]; then - echo "No training notebook found in /data; copying default…" - cp -n "$NOTEBOOK_SRC" "$NOTEBOOK_DST" -fi - -# Try to align ownership for convenience (ignore errors if not permitted) -if [[ "$NB_UID" != "0" || "$NB_GID" != "0" ]]; then - chown -R "$NB_UID:$NB_GID" /data || true -fi - -exec "$@" \ No newline at end of file diff --git a/static/index.html b/static/index.html new file mode 100644 index 0000000..cb77038 --- /dev/null +++ b/static/index.html @@ -0,0 +1,782 @@ + + + + + + microWakeWord Recorder + + + + +
+
+
+ +
+

šŸŽ™ļø microWakeWord Personal Recorder

+

Enter a wake word, test TTS pronunciation, then record takes. Recording starts when you speak and stops after silence.

+
+
+
+ +
+
+ + + + No session +
+ +
+ + + Speaker: - +
+ +
+ Advanced (if it’s too sensitive / not sensitive enough) +
+ + + +
+
+
+ +
+
+ + + + Idle +
+ +
+
+ Mic level +
+ +
+ +

+ Speaker: - / - + Waiting +

+ +

+ Take: 0 / 10 + Not recording +

+ +
+ +

Training log

+
(no training started)
+
+
+ + + + \ No newline at end of file diff --git a/cli/train_wake_word b/train_wake_word old mode 100755 new mode 100644 similarity index 83% rename from cli/train_wake_word rename to train_wake_word index b52adcf..4faed61 --- a/cli/train_wake_word +++ b/train_wake_word @@ -3,9 +3,10 @@ set -e PROGPATH=$(realpath "$0") PROGDIR=$(dirname "${PROGPATH}") +CLIDIR="${PROGDIR}/cli" KNOWN_ARGS=( samples batch-size training-steps data-dir cleanup-work-dir ) -source "${PROGDIR}/shell.functions" +source "${CLIDIR}/shell.functions" WAKE_WORD=${POSITIONAL_ARGS[0]} if [ ${#UNKNOWN_ARGS[@]} -gt 0 ] ; then @@ -62,7 +63,7 @@ fi printf "%-80s\n" "=" | tr ' ' "=" echo "===== Running '${WAKE_WORD}(${WAKE_WORD_TITLE})' generation, augmentation and training =====" -"${PROGDIR}/cudainfo" +"${CLIDIR}/cudainfo" echo START_TS=$EPOCHSECONDS @@ -75,17 +76,13 @@ export TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512 export GLOG_minloglevel=2 export GRPC_VERBOSITY=ERROR - -"${PROGDIR}/wake_word_sample_generator" \ +"${CLIDIR}/wake_word_sample_generator" \ --samples=${SAMPLES} \ --batch-size=${BATCH_SIZE} \ --data-dir="${DATA_DIR}" "${WAKE_WORD}" POST_GEN_TS=$EPOCHSECONDS -ww="${WAKE_WORD// /_}" -ww="${ww//./}" - AUGMENT=false GENERATED_DIR="${DATA_DIR}/work/wake_word_samples" AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented" @@ -96,7 +93,7 @@ AUGMENTED_DIR="${DATA_DIR}/work/wake_word_samples_augmented" if ${AUGMENT} ; then rm -rf "${AUGMENTED_DIR}" || : mkdir -p "${AUGMENTED_DIR}" || : - "${PROGDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" || { rm -rf "${AUGMENTED_DIR}" ; exit 1 ; } + "${CLIDIR}/wake_word_sample_augmenter" --data-dir="${DATA_DIR}" || { rm -rf "${AUGMENTED_DIR}" ; exit 1 ; } else echo "Augmentation not required" echo @@ -104,22 +101,30 @@ fi POST_AUGMENT_TS=$EPOCHSECONDS -"${PROGDIR}/wake_word_sample_trainer" --samples=${SAMPLES} --training-steps=${TRAINING_STEPS} --data-dir="${DATA_DIR}" \ - "${WAKE_WORD}" "${WAKE_WORD_TITLE}" +"${CLIDIR}/wake_word_sample_trainer" \ + 
--samples=${SAMPLES} \ + --training-steps=${TRAINING_STEPS} \ + --data-dir="${DATA_DIR}" \ + "${WAKE_WORD}" "${WAKE_WORD_TITLE}" if ${CLEANUP_WORK_DIR} ; then - rm -rf "${DATA_DIR}/work/trained_models" "${DATA_DIR}/work/wake_word_samples" \ - "${DATA_DIR}/work/wake_word_samples_augmented" "${DATA_DIR}/work/last_wake_word" || : + rm -rf \ + "${DATA_DIR}/work/trained_models" \ + "${DATA_DIR}/work/wake_word_samples" \ + "${DATA_DIR}/work/wake_word_samples_augmented" \ + "${DATA_DIR}/work/personal_augmented_features" \ + "${DATA_DIR}/work/last_wake_word" || : fi + END_TS=$EPOCHSECONDS python -c $'print(f"{\'=\' * 80}")' printf "%44s\n\n" "Training Summary" -"${PROGDIR}/system_summary" +"${CLIDIR}/system_summary" echo print_elapsed_time --no-separators "${START_TS}" "${POST_GEN_TS}" "Generate ${SAMPLES} samples, ${BATCH_SIZE}/batch" print_elapsed_time --no-separators "${POST_GEN_TS}" "${POST_AUGMENT_TS}" "Augment ${SAMPLES} samples" print_elapsed_time --no-separators "${POST_AUGMENT_TS}" "${END_TS}" "${TRAINING_STEPS} training steps" python -c $'msg="="*54 ; print(f"{msg:>80s}")' print_elapsed_time --no-separators "${START_TS}" "${END_TS}" "Total" -python -c $'print(f"{\'=\' * 80}")' +python -c $'print(f"{\'=\' * 80}")' \ No newline at end of file