Add VAD trimming and Docker publishing

This commit is contained in:
MasterPhooey
2026-05-16 00:32:05 -05:00
parent 134f607bef
commit 196ab8c0e7
5 changed files with 914 additions and 50 deletions

48
.github/workflows/docker-publish.yml vendored Normal file
View File

@@ -0,0 +1,48 @@
name: Publish Docker Image
on:
push:
branches:
- main
workflow_dispatch:
permissions:
contents: read
packages: write
concurrency:
group: docker-publish-${{ github.ref }}
cancel-in-progress: true
env:
REGISTRY: ghcr.io
IMAGE_NAME: tatertotterson/microwakeword-trainer-nvidia-docker
jobs:
docker:
name: Docker image
runs-on: ubuntu-latest
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to GHCR
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push image
uses: docker/build-push-action@v6
with:
context: .
file: dockerfile
platforms: linux/amd64
push: true
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
cache-from: type=gha,scope=mww-trainer-nvidia-docker
cache-to: type=gha,mode=max,scope=mww-trainer-nvidia-docker

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
personal_samples/*
data/
trim_history/
.DS_Store

66
run.sh
View File

@@ -26,6 +26,16 @@ echo "-> URL: http://localhost:${PORT}/"
mkdir -p "${DATA_DIR}"
# Install the trainer UI's Python dependencies with ${PIP}: exact pins for
# fastapi, uvicorn[standard], python-multipart and esphome (taken from the
# *_VERSION variables) plus minimum versions of silero-vad and numpy.
# NOTE(review): ${PIP} is expanded unquoted, so it may hold a multi-word
# command — confirm against where PIP is defined before quoting it.
install_ui_deps() {
${PIP} install \
"fastapi==${FASTAPI_VERSION}" \
"uvicorn[standard]==${UVICORN_VERSION}" \
"python-multipart==${PY_MULTIPART_VERSION}" \
"esphome==${ESPHOME_VERSION}" \
"silero-vad>=5.0.0" \
"numpy>=1.24.0"
}
# -----------------------------
# Trainer UI venv (separate)
# -----------------------------
@@ -40,32 +50,54 @@ source "${VENV_DIR}/bin/activate"
if [[ ! -f "${PIN_FILE}" ]]; then
echo "Installing pinned trainer UI deps"
${PIP} install -U pip setuptools wheel
${PIP} install \
"fastapi==${FASTAPI_VERSION}" \
"uvicorn[standard]==${UVICORN_VERSION}" \
"python-multipart==${PY_MULTIPART_VERSION}" \
"esphome==${ESPHOME_VERSION}"
install_ui_deps
touch "${PIN_FILE}"
else
echo "Reusing existing trainer UI venv (no upgrades)"
if ! "${PY}" - "${ESPHOME_VERSION}" <<'PY' >/dev/null 2>&1
import importlib.metadata
if ! "${PY}" - "${FASTAPI_VERSION}" "${UVICORN_VERSION}" "${PY_MULTIPART_VERSION}" "${ESPHOME_VERSION}" <<'PY' >/dev/null 2>&1
import importlib.metadata as md
import sys
expected = sys.argv[1]
installed = importlib.metadata.version("esphome")
raise SystemExit(0 if installed == expected else 1)
fastapi_version, uvicorn_version, multipart_version, esphome_version = sys.argv[1:5]
def version_tuple(value):
parts = []
for token in str(value).replace("-", ".").split("."):
if token.isdigit():
parts.append(int(token))
else:
digits = "".join(ch for ch in token if ch.isdigit())
if digits:
parts.append(int(digits))
break
return tuple(parts)
exact = {
"fastapi": fastapi_version,
"uvicorn": uvicorn_version,
"python-multipart": multipart_version,
"esphome": esphome_version,
}
minimum = {
"silero-vad": "5.0.0",
"numpy": "1.24.0",
}
present = ("torch", "zeroconf")
for package, expected in exact.items():
if md.version(package) != expected:
raise SystemExit(1)
for package, minimum_version in minimum.items():
if version_tuple(md.version(package)) < version_tuple(minimum_version):
raise SystemExit(1)
for package in present:
md.version(package)
PY
then
echo "Firmware tab dependencies missing or stale; installing ESPHome firmware dependencies"
${PIP} install \
"fastapi==${FASTAPI_VERSION}" \
"uvicorn[standard]==${UVICORN_VERSION}" \
"python-multipart==${PY_MULTIPART_VERSION}" \
"esphome==${ESPHOME_VERSION}"
echo "UI dependencies missing or stale; installing recorder dependencies"
install_ui_deps
fi
fi
# -----------------------------
# Trainer server env
# -----------------------------

View File

@@ -622,6 +622,67 @@
color: var(--orange2);
}
.paginationControls {
display: flex;
align-items: center;
justify-content: center;
gap: 16px;
padding: 12px 0;
}
.paginationControls .pageBtn {
padding: 6px 14px;
border-radius: 6px;
font-size: 13px;
background: rgba(255,255,255,0.06);
border: 1px solid rgba(255,255,255,0.12);
color: var(--text, #fff);
cursor: pointer;
}
.paginationControls .pageBtn:disabled {
opacity: 0.3;
cursor: default;
}
.paginationControls .pageInfo {
font-size: 13px;
color: var(--muted, #888);
}
.paginationControls .pageJump {
font-size: 13px;
color: var(--muted, #888);
display: flex;
align-items: center;
gap: 4px;
}
.paginationControls .pageInput {
width: 40px;
padding: 4px 6px;
font-size: 13px;
text-align: center;
background: rgba(255,255,255,0.06);
border: 1px solid rgba(255,255,255,0.12);
border-radius: 4px;
color: var(--text, #fff);
}
.paginationControls .pageJumpBtn {
padding: 4px 10px;
font-size: 13px;
background: rgba(255,255,255,0.1);
border: 1px solid rgba(255,255,255,0.15);
border-radius: 4px;
color: var(--text, #fff);
cursor: pointer;
}
.paginationControls .pageJumpBtn:hover {
background: rgba(255,255,255,0.18);
}
.tabs {
display: flex;
flex-wrap: wrap;
@@ -960,6 +1021,78 @@
100% { transform: translateY(0) scale(1) rotate(0deg); }
}
.trimOverlay {
position: fixed; inset: 0; padding: 22px;
display: flex; align-items: center; justify-content: center;
background: rgba(4,5,10,0.6); backdrop-filter: blur(10px);
opacity: 0; visibility: hidden; pointer-events: none;
transition: opacity 0.18s ease, visibility 0.18s ease;
z-index: 11000;
}
.trimOverlay.open { opacity: 1; visibility: visible; pointer-events: auto; }
.trimDialog {
width: min(960px, calc(100vw - 36px));
max-height: min(90vh, 900px);
display: grid; grid-template-rows: auto 1fr auto auto auto; gap: 12px;
padding: 18px; border-radius: 22px;
border: 1px solid rgba(255,255,255,0.12);
background: linear-gradient(180deg, rgba(17,20,28,0.82), rgba(8,10,16,0.94));
box-shadow: 0 28px 84px rgba(0,0,0,0.58);
backdrop-filter: blur(18px) saturate(1.12);
}
.trimHeader { display: flex; justify-content: space-between; align-items: flex-start; gap: 16px; }
.trimTitle { margin: 0; font-size: 18px; }
.trimHint { margin: 6px 0 0; font-size: 13px; color: var(--muted); }
.trimCanvasWrap {
position: relative; width: 100%; min-height: 120px;
border-radius: 14px; border: 1px solid rgba(255,255,255,0.08);
background: rgba(0,0,0,0.4); overflow: hidden;
}
.trimCanvas { width: 100%; height: 100%; display: block; }
.trimHandle {
position: absolute; top: 0; width: 48px; height: 100%;
cursor: ew-resize; pointer-events: auto; touch-action: none;
transform: translateX(-50%);
}
.trimHandle::after {
content: ''; position: absolute; top: 10%; bottom: 10%; left: 50%;
transform: translateX(-50%); width: 3px; border-radius: 2px;
background: var(--orange); box-shadow: 0 0 6px rgba(255,138,42,0.5);
}
.trimHandle::before {
content: ''; position: absolute; top: 50%; left: 50%;
transform: translate(-50%,-50%); width: 10px; height: 24px;
border-radius: 5px; border: 1px solid rgba(255,138,42,0.5);
background: rgba(255,138,42,0.15);
}
.trimTimeInfo {
display: flex; align-items: center; justify-content: center;
gap: 12px; font-size: 14px;
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
}
.trimSeparator { color: var(--muted); }
.trimVadInfo { display: flex; align-items: center; gap: 8px; font-size: 12px; }
.trimActions { display: flex; gap: 8px; flex-wrap: wrap; }
.trimActions button { flex: 1; min-width: 120px; }
.pill.trimBadge {
color: #89d4ff;
border-color: rgba(137,212,255,0.25);
background: rgba(137,212,255,0.08);
}
.trimBtn {
border-color: rgba(255,138,42,0.3);
background: rgba(255,138,42,0.1);
}
.trimBtn:hover {
border-color: rgba(255,138,42,0.5);
background: rgba(255,138,42,0.18);
}
@media (max-width: 720px) {
.wrap { padding: 18px 14px 30px; }
input[type="text"] { width: 100%; }
@@ -1033,6 +1166,15 @@
.trainFooter button {
width: 100%;
}
.trimOverlay { padding: 8px; }
.trimDialog {
width: 100%; height: 96vh;
padding: 14px; grid-template-rows: auto 1fr auto auto auto;
}
.trimCanvasWrap { min-height: 100px; }
.trimHeader { flex-direction: column; align-items: stretch; }
.trimActions { flex-direction: column; }
.trimActions button { width: 100%; min-width: unset; }
}
</style>
</head>
@@ -1218,7 +1360,7 @@
</div>
<section class="card studioPanel stack">
<div class="sampleLibraryHeader">
<div id="sampleLibraryHeader" class="sampleLibraryHeader">
<div class="studioPanelTitle">
<span class="studioStepBadge">1</span>
<div>
@@ -1241,6 +1383,7 @@
<div id="sampleLibraryList" class="capturedList">
<div class="emptyState">No samples saved yet.</div>
</div>
<div id="samplePagination"></div>
</section>
<section class="card studioPanel stack">
@@ -1414,6 +1557,40 @@
</div>
</div>
<div id="trimOverlay" class="trimOverlay" aria-hidden="true">
<div id="trimDialog" class="trimDialog" role="dialog" aria-modal="true">
<div class="trimHeader">
<div>
<h3 id="trimTitle" class="trimTitle">Trim Audio</h3>
<p id="trimHint" class="trimHint">Drag the handles to select a region, then save as a new sample.</p>
</div>
<button id="closeTrimBtn" type="button">Close</button>
</div>
<div class="trimCanvasWrap">
<canvas id="trimCanvas" class="trimCanvas"></canvas>
<div id="trimStartHandle" class="trimHandle" data-handle="start"></div>
<div id="trimEndHandle" class="trimHandle" data-handle="end"></div>
</div>
<div class="trimTimeInfo">
<span id="trimStartTime">0.00s</span>
<span class="trimSeparator">--</span>
<span id="trimEndTime">0.00s</span>
<span class="trimSeparator">|</span>
<span id="trimDuration">Duration: 0.00s</span>
</div>
<div id="trimVadInfo" class="trimVadInfo">
<span class="pill ok">VAD detected speech</span>
<span id="trimVadSegments" class="muted"></span>
</div>
<div class="trimActions">
<button id="trimPlayBtn" type="button">Play selection</button>
<button id="trimSelectFirstVadBtn" type="button">Select first VAD</button>
<button id="trimSaveBtn" class="primary" type="button">Save Trim</button>
<button id="trimCancelBtn" type="button">Cancel</button>
</div>
</div>
</div>
<script>
const $ = (id) => document.getElementById(id);
@@ -1423,7 +1600,7 @@
availableLanguages: [],
selectedFiles: [],
captured: { items: [], captured_count: 0, negative_count: 0, personal_count: 0 },
samples: { personal: [], negative: [], personal_count: 0, negative_count: 0, activeBucket: "personal" },
samples: { personal: [], negative: [], personal_count: 0, negative_count: 0, activeBucket: "personal", pages: { personal: 0, negative: 0 } },
firmware: { devices: [], templates: [], flashing: null, logLines: [], activeTemplateKey: "" },
uploadBusy: false,
reviewBusy: false,
@@ -1432,11 +1609,274 @@
firmwarePoller: null,
activeView: "trainer",
};
const SAMPLE_PAGE_SIZE = 50;
let firmwareProfileSaveTimer = null;
let firmwareProfileReloadTimer = null;
let wakeSoundPreviewAudio = null;
let wakeSoundPreviewButton = null;
// --- Trim Waveform Module ---
const TrimWaveform = {
audioBuffer: null,
duration: 0,
startRatio: 0,
endRatio: 1,
vadSegments: [],
isDragging: null,
async init(bucket, fileName) {
const audioUrl = `/api/audio/${encodeURIComponent(bucket)}/${encodeURIComponent(fileName)}`;
const resp = await fetch(audioUrl);
const arrayBuf = await resp.arrayBuffer();
const ctx = new (window.AudioContext || window.webkitAudioContext)();
this.audioBuffer = await ctx.decodeAudioData(arrayBuf);
this.duration = this.audioBuffer.duration;
ctx.close();
try {
const vadData = await api(
`/api/samples/${encodeURIComponent(bucket)}/${encodeURIComponent(fileName)}/vad`,
{ method: 'POST' }
);
this.vadSegments = vadData.segments || [];
} catch (e) {
console.warn('VAD failed:', e);
this.vadSegments = [];
}
this.startRatio = 0;
this.endRatio = 1;
if (this.vadSegments.length > 0) {
this.startRatio = this.vadSegments[0].start / this.duration;
this.endRatio = this.vadSegments[0].end / this.duration;
}
return { duration: this.duration, vadCount: this.vadSegments.length };
},
// Repaint the trim canvas from the current state: the whole waveform in a dim
// stroke, the selected span re-stroked in orange, a dark overlay outside the
// selection, one vertical marker at the start of each VAD segment, and finally
// the two drag handles repositioned. Does nothing until audio is loaded.
draw() {
const canvas = $('trimCanvas');
if (!canvas || !this.audioBuffer) return;
// Size the backing store to the CSS box times devicePixelRatio, then scale
// the context so everything below is expressed in CSS pixels.
const dpr = window.devicePixelRatio || 1;
const rect = canvas.getBoundingClientRect();
canvas.width = rect.width * dpr;
canvas.height = rect.height * dpr;
const ctx = canvas.getContext('2d');
ctx.scale(dpr, dpr);
const w = rect.width, h = rect.height, mid = h / 2;
ctx.clearRect(0, 0, w, h);
// Full waveform (dim)
// Only channel 0 is drawn. Each pixel column covers `step` consecutive
// samples and is rendered as a vertical line from their min to their max.
const data = this.audioBuffer.getChannelData(0);
const step = Math.max(1, Math.floor(data.length / w));
ctx.strokeStyle = 'rgba(255,255,255,0.12)';
ctx.lineWidth = 1;
ctx.beginPath();
for (let i = 0; i < w; i++) {
let mn = 1, mx = -1;
for (let j = 0; j < step; j++) {
// Reads past the end of the buffer yield undefined; treat them as silence.
const v = data[i * step + j] || 0;
if (v < mn) mn = v;
if (v > mx) mx = v;
}
ctx.moveTo(i, mid + mn * mid * 0.9);
ctx.lineTo(i, mid + mx * mid * 0.9);
}
ctx.stroke();
// Selection waveform (bright)
// Same min/max columns as above, restricted to the selected pixel range.
const selStartPx = this.startRatio * w;
const selEndPx = this.endRatio * w;
ctx.strokeStyle = 'rgba(255,138,42,0.7)';
ctx.lineWidth = 1.5;
ctx.beginPath();
for (let i = Math.floor(selStartPx); i <= Math.floor(selEndPx); i++) {
let mn = 1, mx = -1;
for (let j = 0; j < step; j++) {
const v = data[i * step + j] || 0;
if (v < mn) mn = v;
if (v > mx) mx = v;
}
ctx.moveTo(i, mid + mn * mid * 0.9);
ctx.lineTo(i, mid + mx * mid * 0.9);
}
ctx.stroke();
// Dim areas outside selection
ctx.fillStyle = 'rgba(0,0,0,0.45)';
ctx.fillRect(0, 0, selStartPx, h);
ctx.fillRect(selEndPx, 0, w - selEndPx, h);
// VAD segment markers (green lines)
// Only the start of each segment is marked; segment ends are not drawn.
this.vadSegments.forEach(seg => {
const x = (seg.start / this.duration) * w;
ctx.strokeStyle = 'rgba(57,212,160,0.4)';
ctx.lineWidth = 2;
ctx.beginPath();
ctx.moveTo(x, 0);
ctx.lineTo(x, h);
ctx.stroke();
});
// Update handle positions
$('trimStartHandle').style.left = (this.startRatio * 100) + '%';
$('trimEndHandle').style.left = (this.endRatio * 100) + '%';
},
// Selection edges converted from ratios (0..1 of the clip) to seconds.
getStartTime() { return this.startRatio * this.duration; },
getEndTime() { return this.endRatio * this.duration; },
// Move the start edge to `s` seconds. The value is clamped to lie between 0
// and 0.001 (ratio) before the CURRENT end edge, then the canvas and the
// time readout are refreshed.
setStartSeconds(s) {
this.startRatio = Math.max(0, Math.min(s / this.duration, this.endRatio - 0.001));
this.draw(); updateTrimTimeDisplay();
},
// Move the end edge to `s` seconds. The value is clamped to lie between
// 0.001 (ratio) after the CURRENT start edge and 1, then the canvas and the
// time readout are refreshed.
setEndSeconds(s) {
this.endRatio = Math.min(1, Math.max(s / this.duration, this.startRatio + 0.001));
this.draw(); updateTrimTimeDisplay();
},
playSelection() {
if (!this.audioBuffer) return;
const audioCtx = new (window.AudioContext || window.webkitAudioContext)();
const src = audioCtx.createBufferSource();
src.buffer = this.audioBuffer;
src.start(0, this.getStartTime(), this.getEndTime() - this.getStartTime());
src.connect(audioCtx.destination);
src.onended = () => audioCtx.close();
},
async getTrimmedWavBlob() {
const buf = this.audioBuffer;
const startSample = Math.floor(this.getStartTime() * buf.sampleRate);
const endSample = Math.min(Math.floor(this.getEndTime() * buf.sampleRate), buf.length);
const numSamples = endSample - startSample;
const targetRate = 16000;
let pcmFloat32;
if (buf.sampleRate === targetRate) {
pcmFloat32 = buf.getChannelData(0).slice(startSample, endSample);
} else {
const offlineCtx = new OfflineAudioContext(1, numSamples * (targetRate / buf.sampleRate) | 0, targetRate);
const src = offlineCtx.createBufferSource();
src.buffer = buf;
src.start(0, this.getStartTime(), this.getEndTime() - this.getStartTime());
src.connect(offlineCtx.destination);
const rendered = await offlineCtx.startRendering();
pcmFloat32 = rendered.getChannelData(0);
}
const int16 = new Int16Array(pcmFloat32.length);
for (let i = 0; i < pcmFloat32.length; i++) {
int16[i] = Math.max(-32768, Math.min(32767, Math.round(pcmFloat32[i] * 32767)));
}
const dataSize = int16.length * 2;
const wavSize = 36 + dataSize;
const wavBuf = new ArrayBuffer(44 + dataSize);
const view = new DataView(wavBuf);
view.setUint32(0, 0x52494646, false);
view.setUint32(4, wavSize, true);
view.setUint32(8, 0x57415645, false);
view.setUint32(12, 0x666d7420, false);
view.setUint32(16, 16, true);
view.setUint16(20, 1, true);
view.setUint16(22, 1, true);
view.setUint32(24, targetRate, true);
view.setUint32(28, targetRate * 2, true);
view.setUint16(32, 2, true);
view.setUint16(34, 16, true);
view.setUint32(36, 0x64617461, false);
view.setUint32(40, dataSize, true);
for (let i = 0; i < int16.length; i++) {
view.setInt16(44 + i * 2, int16[i], true);
}
return new Blob([wavBuf], { type: 'audio/wav' });
},
onPointerDown(handleType, e) {
this.isDragging = handleType;
e.preventDefault();
},
onPointerMove(e) {
if (!this.isDragging) return;
e.preventDefault();
const canvas = $('trimCanvas');
const rect = canvas.getBoundingClientRect();
const clientX = e.touches ? e.touches[0].clientX : e.clientX;
let ratio = (clientX - rect.left) / rect.width;
ratio = Math.max(0, Math.min(1, ratio));
if (this.isDragging === 'start') {
this.startRatio = Math.max(0, Math.min(ratio, this.endRatio - 0.002));
} else {
this.endRatio = Math.min(1, Math.max(ratio, this.startRatio + 0.002));
}
this.draw();
updateTrimTimeDisplay();
},
onPointerUp() { this.isDragging = null; },
destroy() {
this.audioBuffer = null;
this.vadSegments = [];
this.isDragging = null;
}
};
// --- Trim Modal Functions ---
let trimBucket = null;
let trimFileName = null;
// Open the trim dialog for one stored sample: show the overlay right away,
// load the audio and VAD data, then enable the controls and draw the
// waveform. On a load failure the user is alerted and the dialog is closed.
// A load that finishes after the dialog was closed or pointed at another
// sample is ignored, so it cannot touch (or close) the newer dialog state.
async function openTrimModal(bucket, fileName) {
  trimBucket = bucket;
  trimFileName = fileName;
  const overlay = $('trimOverlay');
  overlay.classList.add('open');
  overlay.setAttribute('aria-hidden', 'false');
  $('trimHint').textContent = 'Loading audio...';
  $('trimSaveBtn').disabled = true;
  $('trimPlayBtn').disabled = true;
  // True while this call still owns the dialog.
  const isCurrent = () =>
    trimBucket === bucket && trimFileName === fileName && overlay.classList.contains('open');
  try {
    const info = await TrimWaveform.init(bucket, fileName);
    if (!isCurrent()) {
      // The dialog was closed while loading: drop the audio that was just decoded.
      if (trimBucket === null) TrimWaveform.destroy();
      return;
    }
    $('trimHint').textContent = `Drag the handles to select a region, then save as a new sample. Duration: ${info.duration.toFixed(2)}s`;
    $('trimSaveBtn').disabled = false;
    $('trimPlayBtn').disabled = false;
    if (info.vadCount > 0) {
      $('trimVadInfo').style.display = 'flex';
      $('trimVadSegments').textContent = `${info.vadCount} speech segment${info.vadCount > 1 ? 's' : ''} detected (first auto-selected)`;
    } else {
      $('trimVadInfo').style.display = 'none';
    }
    // Wait a frame so the canvas has its final layout size before drawing.
    requestAnimationFrame(() => {
      TrimWaveform.draw();
      updateTrimTimeDisplay();
    });
  } catch (e) {
    if (!isCurrent()) return;
    $('trimHint').textContent = 'Failed to load audio: ' + e.message;
    alert('Failed to load audio for trimming: ' + e.message);
    closeTrimModal();
  }
}
// Hide the trim dialog, restore its buttons to their idle state, and drop
// the loaded audio together with the bucket/file the dialog referred to.
function closeTrimModal() {
  const overlay = $('trimOverlay');
  overlay.classList.remove('open');
  overlay.setAttribute('aria-hidden', 'true');
  const saveBtn = $('trimSaveBtn');
  saveBtn.disabled = false;
  saveBtn.textContent = 'Save Trim';
  $('trimPlayBtn').disabled = false;
  TrimWaveform.destroy();
  trimBucket = null;
  trimFileName = null;
}
// Refresh the start / end / duration readout under the waveform from the
// current selection, formatted in seconds with two decimals.
function updateTrimTimeDisplay() {
  const asSeconds = (value) => value.toFixed(2) + 's';
  const selectionStart = TrimWaveform.getStartTime();
  const selectionEnd = TrimWaveform.getEndTime();
  $('trimStartTime').textContent = asSeconds(selectionStart);
  $('trimEndTime').textContent = asSeconds(selectionEnd);
  $('trimDuration').textContent = `Duration: ${asSeconds(selectionEnd - selectionStart)}`;
}
function setPill(el, text, cls) {
el.className = "pill " + (cls || "");
el.textContent = text;
@@ -1588,6 +2028,43 @@
$("sampleNegativeCount").textContent = String(negativeCount);
}
// Build the HTML for one sample card in the library list: title, subtitle,
// positive/negative pill, optional "Trimmed from" badge, audio player, and
// the Revert (trimmed samples only) / Trim / Remove buttons.
function buildSampleCardHtml(item, bucket) {
  const when = formatTimestamp(item.reviewed_at || item.received_at || item.created_at);
  const formatSummary = item.final_format ? describeFormat(item.final_format) : "16 kHz, mono, 16-bit";
  const isNegative = bucket === "negative";
  const badge = isNegative ? { label: "Negative", cls: "err" } : { label: "Positive", cls: "ok" };
  const trimBadgeHtml = item.trimmed
    ? `<span class="pill trimBadge">Trimmed from ${escapeHtml(item.source_file || '')}</span>`
    : '';
  // Subtitle fragments in display order; empty ones are dropped.
  const subtitle = [
    item.original_name && item.original_name !== item.saved_as ? `From ${item.original_name}` : "",
    when ? `Saved ${when}` : "",
    item.message || "",
  ].filter(Boolean).join(" · ");
  const revertBtn = item.trimmed
    ? `<button type="button" data-sample-revert="${escapeAttr(item.saved_as)}" data-bucket="${escapeAttr(bucket)}">Revert</button>`
    : '';
  return `
<div class="captureCard">
<div class="row space">
<div>
<p class="captureTitle">${escapeHtml(item.saved_as)}</p>
<p class="captureSubtitle">${escapeHtml(subtitle || "Saved training sample.")}</p>
</div>
<span class="pill ${badge.cls}">${badge.label}</span>
${trimBadgeHtml}
</div>
<audio class="audioPlayer" controls preload="none" src="${escapeAttr(item.audio_url || `/api/audio/${bucket}/${encodeURIComponent(item.saved_as)}`)}?t=${encodeURIComponent(item.created_at || '')}"></audio>
<div class="muted">Stored in ${isNegative ? "negative_samples" : "personal_samples"} · ${escapeHtml(formatSummary)}</div>
<div class="captureActions">
${revertBtn}
<button type="button" data-sample-trim="${escapeAttr(item.saved_as)}" data-bucket="${escapeAttr(bucket)}" class="trimBtn">Trim</button>
<button type="button" data-sample-remove="${escapeAttr(item.saved_as)}" data-bucket="${escapeAttr(bucket)}" ${uiState.reviewBusy ? "disabled" : ""}>Remove sample</button>
</div>
</div>
`;
}
function renderSampleLibrary(payload) {
const data = payload || { personal: [], negative: [], personal_count: 0, negative_count: 0 };
uiState.samples = {
@@ -1606,34 +2083,36 @@
const label = activeBucket === "negative" ? "negative" : "personal";
if (!items.length) {
$("sampleLibraryList").innerHTML = `<div class="emptyState">No ${label} samples saved yet.</div>`;
$("samplePagination").innerHTML = "";
return;
}
$("sampleLibraryList").innerHTML = items.map((item) => {
const when = formatTimestamp(item.reviewed_at || item.received_at || item.created_at);
const formatSummary = item.final_format ? describeFormat(item.final_format) : "16 kHz, mono, 16-bit";
const badge = activeBucket === "negative" ? { label: "Negative", cls: "err" } : { label: "Positive", cls: "ok" };
const subtitleParts = [];
if (item.original_name && item.original_name !== item.saved_as) subtitleParts.push(`From ${item.original_name}`);
if (when) subtitleParts.push(`Saved ${when}`);
if (item.message) subtitleParts.push(item.message);
return `
<div class="captureCard">
<div class="row space">
<div>
<p class="captureTitle">${escapeHtml(item.saved_as)}</p>
<p class="captureSubtitle">${escapeHtml(subtitleParts.join(" · ") || "Saved training sample.")}</p>
</div>
<span class="pill ${badge.cls}">${badge.label}</span>
</div>
<audio class="audioPlayer" controls preload="none" src="${escapeAttr(item.audio_url || `/api/audio/${activeBucket}/${encodeURIComponent(item.saved_as)}`)}"></audio>
<div class="muted">Stored in ${activeBucket === "negative" ? "negative_samples" : "personal_samples"} · ${escapeHtml(formatSummary)}</div>
<div class="captureActions">
<button type="button" data-sample-remove="${escapeAttr(item.saved_as)}" data-bucket="${escapeAttr(activeBucket)}" ${uiState.reviewBusy ? "disabled" : ""}>Remove sample</button>
</div>
// Paginate
const pages = uiState.samples.pages || { personal: 0, negative: 0 };
let page = pages[activeBucket] || 0;
const totalPages = Math.ceil(items.length / SAMPLE_PAGE_SIZE);
if (page >= totalPages) page = Math.max(totalPages - 1, 0);
const start = page * SAMPLE_PAGE_SIZE;
const pageItems = items.slice(start, start + SAMPLE_PAGE_SIZE);
$("sampleLibraryList").innerHTML = pageItems.map((item) => buildSampleCardHtml(item, activeBucket)).join("");
// Pagination controls
const pagination = $("samplePagination");
if (totalPages > 1) {
const prevDisabled = page === 0 ? "disabled" : "";
const nextDisabled = page >= totalPages - 1 ? "disabled" : "";
pagination.innerHTML = `
<div class="paginationControls">
<button type="button" ${prevDisabled} class="pageBtn" data-page="prev"> Prev</button>
<span class="pageInfo">${page + 1} / ${totalPages} (${items.length} total)</span>
<span class="pageJump">Go to page <input type="number" min="1" max="${totalPages}" value="${page + 1}" class="pageInput" data-total="${totalPages}"> <button type="button" class="pageJumpBtn">Go</button></span>
<button type="button" ${nextDisabled} class="pageBtn" data-page="next">Next </button>
</div>
`;
}).join("");
} else {
pagination.innerHTML = "";
}
}
function rerenderReviewLists() {
@@ -1664,8 +2143,8 @@
uiState.reviewBusy = true;
setPill($("status"), "Removing sample...", "warn");
syncButtons();
const data = await api(`/api/samples/${encodeURIComponent(bucket)}/${encodeURIComponent(fileName)}`, { method: "DELETE" });
renderSampleLibrary(data);
await api(`/api/samples/${encodeURIComponent(bucket)}/${encodeURIComponent(fileName)}`, { method: "DELETE" });
await refreshSamples();
await refreshSession();
setPill($("status"), "Sample removed", "ok");
} catch (error) {
@@ -1673,7 +2152,6 @@
alert(error.message);
} finally {
uiState.reviewBusy = false;
rerenderReviewLists();
}
}
@@ -2911,7 +3389,65 @@
});
$("sampleTabPersonal").addEventListener("click", () => setSampleBucket("personal"));
$("sampleTabNegative").addEventListener("click", () => setSampleBucket("negative"));
// Switch the active bucket's list to `page` (zero-based), re-render it, and
// scroll back to where the library header was before the re-render.
function navigateToSamplePage(page) {
  const samples = uiState.samples;
  // Measured before rendering: the list height changes with the page.
  const scrollTarget = $("sampleLibraryHeader").getBoundingClientRect().top + window.scrollY;
  const bucket = samples.activeBucket === "negative" ? "negative" : "personal";
  if (!samples.pages) samples.pages = { personal: 0, negative: 0 };
  samples.pages[bucket] = page;
  renderSampleLibrary(samples);
  syncButtons();
  window.scrollTo(0, scrollTarget);
}
// Delegated click handler for the pagination bar: Prev / Next buttons and
// the "Go to page" jump. Every target page is clamped to the page count the
// renderer wrote into the jump input's data-total attribute.
$("samplePagination").addEventListener("click", (event) => {
  // Zero-based index of the last page, read from the rendered controls.
  const lastPageIndex = (input) => {
    const total = parseInt(input ? input.dataset.total : "", 10);
    return Math.max((Number.isNaN(total) ? 1 : total) - 1, 0);
  };
  const btn = event.target.closest(".pageBtn[data-page]");
  if (btn) {
    btn.blur();
    const lastPage = lastPageIndex(event.currentTarget.querySelector(".pageInput"));
    const activeBucket = uiState.samples.activeBucket === "negative" ? "negative" : "personal";
    const pages = uiState.samples.pages || { personal: 0, negative: 0 };
    // The stored index can be stale (e.g. the list shrank); clamp it first so
    // Prev/Next always move relative to the page that is actually shown.
    let page = Math.min(pages[activeBucket] || 0, lastPage);
    if (btn.dataset.page === "prev") page = Math.max(page - 1, 0);
    else if (btn.dataset.page === "next") page = Math.min(page + 1, lastPage);
    navigateToSamplePage(page);
    return;
  }
  const jumpBtn = event.target.closest(".pageJumpBtn");
  if (jumpBtn) {
    const input = jumpBtn.parentElement.querySelector(".pageInput");
    const lastPage = lastPageIndex(input);
    // The input is one-based; non-numeric or out-of-range values are clamped.
    let page = parseInt(input.value, 10) - 1;
    if (Number.isNaN(page) || page < 0) page = 0;
    if (page > lastPage) page = lastPage;
    input.blur();
    navigateToSamplePage(page);
  }
});
$("sampleLibraryList").addEventListener("click", async (event) => {
// Revert trimmed sample
const revertBtn = event.target.closest("button[data-sample-revert][data-bucket]");
if (revertBtn) {
const ok = confirm(`Revert ${revertBtn.dataset.sampleRevert} to the original (pre-trim) version?`);
if (!ok) return;
try {
const form = new FormData();
form.append('bucket', revertBtn.dataset.bucket);
form.append('file_name', revertBtn.dataset.sampleRevert);
const result = await api('/api/samples/revert', { method: 'POST', body: form });
await refreshSamples();
setPill($("status"), result.message || 'Reverted', 'ok');
} catch (err) {
alert('Revert failed: ' + err.message);
}
return;
}
// Open trim modal
const trimBtn = event.target.closest("button[data-sample-trim][data-bucket]");
if (trimBtn) {
openTrimModal(trimBtn.dataset.bucket, trimBtn.dataset.sampleTrim);
return;
}
// Remove sample
const button = event.target.closest("button[data-sample-remove][data-bucket]");
if (!button) return;
await removeSample(button.dataset.bucket, button.dataset.sampleRemove);
@@ -3027,6 +3563,66 @@
} catch (_) {}
}
// --- Trim Modal Event Listeners ---
$("closeTrimBtn").addEventListener('click', closeTrimModal);
$("trimCancelBtn").addEventListener('click', closeTrimModal);
$("trimOverlay").addEventListener('click', (e) => {
if (e.target === $("trimOverlay")) closeTrimModal();
});
$("trimPlayBtn").addEventListener('click', () => TrimWaveform.playSelection());
// Snap the selection to the first detected speech segment.
$("trimSelectFirstVadBtn").addEventListener('click', () => {
  const seg = TrimWaveform.vadSegments[0];
  if (!seg) return;
  // Each setter clamps against the opposite edge's CURRENT position, so move
  // the edge that widens the selection first. Setting start first when the
  // segment lies beyond the current end would pin start to the old end.
  if (seg.end > TrimWaveform.getEndTime()) {
    TrimWaveform.setEndSeconds(seg.end);
    TrimWaveform.setStartSeconds(seg.start);
  } else {
    TrimWaveform.setStartSeconds(seg.start);
    TrimWaveform.setEndSeconds(seg.end);
  }
});
$("trimSaveBtn").addEventListener('click', async () => {
const start = TrimWaveform.getStartTime();
const end = TrimWaveform.getEndTime();
$("trimSaveBtn").disabled = true;
$("trimSaveBtn").textContent = 'Saving...';
try {
const blob = await TrimWaveform.getTrimmedWavBlob();
const form = new FormData();
form.append('file', blob, 'trimmed.wav');
form.append('bucket', trimBucket);
form.append('source_file', trimFileName);
form.append('start_time', start.toFixed(3));
form.append('end_time', end.toFixed(3));
const result = await api('/api/samples/trim', {
method: 'POST',
body: form,
});
closeTrimModal();
await refreshSamples();
setPill($("status"), result.message || 'Trim saved', 'ok');
} catch (e) {
alert('Trim failed: ' + e.message);
$("trimSaveBtn").disabled = false;
$("trimSaveBtn").textContent = 'Save Trim';
}
});
// Handle drag (mouse + touch)
$("trimStartHandle").addEventListener('mousedown', (e) => TrimWaveform.onPointerDown('start', e));
$("trimEndHandle").addEventListener('mousedown', (e) => TrimWaveform.onPointerDown('end', e));
$("trimStartHandle").addEventListener('touchstart', (e) => TrimWaveform.onPointerDown('start', e), { passive: false });
$("trimEndHandle").addEventListener('touchstart', (e) => TrimWaveform.onPointerDown('end', e), { passive: false });
document.addEventListener('mousemove', (e) => TrimWaveform.onPointerMove(e));
document.addEventListener('touchmove', (e) => TrimWaveform.onPointerMove(e), { passive: false });
document.addEventListener('mouseup', () => TrimWaveform.onPointerUp());
document.addEventListener('touchend', () => TrimWaveform.onPointerUp());
// Redraw waveform on window resize
window.addEventListener('resize', () => {
if ($('trimOverlay').classList.contains('open')) TrimWaveform.draw();
});
bootstrap();
</script>
</body>

View File

@@ -37,10 +37,11 @@ STATIC_DIR = Path(os.environ.get("STATIC_DIR", str(ROOT_DIR / "static"))).resolv
PERSONAL_DIR = Path(os.environ.get("PERSONAL_DIR", str(DATA_DIR / "personal_samples"))).resolve()
CAPTURED_DIR = Path(os.environ.get("CAPTURED_DIR", str(DATA_DIR / "captured_audio"))).resolve()
NEGATIVE_DIR = Path(os.environ.get("NEGATIVE_DIR", str(DATA_DIR / "negative_samples"))).resolve()
TRIM_HISTORY_DIR = Path(os.environ.get("TRIM_HISTORY_DIR", str(DATA_DIR / "trim_history"))).resolve()
TRIM_HISTORY_DIR.mkdir(parents=True, exist_ok=True)
TRAINED_WAKE_WORDS_DIR = Path(
os.environ.get("TRAINED_WAKE_WORDS_DIR", str(DATA_DIR / "trained_wake_words"))
).resolve()
CLI_DIR = Path(os.environ.get("CLI_DIR", str(ROOT_DIR / "cli"))).resolve()
PIPER_ROOT = DATA_DIR / "tools" / "piper-sample-generator"
PIPER_VOICES_DIR = PIPER_ROOT / "voices"
@@ -169,11 +170,58 @@ FIRMWARE_LOCK = threading.Lock()
FIRMWARE_SESSIONS: Dict[str, Dict[str, Any]] = {}
ANSI_ESCAPE_RE = re.compile(r"\x1B(?:\[[0-?]*[ -/]*[@-~]|[@-Z\\-_])")
# --- Silero VAD (lazy-loaded) ---
_silero_vad_model = None
_silero_vad_utils = None
_SILERO_VAD_LOCK = threading.Lock()
VAD_SELECTION_PAD_START_S = 0.08
VAD_SELECTION_PAD_END_S = 0.08
def _load_silero_vad():
"""Lazy-load Silero VAD model on first use. Returns (model, utils)."""
global _silero_vad_model, _silero_vad_utils
if _silero_vad_model is not None:
return _silero_vad_model, _silero_vad_utils
with _SILERO_VAD_LOCK:
if _silero_vad_model is not None:
return _silero_vad_model, _silero_vad_utils
import torch
import silero_vad
model = silero_vad.load_silero_vad()
model.eval()
_silero_vad_model = model
_silero_vad_utils = {"torch": torch}
return model, _silero_vad_utils
def _detect_speech_segments(wav_bytes: bytes) -> List[Dict[str, float]]:
    """Find speech regions in a WAV clip with Silero VAD.

    The frames are read as 16-bit integers and scaled to floats by 1/32768;
    the sample rate passed to the detector is fixed at 16000 and the WAV
    header's own rate and channel count are not consulted, so the input is
    expected to be 16 kHz mono 16-bit PCM.

    Returns:
        One ``{"start": ..., "end": ...}`` dict per detected segment, in
        seconds rounded to three decimals.
    """
    vad_model, vad_utils = _load_silero_vad()
    torch = vad_utils["torch"]
    import numpy as np
    from silero_vad.utils_vad import get_speech_timestamps

    with wave.open(io.BytesIO(wav_bytes), "rb") as reader:
        pcm_bytes = reader.readframes(reader.getnframes())
    waveform = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0

    detector_options = {
        "sampling_rate": 16000,
        "threshold": 0.5,
        "min_speech_duration_ms": 150,
        "min_silence_duration_ms": 100,
        "return_seconds": True,
    }
    detected = get_speech_timestamps(torch.from_numpy(waveform), vad_model, **detector_options)

    segments = []
    for span in detected:
        segments.append({"start": round(span["start"], 3), "end": round(span["end"], 3)})
    return segments
class _FirmwareYamlLoader(yaml.SafeLoader):
    """Private ``yaml.SafeLoader`` subclass used for firmware YAML.

    Adds no behaviour of its own here. NOTE(review): presumably exists so
    custom constructors can be registered without mutating the shared
    ``yaml.SafeLoader`` class - confirm at the registration site.
    """
class _FirmwareYamlDumper(yaml.SafeDumper):
    """Private ``yaml.SafeDumper`` subclass used for firmware YAML.

    Adds no behaviour of its own here. NOTE(review): presumably exists so
    custom representers can be registered without mutating the shared
    ``yaml.SafeDumper`` class - confirm at the registration site.
    """
@@ -1009,7 +1057,7 @@ def _list_captured_items() -> List[Dict[str, Any]]:
def _sample_item_from_path(audio_path: Path, bucket: str) -> Dict[str, Any]:
meta = _load_sidecar_json(audio_path)
stat = audio_path.stat()
final_format = meta.get("final_format") or _inspect_wav_bytes(audio_path.read_bytes()) or {}
final_format = meta.get("final_format") or meta.get("detected_format") or _inspect_wav_bytes(audio_path.read_bytes()) or {}
return {
"bucket": bucket,
"saved_as": audio_path.name,
@@ -1021,6 +1069,8 @@ def _sample_item_from_path(audio_path: Path, bucket: str) -> Dict[str, Any]:
"reviewed_at": meta.get("reviewed_at") or "",
"created_at": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(),
"converted": bool(meta.get("converted")),
"trimmed": bool(meta.get("trimmed")),
"source_file": meta.get("source_file") or "",
"final_format": final_format,
"message": meta.get("message") or "",
"size_bytes": stat.st_size,
@@ -1036,9 +1086,10 @@ def _list_sample_items(directory: Path, bucket: str) -> List[Dict[str, Any]]:
items.append(_sample_item_from_path(audio_path, bucket))
except Exception:
continue
# Untrimmed first (stable sort preserves mtime order within each group).
items.sort(key=lambda x: x.get("trimmed", False))
return items
def _samples_payload() -> Dict[str, Any]:
takes = _sync_personal_samples_state()
personal_items = _list_sample_items(PERSONAL_DIR, "personal")
@@ -2768,7 +2819,143 @@ def delete_sample(bucket: str, file_name: str):
_remove_audio_with_sidecar(path)
except FileNotFoundError as e:
return JSONResponse({"ok": False, "error": str(e)}, status_code=404)
return _samples_payload()
return {"ok": True, "deleted_bucket": bucket, "deleted_file": file_name, "message": f"Deleted {file_name}"}
@app.post("/api/samples/{bucket}/{file_name}/vad")
def vad_segments(bucket: str, file_name: str):
    """Suggest a trim selection for a stored sample using voice activity detection.

    Looks the sample up in the "personal" or "negative" bucket, runs VAD over
    it and reports at most one segment: the first detected span lasting at
    least 250 ms, widened by the fixed start/end padding and clamped to the
    file bounds.

    Returns a JSON-able dict with ``ok``, ``file_name``, ``segments`` and
    ``segment_count``; a 404 JSONResponse for an unknown bucket or missing
    file; a 500 JSONResponse when detection raises.
    """
    directory = {"personal": PERSONAL_DIR, "negative": NEGATIVE_DIR}.get(bucket)
    if directory is None:
        return JSONResponse({"ok": False, "error": "Unknown sample bucket."}, status_code=404)

    try:
        path = _resolve_audio_path(directory, file_name)
    except FileNotFoundError as e:
        return JSONResponse({"ok": False, "error": str(e)}, status_code=404)

    wav_bytes = path.read_bytes()
    try:
        detected = _detect_speech_segments(wav_bytes)
    except Exception as e:
        return JSONResponse({"ok": False, "error": f"VAD failed: {str(e)}"}, status_code=500)

    # Spans shorter than 250 ms are ignored; only the earliest remaining one
    # is offered to the client.
    first_long = next((s for s in detected if (s["end"] - s["start"]) >= 0.25), None)
    if first_long is None:
        return {"ok": True, "file_name": file_name, "segments": [], "segment_count": 0}

    info = _inspect_wav_bytes(wav_bytes) or {}
    duration_s = float(info.get("duration_s") or 0.0)

    # Deterministic padding keeps quiet word edges inside the selection.
    padded_start = max(0.0, round(first_long["start"] - VAD_SELECTION_PAD_START_S, 3))
    padded_end = round(first_long["end"] + VAD_SELECTION_PAD_END_S, 3)
    if duration_s > 0:
        # Only clamp when the duration is actually known.
        padded_end = min(duration_s, padded_end)
    if padded_end <= padded_start:
        # Never hand back an empty or inverted selection.
        padded_end = padded_start + 0.001

    return {
        "ok": True,
        "file_name": file_name,
        "segments": [{"start": padded_start, "end": padded_end}],
        "segment_count": 1,
    }
@app.post("/api/samples/trim")
async def trim_sample_upload(
    file: UploadFile = File(...),
    bucket: str = Form(...),
    source_file: str = Form(...),
    start_time: str | None = Form(None),
    end_time: str | None = Form(None),
):
    """Overwrite a stored sample with an uploaded, trimmed version of it.

    The untouched original (and its sidecar, when present) is first copied
    into TRIM_HISTORY_DIR under a timestamped name; that name is recorded in
    the sample's sidecar as ``undo_backup_file`` so the trim can be reverted.

    Args:
        file: Trimmed audio. Normalised to the target WAV format when it is
            not already in it.
        bucket: "personal" or "negative".
        source_file: Name of the existing sample being replaced.
        start_time: Optional trim start in seconds (recorded as metadata only).
        end_time: Optional trim end in seconds (recorded as metadata only).

    Returns:
        ``{"ok": True, "updated_sample": ..., "message": ...}`` on success.
        A 404 JSONResponse for an unknown bucket or missing source file; a 400
        JSONResponse for empty audio, failed normalisation or a trim bound
        that is not a finite number.
    """
    import math  # local import: only needed to reject NaN/inf trim bounds

    directory = {"personal": PERSONAL_DIR, "negative": NEGATIVE_DIR}.get(bucket)
    if directory is None:
        return JSONResponse({"ok": False, "error": "Unknown sample bucket."}, status_code=404)

    # Validate the optional bounds BEFORE anything is written. Parsing them
    # after the overwrite meant a malformed value raised with the sample
    # already replaced and no undo_backup_file recorded for it.
    trim_bounds = {}
    for key, label, raw_value in (
        ("trim_start_s", "start_time", start_time),
        ("trim_end_s", "end_time", end_time),
    ):
        if not raw_value:
            trim_bounds[key] = None
            continue
        try:
            seconds = float(raw_value)
        except ValueError:
            seconds = math.nan
        if not math.isfinite(seconds):
            return JSONResponse(
                {"ok": False, "error": f"Invalid {label}: {raw_value!r}"},
                status_code=400,
            )
        trim_bounds[key] = seconds

    data = await file.read()
    if not data:
        return JSONResponse({"ok": False, "error": "Empty audio file."}, status_code=400)

    # Unreadable uploads and readable-but-off-target WAVs take the same path.
    info = _inspect_wav_bytes(data)
    if not info or not _is_target_wav(info):
        try:
            data = _normalize_audio_to_target_wav(data, file.filename or "trimmed.wav")
        except Exception as e:
            return JSONResponse({"ok": False, "error": f"Audio normalization failed: {e}"}, status_code=400)

    try:
        orig_path = _resolve_audio_path(directory, source_file)
    except FileNotFoundError as e:
        return JSONResponse({"ok": False, "error": str(e)}, status_code=404)

    TRIM_HISTORY_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S%f")
    # Built from the resolved file's own name so the backup always lands
    # directly inside TRIM_HISTORY_DIR, whatever the client sent.
    backup_name = f"{ts}_{orig_path.name}"
    backup_path = TRIM_HISTORY_DIR / backup_name
    shutil.copy2(orig_path, backup_path)
    orig_sidecar = _audio_sidecar_path(orig_path)
    if orig_sidecar.exists():
        shutil.copy2(orig_sidecar, _audio_sidecar_path(backup_path))

    orig_path.write_bytes(data)

    # NOTE(review): format metadata carried over from the previous sidecar
    # (e.g. "final_format") describes the pre-trim audio - confirm whether it
    # should be refreshed here.
    sidecar = {
        **_load_sidecar_json(orig_path),
        "trimmed": True,
        "source_file": source_file,
        "source_bucket": bucket,
        **trim_bounds,
        "undo_backup_file": backup_name,
    }
    _write_sidecar_json(orig_path, sidecar)

    updated_item = _sample_item_from_path(orig_path, bucket)
    updated_item["trimmed"] = True
    updated_item["source_file"] = source_file
    return {"ok": True, "updated_sample": updated_item, "message": f"Trimmed {source_file}"}
@app.post("/api/samples/revert")
def revert_trim(
    bucket: str = Form(...),
    file_name: str = Form(...),
):
    """Undo the most recent trim of a sample by restoring its backup.

    The backup recorded in the sample's sidecar (``undo_backup_file``) is
    copied back over the sample and then removed from TRIM_HISTORY_DIR. When
    the backup has a sidecar it replaces the live one; otherwise the
    trim-specific keys are stripped from the live sidecar so the sample no
    longer reports itself as trimmed.

    Returns:
        ``{"ok": True, "updated_sample": ..., "message": ...}`` on success.
        A 404 JSONResponse for an unknown bucket, missing sample or missing
        backup file; a 400 JSONResponse when no backup is recorded.
    """
    directory = {"personal": PERSONAL_DIR, "negative": NEGATIVE_DIR}.get(bucket)
    if directory is None:
        return JSONResponse({"ok": False, "error": "Unknown sample bucket."}, status_code=404)

    try:
        file_path = _resolve_audio_path(directory, file_name)
    except FileNotFoundError as e:
        return JSONResponse({"ok": False, "error": str(e)}, status_code=404)

    sidecar = _load_sidecar_json(file_path)
    backup_name = sidecar.get("undo_backup_file")
    if not backup_name:
        return JSONResponse({"ok": False, "error": "No trim backup found for this sample."}, status_code=400)

    # Honour only the final path component: the value comes from an on-disk
    # JSON file and must not be able to point outside TRIM_HISTORY_DIR.
    backup_path = TRIM_HISTORY_DIR / Path(str(backup_name)).name
    if not backup_path.is_file():
        return JSONResponse({"ok": False, "error": "Trim backup file missing."}, status_code=404)

    shutil.copy2(backup_path, file_path)

    backup_sidecar = _audio_sidecar_path(backup_path)
    if backup_sidecar.exists():
        shutil.copy2(backup_sidecar, _audio_sidecar_path(file_path))
        backup_sidecar.unlink()
    else:
        # No pre-trim sidecar was saved, so the live one still carries the
        # trim markers and a pointer to the backup that is about to be
        # deleted. Drop just those keys and keep everything else.
        trim_keys = ("trimmed", "trim_start_s", "trim_end_s", "undo_backup_file")
        restored = {k: v for k, v in sidecar.items() if k not in trim_keys}
        _write_sidecar_json(file_path, restored)
    backup_path.unlink()

    updated_item = _sample_item_from_path(file_path, bucket)
    return {"ok": True, "updated_sample": updated_item, "message": f"Reverted {file_name}"}
@app.post("/api/captured_audio/{file_name}/approve_personal")