#!/bin/bash
#
# Prepares the CHiME-Home dataset under ${DATA_DIR}/training_datasets:
# downloads the archive from archive.org if needed, extracts it, and converts
# the 48 kHz clips to 16 kHz mono 16-bit WAV files.
#
# HELP, DATA_DIR, CLEANUP_ARCHIVES and CLEANUP_INTERMEDIATE_FILES, as well as
# the get_filecounts / write_filecounts helpers, are not defined in this file;
# presumably they are provided by the sourced shell.functions -- confirm there.

# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Absolute path of this script and the directory that contains it.
PROGPATH=$(realpath "$0")
PROGDIR=$(dirname "${PROGPATH}")

# Shared helpers live next to this script.
source "${PROGDIR}/shell.functions"

# Print the usage text to stderr and stop when help was requested.
# HELP and DATA_DIR are not assigned in this file; presumably they are set by
# the option parsing in shell.functions -- confirm there.
# The synopsis names --cleanup-intermediate-files so that it agrees with the
# option described below it (it previously advertised --cleanup-input-files).
if [ "${HELP}" == "true" ] ; then
    cat <<EOF >&2
Usage: $0 [ --cleanup-archives ] [ --cleanup-intermediate-files ] [ --data-dir=<data_dir> ]

    --cleanup-archives
                          : Automatically clean up any downloaded archives after
                          : extraction.
    --cleanup-intermediate-files
                          : Automatically clean up intermediate extracted files
                          : after conversion to 16k.
    <data_dir>            : Path to the data directory.
                          : Default: ${DATA_DIR}

EOF
    exit 1
fi

# --- Workspace --------------------------------------------------------------
# Everything below uses paths relative to ${DATA_DIR}/training_datasets.
# A failed mkdir is tolerated here; the cd that follows is not.
mkdir -p "${DATA_DIR}/training_datasets/downloads" || true
cd "${DATA_DIR}/training_datasets"

echo "***** Checking CHiME-Home *****"

# --- Dataset description ----------------------------------------------------
# Source archive and where it is stored locally.
AUDIO_TARFILE="chime_home.tar.gz"
AUDIO_URL="https://archive.org/download/chime-home/chime_home.tar.gz"
AUDIO_TAR="./downloads/${AUDIO_TARFILE}"

# File handed to get_filecounts / write_filecounts for the recorded counts.
AUDIO_FILECOUNT="./downloads/chime_filecount"

# Name pattern of the extracted input clips.
AUDIO_IN_GLOB="*.48kHz.wav"

# Extraction directory and 16k output directory; both are created up front.
AUDIO_DIR="./chime"
AUDIO16K_DIR="./chime_16k"
mkdir -p "${AUDIO_DIR}" "${AUDIO16K_DIR}" || true

# Per-archive file counts, starting at 0 for the one archive handled here,
# then passed to get_filecounts together with the count file.
declare -A filecounts=()
filecounts["${AUDIO_TARFILE}"]=0
get_filecounts filecounts "${AUDIO_FILECOUNT}"

#######################################
# Resample every extracted CHiME-Home clip to 16 kHz mono 16-bit WAV.
#
# Runs the embedded Python program inside the project's virtualenv. The
# function body is a subshell, so activating the virtualenv does not alter
# PATH (or anything else) for the rest of the script.
#
# Globals (read):
#   DATA_DIR       - contains .venv with numpy, scipy, librosa and tqdm
#   AUDIO_DIR      - directory searched recursively for input clips
#   AUDIO16K_DIR   - directory receiving the converted files
#   AUDIO_IN_GLOB  - file-name pattern of the input clips
#                    (falls back to *.48kHz.wav when unset or empty)
# Outputs:
#   One WAV per input in AUDIO16K_DIR, named after the input's path relative
#   to AUDIO_DIR with the path components joined by "__". Outputs that already
#   exist are skipped. Per-file failures are collected and written to
#   AUDIO16K_DIR/chime_corrupted_files.log instead of aborting the run.
# Returns:
#   Exit status of the Python interpreter.
#######################################
converter() (
    source "${DATA_DIR}/.venv/bin/activate"

    # Quoted delimiter: the Python source is passed through verbatim, with no
    # shell expansion or backslash processing.
    python - "${AUDIO_DIR}" "${AUDIO16K_DIR}" "${AUDIO_IN_GLOB:-*.48kHz.wav}" <<'EOF'
import sys
from pathlib import Path
import numpy as np
import scipy.io.wavfile
import librosa
from tqdm import tqdm

def write_wav(dst: Path, data: np.ndarray, sr: int):
    """Write float samples as 16-bit PCM, clipping to [-1.0, 1.0] first."""
    dst.parent.mkdir(parents=True, exist_ok=True)
    x = np.clip(data, -1.0, 1.0)
    scipy.io.wavfile.write(dst, sr, (x * 32767).astype(np.int16))

def flatten_name(root: Path, src: Path) -> str:
    """Turn src's path relative to root into one file name ("a/b.wav" -> "a__b.wav")."""
    rel = src.relative_to(root)
    return "__".join(rel.parts)

chime_in = Path(sys.argv[1]).resolve()
chime_out = Path(sys.argv[2]).resolve()
# The input pattern comes from the caller so it cannot drift from the pattern
# the shell side uses when counting extracted files.
in_glob = sys.argv[3] if len(sys.argv) > 3 else "*.48kHz.wav"

wavs = list(chime_in.rglob(in_glob))
print(f"   WAV files: {len(wavs)}")
print("   Converting CHiME -> 16k mono WAV")

bad = []
ok = 0
skipped = 0
for p in tqdm(wavs, desc="   CHiME -> WAV (resample 16k mono)"):
    try:
        out_name = flatten_name(chime_in, p)
        outfile = chime_out / out_name
        if outfile.exists():
            # Already converted by an earlier run.
            skipped += 1
            continue
        y, _ = librosa.load(p, sr=16000, mono=True)
        if y.size == 0:
            raise ValueError("empty audio")
        write_wav(outfile, y, 16000)
        ok += 1
    except Exception as e:
        # Best effort: remember the failure and keep converting the rest.
        bad.append(f"{p}:{e}")

if bad:
    (chime_out / "chime_corrupted_files.log").write_text("\n".join(bad))
print(f"   CHiME complete ({ok} ok, {skipped} skipped, {len(bad)} failed)")
EOF
)

# Decide how much work is needed, cheapest check first:
#   1. the number of converted WAVs equals the recorded count -> nothing to do
#   2. the number of extracted input clips equals it          -> convert only
#   3. otherwise                                  -> (download,) extract, convert
# The recorded count is whatever get_filecounts left in filecounts for this
# archive (presumably the count stored by a previous run -- confirm in
# shell.functions); a count of 0 never validates anything.
expected_filecount=${filecounts[${AUDIO_TARFILE}]}
# "|| :" keeps a failing find (e.g. missing directory) from aborting the
# script under pipefail; wc still yields a count.
actual_filecount=$(find "${AUDIO16K_DIR}" -name '*.wav' 2>/dev/null | wc -l) || :
write_filecount=false

if [ "${actual_filecount}" -ne 0 ] && [ "${actual_filecount}" -eq "${expected_filecount}" ] ; then
    echo "   Existing ${AUDIO16K_DIR} valid"
else
    actual_filecount=$(find "${AUDIO_DIR}" -name "${AUDIO_IN_GLOB}" 2>/dev/null | wc -l) || :
    if [ "${actual_filecount}" -eq 0 ] || [ "${actual_filecount}" -ne "${expected_filecount}" ] ; then
        if [ ! -f "${AUDIO_TAR}" ] ; then
            echo "   Downloading ${AUDIO_TARFILE}"
            # Download under a temporary name and rename only on success, so an
            # interrupted transfer can never leave a truncated archive that
            # later runs would mistake for a finished download.
            curl -sfL "${AUDIO_URL}" -o "${AUDIO_TAR}.part"
            mv -f "${AUDIO_TAR}.part" "${AUDIO_TAR}"
        fi

        # Start from an empty extraction directory; ":?" refuses to run rm -rf
        # on an empty path.
        rm -rf "${AUDIO_DIR:?}" || :
        mkdir -p "${AUDIO_DIR}" || :
        echo "   Untarring ${AUDIO_TARFILE}"
        tar -xzf "${AUDIO_TAR}" -C "${AUDIO_DIR}"
    fi

    # The archive is no longer needed once the input clips are in place.
    if "${CLEANUP_ARCHIVES}" && [ -f "${AUDIO_TAR}" ] ; then
        echo "   Cleaning up ${AUDIO_TARFILE}"
        rm -rf "${AUDIO_TAR}"
    fi

    converter

    # Record how many 16k files exist now so the next run can validate them.
    actual_filecount=$(find "${AUDIO16K_DIR}" -name "*.wav" 2>/dev/null | wc -l) || :
    filecounts[${AUDIO_TARFILE}]="${actual_filecount}"
    write_filecount=true
fi

# Persist the counts only when this run changed them.
if [[ "${write_filecount}" == "true" ]] ; then
    write_filecounts filecounts "${AUDIO_FILECOUNT}"
fi

# Optional removal of the downloaded archive.
if "${CLEANUP_ARCHIVES}" ; then
    if [[ -f "${AUDIO_TAR}" ]] ; then
        echo "   Cleaning up ${AUDIO_TARFILE}"
        rm -rf "${AUDIO_TAR}"
    fi
fi

# Optional removal of the extracted (pre-conversion) files.
if "${CLEANUP_INTERMEDIATE_FILES}" ; then
    if [[ -d "${AUDIO_DIR}" ]] ; then
        echo "   Cleaning up ${AUDIO_DIR}"
        rm -rf "${AUDIO_DIR}"
    fi
fi

echo "   CHiME complete"
exit 0
