whisper.cpp/benchmark/bench.sh

512 lines
14 KiB
Bash
Executable File

#!/usr/bin/env bash
# Benchmark harness configuration. Everything below is pinned on purpose:
# the benchmark configuration is fixed for reproducibility.
set -euo pipefail

# --- Filesystem layout, resolved from this script's own location ---
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
AUDIO_DIR="${SCRIPT_DIR}/audio"
LOCK_FILE="${AUDIO_DIR}/lock.json"
REFS_DIR="${SCRIPT_DIR}/references"
RESULTS_ROOT="${SCRIPT_DIR}/results"
CLI_BIN="${REPO_ROOT}/build/bin/whisper-cli"
# Model path relative to REPO_ROOT; it is also recorded in the lock file.
MODEL_REL_DEFAULT="models/ggml-small.en.bin"

# --- Run policy ---
WARMUP_RUNS=1
MEASURED_RUNS=5
THREADS=8    # handed to the CLI as -t
PROCESSORS=1 # handed to the CLI as -p

# --- Correctness gate, forwarded to parse_results.py ---
MAX_WER=0.02
MAX_CER=0.02
ENFORCE_CORRECTNESS=1

# --- Values the command line may override ---
VARIANT="metal-baseline"
AUDIO_KEYS=()

# --- Flags appended to every whisper-cli invocation ---
CLI_ARGS=(-l en -tp 0 -tpi 0 -nf -bs 1 -bo 1 -fa)
#######################################
# Prints the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout (callers redirect it when needed)
#######################################
usage() {
  # Every argument is printed verbatim on its own line.
  printf '%s\n' \
    'Usage:' \
    'benchmark/bench.sh --create-lock' \
    'benchmark/bench.sh [--variant <name>] [--audio <short|medium|long>]...' \
    'benchmark/bench.sh [--variant <name>] --all-audio' \
    'Description:' \
    '--create-lock Validates benchmark/audio/{short,medium,long}.wav and writes benchmark/audio/lock.json' \
    'with file hashes + durations. This lock is required for benchmark runs.' \
    '--variant Logical variant label in output tables (default: metal-baseline).' \
    '--audio Add one audio key to run: short, medium, or long.' \
    'If omitted, only short is run (development default).' \
    '--all-audio Run short + medium + long.' \
    'Notes:' \
    '- This harness runs 1 warm-up + 5 measured runs per audio sample.' \
    '- Runs are always sequential and use fixed model + fixed CLI flags.' \
    '- Audio files must be 16 kHz, mono, 16-bit WAV.' \
    '- Correctness is computed via WER/CER against benchmark/references/{short,medium,long}.txt.'
}
MODE="run"

#######################################
# Parses the command line into the global run settings.
# Globals:   MODE (written), VARIANT (written), AUDIO_KEYS (written)
# Arguments: the script's command-line arguments
# Outputs:   diagnostics on stderr; the help text on stdout for --help
# Exits:     0 after --help; 2 on an unknown option, an invalid --audio
#            value, or an option that is missing its value
#######################################
parse_cli_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --create-lock)
        MODE="create-lock"
        shift
        ;;
      --variant)
        # Check the arity before touching $2: under `set -u` a trailing
        # --variant would otherwise die with "unbound variable".
        if [[ $# -lt 2 ]]; then
          echo "Missing value for --variant" >&2
          exit 2
        fi
        VARIANT="$2"
        shift 2
        ;;
      --audio)
        if [[ $# -lt 2 ]]; then
          echo "Missing value for --audio (expected short|medium|long)" >&2
          exit 2
        fi
        case "$2" in
          short|medium|long)
            AUDIO_KEYS+=( "$2" )
            ;;
          *)
            echo "Invalid --audio value: $2 (expected short|medium|long)" >&2
            exit 2
            ;;
        esac
        shift 2
        ;;
      --all-audio)
        # Replaces, rather than extends, any keys collected so far.
        AUDIO_KEYS=( "short" "medium" "long" )
        shift
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_cli_args "$@"
#######################################
# Aborts the script unless the given path is an existing regular file.
# Arguments: $1 - path to check
# Outputs:   an error message on stderr when the file is missing
# Exits:     1 when the file is missing; otherwise returns 0
#######################################
require_file() {
  local candidate="$1"

  # Happy path first: nothing to report when the file is there.
  [[ -f "${candidate}" ]] && return 0

  printf '%s\n' "Required file not found: ${candidate}" >&2
  exit 1
}
#######################################
# Writes the lock file describing the benchmark model and audio samples.
# Globals:   AUDIO_DIR, REPO_ROOT, MODEL_REL_DEFAULT, LOCK_FILE (read)
# Arguments: none
# Outputs:   "Wrote lock file: <path>" on stdout; errors on stderr
# Exits:     non-zero when the model or a sample is missing, or when a
#            sample is not 16 kHz / mono / 16-bit
#######################################
create_lock_file() {
  local sample

  mkdir -p "${AUDIO_DIR}"
  require_file "${REPO_ROOT}/${MODEL_REL_DEFAULT}"
  for sample in short medium long; do
    require_file "${AUDIO_DIR}/${sample}.wav"
  done

  python3 - "${LOCK_FILE}" "${REPO_ROOT}" "${MODEL_REL_DEFAULT}" "${AUDIO_DIR}" <<'PY'
import hashlib
import json
import sys
import wave
from pathlib import Path

READ_SIZE = 1024 * 1024

lock_path = Path(sys.argv[1])
repo_root = Path(sys.argv[2])
model_rel = sys.argv[3]
audio_dir = Path(sys.argv[4])


def sha256(path: Path) -> str:
    """Hex SHA-256 of a file, read in 1 MiB pieces."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        for block in iter(lambda: stream.read(READ_SIZE), b""):
            digest.update(block)
    return digest.hexdigest()


def inspect_wav(path: Path):
    """Check the WAV format and return the lock entry for the file."""
    with wave.open(str(path), "rb") as wav:
        channels = wav.getnchannels()
        sample_rate = wav.getframerate()
        sample_width = wav.getsampwidth()
        frames = wav.getnframes()

    # Evaluated in order; the first failing rule decides the message.
    rules = (
        (channels != 1, f"{path}: expected mono (1 channel), got {channels}"),
        (sample_rate != 16000, f"{path}: expected 16000 Hz, got {sample_rate}"),
        (sample_width != 2, f"{path}: expected 16-bit PCM (2 bytes), got {sample_width}"),
    )
    for violated, message in rules:
        if violated:
            raise SystemExit(message)

    return {
        "sha256": sha256(path),
        "duration_s": frames / float(sample_rate),
        "sample_rate_hz": sample_rate,
        "channels": channels,
        "sample_width_bytes": sample_width,
    }


model_path = repo_root / model_rel
if not model_path.is_file():
    raise SystemExit(f"Model not found: {model_path}")

lock = {
    "schema_version": 1,
    "model_rel": model_rel,
    "model_sha256": sha256(model_path),
    "audio": {
        key: inspect_wav(audio_dir / f"{key}.wav")
        for key in ("short", "medium", "long")
    },
}

# Write to a sibling temp file and rename it over the target, so the lock
# file is replaced in one step.
tmp = lock_path.with_suffix(".json.tmp")
tmp.write_text(json.dumps(lock, indent=2, sort_keys=True) + "\n", encoding="utf-8")
tmp.replace(lock_path)
print(f"Wrote lock file: {lock_path}")
PY
}
#######################################
# Verifies the model and all three audio samples against the lock file and
# writes a JSON description of the validated inputs.
# Globals:   LOCK_FILE, CLI_BIN, REPO_ROOT, AUDIO_DIR, MODEL_REL_DEFAULT (read)
# Arguments: $1 - path of the JSON file to write
# Outputs:   the validation JSON into $1; errors on stderr
# Exits:     non-zero on a missing file, checksum mismatch or wrong format
#######################################
validate_inputs_against_lock() {
  require_file "${LOCK_FILE}"
  require_file "${CLI_BIN}"
  local validation_json="$1"

  python3 - "${LOCK_FILE}" "${REPO_ROOT}" "${AUDIO_DIR}" "${MODEL_REL_DEFAULT}" \
    > "${validation_json}" <<'PY'
import hashlib
import json
import sys
import wave
from pathlib import Path

lock_path, repo_root, audio_dir = (Path(arg) for arg in sys.argv[1:4])
configured_model_rel = sys.argv[4]


def sha256(path: Path) -> str:
    """Hex SHA-256 of a file, read in 1 MiB pieces."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        for block in iter(lambda: stream.read(1024 * 1024), b""):
            digest.update(block)
    return digest.hexdigest()


def inspect_wav(path: Path):
    """Return (channels, sample_rate, sample_width, duration_s) for a WAV file."""
    with wave.open(str(path), "rb") as wav:
        channels = wav.getnchannels()
        sample_rate = wav.getframerate()
        sample_width = wav.getsampwidth()
        frames = wav.getnframes()
    return channels, sample_rate, sample_width, frames / float(sample_rate)


lock = json.loads(lock_path.read_text(encoding="utf-8"))

# The lock must describe the same model this script is configured to use.
locked_model_rel = lock.get("model_rel")
if locked_model_rel != configured_model_rel:
    raise SystemExit(
        f"Lock model_rel mismatch: lock={locked_model_rel} script={configured_model_rel}"
    )

model_rel = lock["model_rel"]
model_path = repo_root / model_rel
if not model_path.is_file():
    raise SystemExit(f"Model file not found: {model_path}")

actual_model_sha = sha256(model_path)
expected_model_sha = lock["model_sha256"]
if actual_model_sha != expected_model_sha:
    raise SystemExit(
        f"Model checksum mismatch for {model_path}. expected={expected_model_sha} actual={actual_model_sha}"
    )

validated_audio = {}
for key in ("short", "medium", "long"):
    wav_path = audio_dir / f"{key}.wav"
    if not wav_path.is_file():
        raise SystemExit(f"Audio file missing: {wav_path}")

    expected_sha = lock["audio"][key]["sha256"]
    actual_sha = sha256(wav_path)
    if actual_sha != expected_sha:
        raise SystemExit(
            f"Checksum mismatch for {wav_path}. expected={expected_sha} actual={actual_sha}"
        )

    channels, sample_rate, sample_width, duration_s = inspect_wav(wav_path)
    if (channels, sample_rate, sample_width) != (1, 16000, 2):
        raise SystemExit(
            f"Format mismatch for {wav_path}: channels={channels}, sample_rate={sample_rate}, sample_width={sample_width}"
        )

    validated_audio[key] = {
        "path": str(wav_path),
        "duration_s": duration_s,
        "sha256": actual_sha,
    }

report = {
    "model_rel": model_rel,
    "model_abs": str(model_path),
    "model_sha256": actual_model_sha,
    "audio": validated_audio,
}
print(json.dumps(report, indent=2, sort_keys=True))
PY
}
#######################################
# Records the full run configuration and a host environment snapshot.
# Globals:   VARIANT, THREADS, PROCESSORS, WARMUP_RUNS, MEASURED_RUNS,
#            REPO_ROOT, CLI_BIN, CLI_ARGS (read)
# Arguments: $1 - path of the config JSON to write
#            $2 - path of the validation JSON to read
# Outputs:   the config JSON into $1
#######################################
write_config_file() {
  local config_path="$1"
  local validation_json="$2"

  python3 - "${config_path}" "${validation_json}" "${VARIANT}" "${THREADS}" "${PROCESSORS}" "${WARMUP_RUNS}" "${MEASURED_RUNS}" "${REPO_ROOT}" "${CLI_BIN}" "${CLI_ARGS[@]}" <<'PY'
import datetime as dt
import json
import platform
import subprocess
import sys
from pathlib import Path

(
    config_arg,
    validation_arg,
    variant,
    threads_arg,
    processors_arg,
    warmup_arg,
    measured_arg,
    repo_root,
    cli_bin,
    *cli_args,
) = sys.argv[1:]

threads = int(threads_arg)
processors = int(processors_arg)
warmup_runs = int(warmup_arg)
measured_runs = int(measured_arg)

validation = json.loads(Path(validation_arg).read_text(encoding="utf-8"))


def run_cmd(args):
    """Return the command's combined output, or "" when it fails or is missing."""
    try:
        finished = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=True,
        )
    except Exception:
        # Best effort: a probe that cannot run is recorded as empty.
        return ""
    return finished.stdout.strip()


# Each entry is one probe command; its output becomes the recorded value.
probes = {
    "git_commit": ["git", "-C", repo_root, "rev-parse", "HEAD"],
    "git_short_commit": ["git", "-C", repo_root, "rev-parse", "--short", "HEAD"],
    "sw_vers": ["sw_vers"],
    "uname": ["uname", "-a"],
    "hw_memsize": ["sysctl", "-n", "hw.memsize"],
    "cpu_brand_string": ["sysctl", "-n", "machdep.cpu.brand_string"],
    "xcodebuild_version": ["xcodebuild", "-version"],
    "clang_version": ["clang", "--version"],
    "cmake_version": ["cmake", "--version"],
}
env = {name: run_cmd(args) for name, args in probes.items()}
env["python_version"] = platform.python_version()

cfg = {
    "created_at_utc": dt.datetime.now(dt.timezone.utc).isoformat(),
    "variant": variant,
    "run_policy": {
        "warmup_runs": warmup_runs,
        "measured_runs": measured_runs,
        "sequential_execution": True,
    },
    "model": {
        "rel_path": validation["model_rel"],
        "abs_path": validation["model_abs"],
        "sha256": validation["model_sha256"],
    },
    "audio": validation["audio"],
    "cli": {
        "binary": cli_bin,
        "args": cli_args,
        "threads": threads,
        "processors": processors,
    },
    "environment": env,
}
Path(config_arg).write_text(
    json.dumps(cfg, indent=2, sort_keys=True) + "\n", encoding="utf-8"
)
PY
}
#######################################
# Runs one benchmark command, capturing its output and timing metadata.
# Arguments: $1 - log file to write (combined stdout + stderr of the command)
#            $2 - metadata JSON file to write
#            $3 - audio key
#            $4 - audio duration in seconds
#            $5 - run kind label stored in the metadata
#            $6 - run index (integer)
#            $7... - the command to execute, with its arguments
# Outputs:   the log and metadata files; errors on stderr
# Returns:   2 when no command is given; otherwise the exit status of the
#            Python driver, which exits with the command's return code
#######################################
run_one() {
  if [[ $# -lt 7 ]]; then
    echo "run_one: expected <log> <meta> <audio_key> <duration> <kind> <index> <command...>" >&2
    return 2
  fi
  local log_path="$1"
  local meta_path="$2"
  local audio_key="$3"
  local audio_duration="$4"
  local run_kind="$5"
  local run_index="$6"
  shift 6

  python3 - "${log_path}" "${meta_path}" "${audio_key}" "${audio_duration}" "${run_kind}" "${run_index}" "$@" <<'PY'
import datetime as dt
import json
import re
import subprocess
import sys
import time
from pathlib import Path

log_path = Path(sys.argv[1])
meta_path = Path(sys.argv[2])
audio_key = sys.argv[3]
audio_duration_s = float(sys.argv[4])
run_kind = sys.argv[5]
run_index = int(sys.argv[6])
cmd = sys.argv[7:]

# A line that begins with "[hh:mm:ss.mmm --> hh:mm:ss.mmm]" is treated as
# the first transcribed segment.
segment_re = re.compile(r"^\[\d{2}:\d{2}:\d{2}\.\d{3}\s+-->\s+\d{2}:\d{2}:\d{2}\.\d{3}\]")

start_utc = dt.datetime.now(dt.timezone.utc).isoformat()
start = time.perf_counter()
first_inference_s = None

# Decode the child's output as UTF-8 regardless of the locale, replacing
# undecodable bytes instead of raising: one stray byte must not abort a run.
# Popen as a context manager also closes the pipe on every exit path.
with log_path.open("w", encoding="utf-8") as logf, subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    encoding="utf-8",
    errors="replace",
    bufsize=1,
) as proc:
    assert proc.stdout is not None
    for line in proc.stdout:
        logf.write(line)
        if first_inference_s is None and segment_re.search(line.strip()):
            first_inference_s = time.perf_counter() - start
    rc = proc.wait()

end = time.perf_counter()
end_utc = dt.datetime.now(dt.timezone.utc).isoformat()

meta = {
    "audio_key": audio_key,
    "audio_duration_s": audio_duration_s,
    "run_kind": run_kind,
    "run_index": run_index,
    "command": cmd,
    "log_path": str(log_path),
    "start_utc": start_utc,
    "end_utc": end_utc,
    "wall_clock_runtime_s": end - start,
    # Stays null when no segment line was ever seen.
    "first_inference_latency_s": first_inference_s,
    "exit_code": rc,
}
meta_path.write_text(json.dumps(meta, indent=2, sort_keys=True) + "\n", encoding="utf-8")
sys.exit(rc)
PY
}
# Lock creation is a standalone mode: write the lock and stop, without
# running any benchmark.
case "${MODE}" in
  create-lock)
    create_lock_file
    exit 0
    ;;
esac

# No sample was selected on the command line: run only the short one.
if (( ${#AUDIO_KEYS[@]} == 0 )); then
  AUDIO_KEYS=( "short" )
fi
# Each invocation gets its own results directory, named after the start
# time (second resolution) and the variant label.
mkdir -p "${RESULTS_ROOT}"
RUN_ID="$(date '+%Y%m%d_%H%M%S')"
RUN_DIR="${RESULTS_ROOT}/${RUN_ID}_${VARIANT}"
RAW_DIR="${RUN_DIR}/raw"
VALIDATION_JSON="${RUN_DIR}/validated_inputs.json"
mkdir -p "${RAW_DIR}"

# Check the inputs against the lock first, then record the configuration.
validate_inputs_against_lock "${VALIDATION_JSON}"
write_config_file "${RUN_DIR}/config.json" "${VALIDATION_JSON}"

# The absolute model path comes from the validation output, so the runs use
# exactly the file whose checksum was just verified.
MODEL_PATH="$(python3 - "${VALIDATION_JSON}" <<'PY'
import json
import sys

with open(sys.argv[1], encoding="utf-8") as handle:
    print(json.load(handle)["model_abs"])
PY
)"

# Banner describing the run that is about to start.
printf '%s\n' \
  "Benchmark run directory: ${RUN_DIR}" \
  "Variant: ${VARIANT}" \
  "Model: ${MODEL_PATH}" \
  "Warm-up runs: ${WARMUP_RUNS}, measured runs: ${MEASURED_RUNS}" \
  "Threads: ${THREADS}, processors: ${PROCESSORS}" \
  "Fixed CLI flags: ${CLI_ARGS[*]}" \
  "Audio set: ${AUDIO_KEYS[*]}"
#######################################
# Executes a numbered series of identical whisper-cli runs for one sample.
# Globals:   CLI_BIN, MODEL_PATH, THREADS, PROCESSORS, CLI_ARGS (read)
# Arguments: $1 - audio key
#            $2 - audio file path
#            $3 - audio duration in seconds
#            $4 - directory receiving the log and metadata files
#            $5 - label shown in the progress line
#            $6 - run kind stored in the metadata
#            $7 - file-name prefix for the log and metadata files
#            $8 - number of runs; 0 runs nothing
# Outputs:   one progress line per run on stdout
#######################################
run_series() {
  local audio_key="$1"
  local audio_path="$2"
  local audio_duration="$3"
  local out_dir="$4"
  local label="$5"
  local run_kind="$6"
  local file_prefix="$7"
  local total="$8"
  local i idx

  # Arithmetic loop rather than $(seq 1 N): BSD/macOS seq counts DOWN when
  # the end is below the start, so "seq 1 0" would yield two runs, not none.
  for (( i = 1; i <= total; i++ )); do
    printf -v idx "%02d" "${i}"
    echo "[${audio_key}] ${label} ${i}/${total}"
    run_one \
      "${out_dir}/${file_prefix}_${idx}.log" \
      "${out_dir}/${file_prefix}_${idx}.meta.json" \
      "${audio_key}" \
      "${audio_duration}" \
      "${run_kind}" \
      "${i}" \
      "${CLI_BIN}" -m "${MODEL_PATH}" -f "${audio_path}" -t "${THREADS}" -p "${PROCESSORS}" "${CLI_ARGS[@]}"
  done
}

for audio_key in "${AUDIO_KEYS[@]}"; do
  audio_path="${AUDIO_DIR}/${audio_key}.wav"
  # Duration is taken from the validation output for this sample.
  audio_duration="$(python3 - "${VALIDATION_JSON}" "${audio_key}" <<'PY'
import json
import sys
from pathlib import Path

data = json.loads(Path(sys.argv[1]).read_text(encoding="utf-8"))
print(data["audio"][sys.argv[2]]["duration_s"])
PY
)"
  audio_run_dir="${RAW_DIR}/${audio_key}"
  mkdir -p "${audio_run_dir}"

  # Warm-up runs come first and are kept in separately named files.
  run_series "${audio_key}" "${audio_path}" "${audio_duration}" "${audio_run_dir}" \
    "warm-up" "warmup" "warmup" "${WARMUP_RUNS}"
  run_series "${audio_key}" "${audio_path}" "${audio_duration}" "${audio_run_dir}" \
    "measured" "measured" "run" "${MEASURED_RUNS}"
done
# Hand the raw results to the parser, together with the reference
# transcripts and the WER/CER limits.
PARSE_ARGS=(
  --run-dir "${RUN_DIR}"
  --refs-dir "${REFS_DIR}"
  --max-wer "${MAX_WER}"
  --max-cer "${MAX_CER}"
)
case "${ENFORCE_CORRECTNESS}" in
  1)
    PARSE_ARGS+=( "--enforce-correctness" )
    ;;
esac
python3 "${SCRIPT_DIR}/parse_results.py" "${PARSE_ARGS[@]}"

# List the report files for this run.
echo "Completed benchmark parsing:"
for artifact in runs.csv summary.csv summary.md correctness.json; do
  echo " ${RUN_DIR}/${artifact}"
done