From 166c20b473d5f4d04052e699f992f625ea2a2fdd Mon Sep 17 00:00:00 2001 From: Daniel Worthington-Bodart Date: Fri, 17 Apr 2026 12:36:27 +0100 Subject: [PATCH] whisper : add state-preserving VAD detect + explicit state reset for streaming (#3677) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit whisper_vad_detect_speech resets LSTM state on every call, which is correct for batch processing but prevents temporal continuity when calling per-chunk in a streaming loop. Add whisper_vad_detect_speech_no_reset (skips buffer clear) and whisper_vad_reset_state (explicit clear between utterances). Existing whisper_vad_detect_speech is now a thin wrapper — zero behavior change for current callers. Co-authored-by: Claude Opus 4.6 (1M context) --- include/whisper.h | 10 ++++++++++ src/whisper.cpp | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/include/whisper.h b/include/whisper.h index f4cc6bf7..b5dcdb29 100644 --- a/include/whisper.h +++ b/include/whisper.h @@ -695,6 +695,16 @@ extern "C" { const float * samples, int n_samples); + // Like whisper_vad_detect_speech, but does not reset LSTM state. + // Use for streaming: call whisper_vad_reset_state() between utterances. + WHISPER_API bool whisper_vad_detect_speech_no_reset( + struct whisper_vad_context * vctx, + const float * samples, + int n_samples); + + // Reset LSTM hidden/cell states to zero. 
+ WHISPER_API void whisper_vad_reset_state(struct whisper_vad_context * vctx); + WHISPER_API int whisper_vad_n_probs(struct whisper_vad_context * vctx); WHISPER_API float * whisper_vad_probs (struct whisper_vad_context * vctx); diff --git a/src/whisper.cpp b/src/whisper.cpp index 86bfafea..2f356da0 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -5083,7 +5083,11 @@ struct whisper_vad_context * whisper_vad_init_with_params( return vctx; } -bool whisper_vad_detect_speech( +void whisper_vad_reset_state(whisper_vad_context * vctx) { + ggml_backend_buffer_clear(vctx->buffer, 0); /* zero-fill the buffer that holds the LSTM hidden/cell states */ +} + +bool whisper_vad_detect_speech_no_reset( struct whisper_vad_context * vctx, const float * samples, int n_samples) { @@ -5095,9 +5099,6 @@ bool whisper_vad_detect_speech( WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples); WHISPER_LOG_INFO("%s: n_chunks: %d\n", __func__, n_chunks); - // Reset LSTM hidden/cell states - ggml_backend_buffer_clear(vctx->buffer, 0); - vctx->probs.resize(n_chunks); WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks); @@ -5165,6 +5166,14 @@ bool whisper_vad_detect_speech( return true; } +bool whisper_vad_detect_speech( + struct whisper_vad_context * vctx, + const float * samples, + int n_samples) { + whisper_vad_reset_state(vctx); /* batch entry point: clear the LSTM state before every call, as before this patch */ + return whisper_vad_detect_speech_no_reset(vctx, samples, n_samples); +} + int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments) { return segments->data.size(); }