From 166c20b473d5f4d04052e699f992f625ea2a2fdd Mon Sep 17 00:00:00 2001
From: Daniel Worthington-Bodart <dan@bodar.com>
Date: Fri, 17 Apr 2026 12:36:27 +0100
Subject: [PATCH] whisper : add state-preserving VAD detect + explicit state
 reset for streaming (#3677)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

whisper_vad_detect_speech resets LSTM state on every call, which is
correct for batch processing but prevents temporal continuity when
calling per-chunk in a streaming loop.

Add whisper_vad_detect_speech_no_reset (skips buffer clear) and
whisper_vad_reset_state (explicit clear between utterances).
Existing whisper_vad_detect_speech is now a thin wrapper — zero
behavior change for current callers.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 include/whisper.h | 10 ++++++++++
 src/whisper.cpp   | 17 +++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/include/whisper.h b/include/whisper.h
index f4cc6bf7..b5dcdb29 100644
--- a/include/whisper.h
+++ b/include/whisper.h
@@ -695,6 +695,16 @@ extern "C" {
                            const float * samples,
                                    int   n_samples);
 
+    // Like whisper_vad_detect_speech, but does not reset LSTM state.
+    // Use for streaming: call whisper_vad_reset_state() between utterances.
+    WHISPER_API bool whisper_vad_detect_speech_no_reset(
+            struct whisper_vad_context * vctx,
+                           const float * samples,
+                                   int   n_samples);
+
+    // Reset LSTM hidden/cell states to zero.
+    WHISPER_API void whisper_vad_reset_state(struct whisper_vad_context * vctx);
+
     WHISPER_API int     whisper_vad_n_probs(struct whisper_vad_context * vctx);
     WHISPER_API float * whisper_vad_probs  (struct whisper_vad_context * vctx);
 
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 86bfafea..2f356da0 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -5083,7 +5083,11 @@ struct whisper_vad_context * whisper_vad_init_with_params(
     return vctx;
 }
 
-bool whisper_vad_detect_speech(
+void whisper_vad_reset_state(whisper_vad_context * vctx) {
+    ggml_backend_buffer_clear(vctx->buffer, 0);
+}
+
+bool whisper_vad_detect_speech_no_reset(
         struct whisper_vad_context * vctx,
         const float * samples,
         int n_samples) {
@@ -5095,9 +5099,6 @@ bool whisper_vad_detect_speech(
     WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples);
     WHISPER_LOG_INFO("%s: n_chunks: %d\n", __func__, n_chunks);
 
-    // Reset LSTM hidden/cell states
-    ggml_backend_buffer_clear(vctx->buffer, 0);
-
     vctx->probs.resize(n_chunks);
     WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
 
@@ -5165,6 +5166,14 @@ bool whisper_vad_detect_speech(
     return true;
 }
 
+bool whisper_vad_detect_speech(
+        struct whisper_vad_context * vctx,
+        const float * samples,
+        int n_samples) {
+    whisper_vad_reset_state(vctx);
+    return whisper_vad_detect_speech_no_reset(vctx, samples, n_samples);
+}
+
 int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments) {
     return segments->data.size();
 }