whisper : add state-preserving VAD detect + explicit state reset for streaming (#3677)
whisper_vad_detect_speech resets the LSTM state on every call, which is correct for batch processing but breaks temporal continuity when it is called once per chunk in a streaming loop. Add whisper_vad_detect_speech_no_reset (skips the buffer clear) and whisper_vad_reset_state (explicit clear between utterances). The existing whisper_vad_detect_speech is now a thin wrapper that resets the state and then calls the no-reset variant — zero behavior change for current callers. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
95ea8f9bfb
commit
166c20b473
|
|
@ -695,6 +695,16 @@ extern "C" {
|
|||
const float * samples,
|
||||
int n_samples);
|
||||
|
||||
// Like whisper_vad_detect_speech(), but does not reset the LSTM state first, so state carries over from the previous call.
|
||||
// Use for streaming: call whisper_vad_reset_state() between utterances.
|
||||
WHISPER_API bool whisper_vad_detect_speech_no_reset(
|
||||
struct whisper_vad_context * vctx,
|
||||
const float * samples,
|
||||
int n_samples);
|
||||
|
||||
// Reset LSTM hidden/cell states to zero (call between utterances when using whisper_vad_detect_speech_no_reset()).
|
||||
WHISPER_API void whisper_vad_reset_state(struct whisper_vad_context * vctx);
|
||||
|
||||
WHISPER_API int whisper_vad_n_probs(struct whisper_vad_context * vctx);
|
||||
WHISPER_API float * whisper_vad_probs (struct whisper_vad_context * vctx);
|
||||
|
||||
|
|
|
|||
|
|
@ -5083,7 +5083,11 @@ struct whisper_vad_context * whisper_vad_init_with_params(
|
|||
return vctx;
|
||||
}
|
||||
|
||||
bool whisper_vad_detect_speech(
|
||||
// Reset the VAD LSTM hidden/cell states by zero-filling the context's backend buffer.
//
// Intended to be called between utterances when speech detection is driven
// through whisper_vad_detect_speech_no_reset(), which does not clear the
// state itself.
//
// vctx: VAD context whose state buffer is cleared. A null context, or a
//       context without an allocated buffer, is ignored (no-op) instead of
//       being dereferenced.
void whisper_vad_reset_state(whisper_vad_context * vctx) {
    if (vctx == nullptr || vctx->buffer == nullptr) {
        return;
    }

    // Fill every byte of the buffer with 0.
    ggml_backend_buffer_clear(vctx->buffer, 0);
}
|
||||
|
||||
bool whisper_vad_detect_speech_no_reset(
|
||||
struct whisper_vad_context * vctx,
|
||||
const float * samples,
|
||||
int n_samples) {
|
||||
|
|
@ -5095,9 +5099,6 @@ bool whisper_vad_detect_speech(
|
|||
WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples);
|
||||
WHISPER_LOG_INFO("%s: n_chunks: %d\n", __func__, n_chunks);
|
||||
|
||||
// Reset LSTM hidden/cell states
|
||||
ggml_backend_buffer_clear(vctx->buffer, 0);
|
||||
|
||||
vctx->probs.resize(n_chunks);
|
||||
WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
|
||||
|
||||
|
|
@ -5165,6 +5166,14 @@ bool whisper_vad_detect_speech(
|
|||
return true;
|
||||
}
|
||||
|
||||
bool whisper_vad_detect_speech(
|
||||
struct whisper_vad_context * vctx,
|
||||
const float * samples,
|
||||
int n_samples) {
|
||||
whisper_vad_reset_state(vctx);
|
||||
return whisper_vad_detect_speech_no_reset(vctx, samples, n_samples);
|
||||
}
|
||||
|
||||
int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments) {
|
||||
return segments->data.size();
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue