From b91fe0f390acc60cd4b94f6b63a0af7e1c6e9734 Mon Sep 17 00:00:00 2001
From: Martin Destagnol
Date: Wed, 17 Sep 2025 10:23:49 -1000
Subject: [PATCH] Fixed VAD to work when using whisper_full_with_state

---
 src/whisper.cpp | 61 +++++++++++++++++++++----------------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 52de68c2..f150bfcf 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6652,8 +6652,8 @@ static bool whisper_vad(
 
     if (vad_segments->data.size() > 0) {
         state->has_vad_segments = true;
-        ctx->state->vad_segments.clear();
-        ctx->state->vad_segments.reserve(vad_segments->data.size());
+        state->vad_segments.clear();
+        state->vad_segments.reserve(vad_segments->data.size());
 
         // Initialize the time mapping table
         state->vad_mapping_table.clear();
@@ -6749,7 +6749,7 @@ static bool whisper_vad(
 
             WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
                 __func__, segment.orig_start/100.0, segment.orig_end/100.0, segment.vad_start/100.0, segment.vad_end/100.0);
-            ctx->state->vad_segments.push_back(segment);
+            state->vad_segments.push_back(segment);
 
             // Copy this speech segment
             memcpy(filtered_samples.data() + offset, samples + segment_start_samples, segment_length * sizeof(float));
@@ -6820,6 +6820,24 @@ int whisper_full_with_state(
         }
     }
 
+    std::vector<float> vad_samples;
+    if (params.vad)
+    {
+        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
+        if (!whisper_vad(ctx, state, params, samples, n_samples, vad_samples))
+        {
+            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
+            return -1;
+        }
+        if (vad_samples.empty())
+        {
+            state->result_all.clear();
+            return 0;
+        }
+        samples = vad_samples.data();
+        n_samples = vad_samples.size();
+    }
+
     // auto-detect language if not specified
     if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) {
         std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f);
@@ -7720,25 +7738,11 @@ int whisper_full_with_state(
 }
 
 int whisper_full(
-    struct whisper_context * ctx,
-    struct whisper_full_params params,
-    const float * samples,
-    int n_samples) {
-
-    std::vector<float> vad_samples;
-    if (params.vad) {
-        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
-        if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
-            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
-            return -1;
-        }
-        if (vad_samples.empty()) {
-            ctx->state->result_all.clear();
-            return 0;
-        }
-        samples = vad_samples.data();
-        n_samples = vad_samples.size();
-    }
+    struct whisper_context *ctx,
+    struct whisper_full_params params,
+    const float *samples,
+    int n_samples)
+{
     return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
 }
 
@@ -7753,19 +7757,6 @@ int whisper_full_parallel(
         return whisper_full(ctx, params, samples, n_samples);
     }
 
-    std::vector<float> vad_samples;
-    if (params.vad) {
-        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
-        if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
-            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
-            return -1;
-        }
-        if (vad_samples.empty()) {
-            return 0;
-        }
-        samples = vad_samples.data();
-        n_samples = vad_samples.size();
-    }
     int ret = 0;
 
     // prepare separate states for each thread