From 628466b06625758b1a7a46ce06ebf0d4d102cfc1 Mon Sep 17 00:00:00 2001 From: Achyut Krishna Byanjankar Date: Sat, 18 Apr 2026 17:08:16 -0700 Subject: [PATCH] whisper : skip decoding of zero-filled chunks on forced-language path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a specific language is forced (e.g. -l ru, -l es) and a 30-second decoder window is entirely zero-valued, whisper emits language-specific fallback tokens (bracketed music tags like [Música], fake subtitle-editor credits on -l ru). The auto-detect path handles silent chunks naturally. Add a chunk-level zero-PCM check at the top of the seek loop inside whisper_full_with_state. When the current window is all-zero and the caller forced a language, emit a single [BLANK_AUDIO] segment for that chunk and advance without running the encoder or decoder. Matches the approach endorsed in PR #1588 review ("skip entire segments when silence is detected"), using zero-PCM as a stricter and language-independent signal than no_speech_prob. The caller's original language intent is captured before the auto-detect block overwrites params.language, so the guard only fires when the user explicitly requested a specific language; auto-detect paths are unchanged. Fixes #1724 (residual hallucination on forced-language silence chunks not addressed by #2629) --- src/whisper.cpp | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/whisper.cpp b/src/whisper.cpp index 2f356da0..3f65c412 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -6800,6 +6800,14 @@ int whisper_full_with_state( result_all.clear(); + // Capture whether the caller forced a specific language before + // the auto-detect block below overwrites params.language. 
+ // ref: https://github.com/ggml-org/whisper.cpp/issues/1724 + const bool language_was_forced_by_caller = (params.language != nullptr + && strlen(params.language) > 0 + && strcmp(params.language, "auto") != 0 + && !params.detect_language); + if (n_samples > 0) { // compute log mel spectrogram if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) { @@ -7009,6 +7017,43 @@ int whisper_full_with_state( break; } + // Chunk-level zero-silence guard. When the current 30-second + // window is entirely zero-valued and the caller forced a + // specific language, emit a [BLANK_AUDIO] segment and advance + // without running the encoder/decoder. Without this, forced- + // language decoding of silent chunks emits language-specific + // fallback tokens (e.g. "[Музыка]" on -l ru, "[Música]" on + // -l es) or model-trained subtitle-credit phrases on -l ru. + // This matches the approach endorsed by the maintainer in + // PR #1588 review ("skip entire segments when silence is + // detected"), using zero-PCM as a stricter signal than the + // language-dependent no_speech_prob. 
+ // ref: https://github.com/ggml-org/whisper.cpp/issues/1724 + if (language_was_forced_by_caller && samples != nullptr) { + const int chunk_start = seek * (WHISPER_SAMPLE_RATE / 100); + const int chunk_end = std::min(n_samples, chunk_start + WHISPER_CHUNK_SIZE * WHISPER_SAMPLE_RATE); + if (chunk_start < chunk_end) { + bool chunk_is_zero = true; + for (int i = chunk_start; i < chunk_end; ++i) { + if (samples[i] != 0.0f) { + chunk_is_zero = false; + break; + } + } + if (chunk_is_zero) { + WHISPER_LOG_INFO("%s: chunk at seek=%d is zero-filled with forced language %s; emitting blank-audio and skipping decode (ref: #1724)\n", __func__, seek, params.language); + const int64_t t0 = seek; + const int64_t t1 = (int64_t) chunk_end * 100 / WHISPER_SAMPLE_RATE; + result_all.push_back({ t0, t1, " [BLANK_AUDIO]", 1.0f, {}, false }); + if (params.new_segment_callback && !ctx->params.dtw_token_timestamps) { + params.new_segment_callback(ctx, state, 1, params.new_segment_callback_user_data); + } + seek += WHISPER_CHUNK_SIZE * 100; + continue; + } + } + } + if (params.encoder_begin_callback) { if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) { WHISPER_LOG_ERROR("%s: encoder_begin_callback returned false - aborting\n", __func__);