whisper : add --seg-len-hint to discourage progressively shorter segments

When processing long audio, whisper tends to produce progressively
shorter segments because timestamp tokens in the decoder prompt context
condition the model to insert more frequent segment breaks.

Add a seg_len_hint parameter (in ms) that thins timestamp tokens in
the rolling prompt context, keeping at most one per seg_len_hint
interval. This breaks the feedback loop while preserving text tokens
for continuity. The model can still break on natural boundaries
(speaker turns, pauses) — the hint only affects context conditioning,
not the actual segment creation.

Usage: --seg-len-hint 2000 (for ~2 second target segments)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Liz Fong-Jones 2026-04-03 21:29:08 -07:00
parent 95ea8f9bfb
commit 24a436d350
3 changed files with 33 additions and 1 deletions

View File

@ -41,6 +41,7 @@ struct whisper_params {
int32_t progress_step = 5;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t seg_len_hint = 0;
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
int32_t audio_ctx = 0;
@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); }
else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(ARGV_NEXT); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); }
@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint);
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) {
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
wparams.seg_len_hint = params.seg_len_hint;
wparams.audio_ctx = params.audio_ctx;
wparams.debug_mode = params.debug_mode;

View File

@ -508,6 +508,7 @@ extern "C" {
int max_len; // max segment length in characters
bool split_on_word; // split on word rather than on token (when used with max_len)
int max_tokens; // max tokens per segment (0 = no limit)
int seg_len_hint; // target segment length in ms — thins timestamps in context to discourage short segments (0 = off)
// [EXPERIMENTAL] speed-up techniques
// note: these can significantly reduce the quality of the output

View File

@ -5927,6 +5927,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.max_len =*/ 0,
/*.split_on_word =*/ false,
/*.max_tokens =*/ 0,
/*.seg_len_hint =*/ 0,
/*.debug_mode =*/ false,
/*.audio_ctx =*/ 0,
@ -6896,6 +6897,9 @@ int whisper_full_with_state(
// calculate the maximum context budget for prompt history
const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
// track last timestamp kept in prompt context for seg_len_hint thinning
int last_prompt_ts = 0;
// prepare prompt
{
std::vector<whisper_token> prompt_tokens;
@ -7585,9 +7589,32 @@ int whisper_full_with_state(
}
// Add newly decoded tokens to the rolling context
// When seg_len_hint is set, thin out timestamp tokens in the context to prevent
// the model from conditioning on frequent segment breaks (which causes
// progressively shorter segments)
if (!is_no_speech) {
const whisper_token token_beg = whisper_token_beg(ctx);
const whisper_token token_eot = whisper_token_eot(ctx);
// convert seg_len_hint from ms to 20ms timestamp steps (0 => thinning disabled)
const int min_timestamp_gap = params.seg_len_hint / 20;
for (int i = 0; i < result_len; ++i) {
const whisper_token id = tokens_cur[i].id;
// Only filter the context when the hint is active; with seg_len_hint == 0
// every decoded token is kept, matching the pre-feature behavior exactly.
if (min_timestamp_gap > 0) {
if (id >= token_eot && id <= token_beg) {
// special non-timestamp token (eot, sot, etc.) — skip
continue;
}
if (id > token_beg) {
// timestamp token — keep only if enough time has passed since the
// last timestamp retained in the context, so the model is not
// conditioned on frequent segment breaks
// NOTE(review): last_prompt_ts is never reset in the visible code —
// confirm it is re-initialized per processing chunk upstream
const int ts = id - token_beg;
if (ts - last_prompt_ts >= min_timestamp_gap) {
last_prompt_ts = ts;
prompt_past1.push_back(id);
}
continue;
}
}
// regular text token (or any token when seg_len_hint == 0) — keep exactly once
// (the original hunk also pushed unconditionally at the top of the loop,
// duplicating text tokens and re-adding the tokens the filter dropped)
prompt_past1.push_back(id);
}
}