Merge 4f2b6ff9ea into fc674574ca

2026-04-23 13:31:26 +05:30 · 2026-04-23 13:31:26 +05:30 · a4e1e03fa2
parent fc674574ca 4f2b6ff9ea
commit a4e1e03fa2
4 changed files with 42 additions and 1 deletions
--- a/examples/cli/cli.cpp
+++ b/examples/cli/cli.cpp
@ -41,6 +41,7 @@ struct whisper_params {
    int32_t progress_step = 5;
    int32_t max_context   = -1;
    int32_t max_len       = 0;
+    int32_t seg_len_hint  = 0;
    int32_t best_of       = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
    int32_t beam_size     = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
    int32_t audio_ctx     = 0;
@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
        else if (arg == "-d"    || arg == "--duration")             { params.duration_ms     = std::stoi(ARGV_NEXT); }
        else if (arg == "-mc"   || arg == "--max-context")          { params.max_context     = std::stoi(ARGV_NEXT); }
        else if (arg == "-ml"   || arg == "--max-len")              { params.max_len         = std::stoi(ARGV_NEXT); }
+        else if (arg == "-slh"  || arg == "--seg-len-hint")         { params.seg_len_hint    = std::stoi(ARGV_NEXT); }
        else if (arg == "-bo"   || arg == "--best-of")              { params.best_of         = std::stoi(ARGV_NEXT); }
        else if (arg == "-bs"   || arg == "--beam-size")            { params.beam_size       = std::stoi(ARGV_NEXT); }
        else if (arg == "-ac"   || arg == "--audio-ctx")            { params.audio_ctx       = std::stoi(ARGV_NEXT); }
@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
    fprintf(stderr, "  -d  N,     --duration N           [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
    fprintf(stderr, "  -mc N,     --max-context N        [%-7d] maximum number of text context tokens to store\n", params.max_context);
    fprintf(stderr, "  -ml N,     --max-len N            [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -slh N,    --seg-len-hint N       [%-7d] target segment length in ms\n",                    params.seg_len_hint);
    fprintf(stderr, "  -sow,      --split-on-word        [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
    fprintf(stderr, "  -bo N,     --best-of N            [%-7d] number of best candidates to keep\n",              params.best_of);
    fprintf(stderr, "  -bs N,     --beam-size N          [%-7d] beam size for beam search\n",                      params.beam_size);
@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) {
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
            wparams.split_on_word    = params.split_on_word;
+            wparams.seg_len_hint     = params.seg_len_hint;
            wparams.audio_ctx        = params.audio_ctx;

            wparams.debug_mode       = params.debug_mode;
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -78,6 +78,7 @@ struct whisper_params {
    int32_t progress_step = 5;
    int32_t max_context   = -1;
    int32_t max_len       = 0;
+    int32_t seg_len_hint  = 0;
    int32_t best_of       = 2;
    int32_t beam_size     = -1;
    int32_t audio_ctx     = 0;
@ -146,6 +147,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -slh N,    --seg-len-hint N    [%-7d] target segment length in ms\n",                    params.seg_len_hint);
    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
@ -218,6 +220,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (arg == "-d"    || arg == "--duration")        { params.duration_ms     = std::stoi(argv[++i]); }
        else if (arg == "-mc"   || arg == "--max-context")     { params.max_context     = std::stoi(argv[++i]); }
        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
+        else if (arg == "-slh"  || arg == "--seg-len-hint")    { params.seg_len_hint    = std::stoi(argv[++i]); }
        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
        else if (arg == "-ac"   || arg == "--audio-ctx")       { params.audio_ctx       = std::stoi(argv[++i]); }
@ -502,6 +505,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
    {
        params.max_len = std::stoi(req.get_file_value("max_len").content);
    }
+    if (req.has_file("seg_len_hint"))
+    {
+        params.seg_len_hint = std::stoi(req.get_file_value("seg_len_hint").content);
+    }
    if (req.has_file("best_of"))
    {
        params.best_of = std::stoi(req.get_file_value("best_of").content);
@ -932,6 +939,8 @@ int main(int argc, char ** argv) {
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;

+            wparams.seg_len_hint     = params.seg_len_hint;
+
            wparams.temperature      = params.temperature;
            wparams.no_speech_thold = params.no_speech_thold;
            wparams.temperature_inc  = params.temperature_inc;
--- a/include/whisper.h
+++ b/include/whisper.h
@ -508,6 +508,7 @@ extern "C" {
        int   max_len;          // max segment length in characters
        bool  split_on_word;    // split on word rather than on token (when used with max_len)
        int   max_tokens;       // max tokens per segment (0 = no limit)
+        int   seg_len_hint;     // target segment length in ms — thins timestamps in context to discourage short segments (0 = off)

        // [EXPERIMENTAL] speed-up techniques
        // note: these can significantly reduce the quality of the output
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@ -5936,6 +5936,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.max_len           =*/ 0,
        /*.split_on_word     =*/ false,
        /*.max_tokens        =*/ 0,
+        /*.seg_len_hint      =*/ 0,

        /*.debug_mode        =*/ false,
        /*.audio_ctx         =*/ 0,
@ -6905,6 +6906,9 @@ int whisper_full_with_state(
    // calculate the maximum context budget for prompt history
    const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);

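+    // last timestamp (id - token_beg, in timestamp steps) kept in the prompt context for seg_len_hint thinning; NOTE(review): never reset in the visible code - confirm this is correct if timestamp ids can restart at a lower value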
+    int last_prompt_ts = 0;
+
    // prepare prompt
    {
        std::vector<whisper_token> prompt_tokens;
@ -7594,9 +7598,32 @@ int whisper_full_with_state(
            }

            // Add newly decoded tokens to the rolling context
+            // When seg_len_hint is set, thin out timestamp tokens in the context to prevent
+            // the model from conditioning on frequent segment breaks (which causes
+            // progressively shorter segments)
            if (!is_no_speech) {
+                const whisper_token token_beg = whisper_token_beg(ctx);
+                const whisper_token token_eot = whisper_token_eot(ctx);
+                // convert seg_len_hint from ms to 20 ms timestamp steps (integer division: values below 20 yield 0, which disables thinning)
+                const int min_timestamp_gap = params.seg_len_hint / 20;
+
                for (int i = 0; i < result_len; ++i) {
-                    prompt_past1.push_back(tokens_cur[i].id);
+                    const whisper_token id = tokens_cur[i].id;
+                    if (id >= token_eot && id <= token_beg) {
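+                        // token in [token_eot, token_beg] - skipped even when seg_len_hint is 0; NOTE(review): the range includes token_beg itself - confirm it is not the first timestamp token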
+                        continue;
+                    }
+                    if (min_timestamp_gap > 0 && id > token_beg) {
+                        // timestamp token — only keep if enough time since last one
+                        const int ts = id - token_beg;
+                        if (ts - last_prompt_ts >= min_timestamp_gap) {
+                            last_prompt_ts = ts;
+                            prompt_past1.push_back(id);
+                        }
+                        continue;
+                    }
+                    // regular text token (or timestamp above token_beg when thinning is disabled, i.e. seg_len_hint < 20) - always keep
+                    prompt_past1.push_back(id);
                }
            }