whisper : add --seg-len-hint to discourage progressively shorter segments

When processing long audio, whisper tends to produce progressively
shorter segments because timestamp tokens in the decoder prompt context
condition the model to insert more frequent segment breaks.

Add a seg_len_hint parameter (in ms) that thins timestamp tokens in
the rolling prompt context, keeping at most one per seg_len_hint
interval. This breaks the feedback loop while preserving text tokens
for continuity. The model can still break on natural boundaries
(speaker turns, pauses) — the hint only affects context conditioning,
not the actual segment creation.

Usage: --seg-len-hint 2000 (for ~2 second target segments)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Liz Fong-Jones 2026-04-03 21:29:08 -07:00
parent 95ea8f9bfb
commit 24a436d350
3 changed files with 33 additions and 1 deletions

View File

@ -41,6 +41,7 @@ struct whisper_params {
int32_t progress_step = 5;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t seg_len_hint = 0;
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
int32_t audio_ctx = 0;
@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); }
else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(ARGV_NEXT); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); }
@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint);
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) {
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
wparams.seg_len_hint = params.seg_len_hint;
wparams.audio_ctx = params.audio_ctx;
wparams.debug_mode = params.debug_mode;

View File

@ -508,6 +508,7 @@ extern "C" {
int max_len; // max segment length in characters
bool split_on_word; // split on word rather than on token (when used with max_len)
int max_tokens; // max tokens per segment (0 = no limit)
int seg_len_hint; // target segment length in ms — thins timestamps in context to discourage short segments (0 = off)
// [EXPERIMENTAL] speed-up techniques
// note: these can significantly reduce the quality of the output

View File

@ -5927,6 +5927,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.max_len =*/ 0,
/*.split_on_word =*/ false,
/*.max_tokens =*/ 0,
/*.seg_len_hint =*/ 0,
/*.debug_mode =*/ false,
/*.audio_ctx =*/ 0,
@ -6896,6 +6897,9 @@ int whisper_full_with_state(
// calculate the maximum context budget for prompt history
const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
// track last timestamp kept in prompt context for seg_len_hint thinning
int last_prompt_ts = 0;
// prepare prompt
{
std::vector<whisper_token> prompt_tokens;
@ -7585,9 +7589,32 @@ int whisper_full_with_state(
}
// Add newly decoded tokens to the rolling context
// When seg_len_hint is set, thin out timestamp tokens in the context to prevent
// the model from conditioning on frequent segment breaks (which causes
// progressively shorter segments)
if (!is_no_speech) {
const whisper_token token_beg = whisper_token_beg(ctx);
const whisper_token token_eot = whisper_token_eot(ctx);
// convert seg_len_hint from ms to 20ms timestamp steps (0 => thinning disabled)
const int min_timestamp_gap = params.seg_len_hint / 20;
for (int i = 0; i < result_len; ++i) {
const whisper_token id = tokens_cur[i].id;
// Only filter the context when the hint is active; with seg_len_hint == 0
// every decoded token is kept, matching the pre-feature behavior exactly.
if (min_timestamp_gap > 0) {
if (id >= token_eot && id <= token_beg) {
// special non-timestamp token (eot, sot, etc.) — skip
continue;
}
if (id > token_beg) {
// timestamp token — keep only if enough time has passed since the
// last timestamp retained in the context, so the model is not
// conditioned on frequent segment breaks
// NOTE(review): last_prompt_ts is never reset in the visible code —
// confirm it is re-initialized per processing chunk upstream
const int ts = id - token_beg;
if (ts - last_prompt_ts >= min_timestamp_gap) {
last_prompt_ts = ts;
prompt_past1.push_back(id);
}
continue;
}
}
// regular text token (or any token when seg_len_hint == 0) — keep exactly once
// (the original hunk also pushed unconditionally at the top of the loop,
// duplicating text tokens and re-adding the tokens the filter dropped)
prompt_past1.push_back(id);
}
}