This commit is contained in:
Liz Fong-Jones 2026-04-23 13:31:26 +05:30 committed by GitHub
commit a4e1e03fa2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 42 additions and 1 deletion

View File

@ -41,6 +41,7 @@ struct whisper_params {
// (diff fragment) struct whisper_params — CLI example; this hunk adds seg_len_hint
int32_t progress_step = 5;  // NOTE(review): presumably the progress-callback step — confirm units
int32_t max_context = -1;   // maximum number of text context tokens to store
int32_t max_len = 0;        // maximum segment length in characters
int32_t seg_len_hint = 0;   // NEW: target segment length in ms (0 = off), set via -slh/--seg-len-hint
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
int32_t audio_ctx = 0;
@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
// (diff fragment) whisper_params_parse — wires up the new -slh/--seg-len-hint flag
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); }
else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(ARGV_NEXT); } // NEW
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); }
@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
// (diff fragment) whisper_print_usage — adds the -slh flag to --help output
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint); // NEW
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) {
// (diff fragment) main() — forward parsed CLI params into whisper_full_params
wparams.thold_pt = params.word_thold;
// word-level timestamp output (output_wts) needs a max_len; default to 60 chars when unset
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
wparams.seg_len_hint = params.seg_len_hint; // NEW: consumed by whisper_full_with_state
wparams.audio_ctx = params.audio_ctx;
wparams.debug_mode = params.debug_mode;

View File

@ -78,6 +78,7 @@ struct whisper_params {
// (diff fragment) struct whisper_params — server example; mirrors the CLI params struct
int32_t progress_step = 5;  // NOTE(review): presumably the progress-callback step — confirm units
int32_t max_context = -1;   // maximum number of text context tokens to store
int32_t max_len = 0;        // maximum segment length in characters
int32_t seg_len_hint = 0;   // NEW: target segment length in ms (0 = off)
int32_t best_of = 2;
int32_t beam_size = -1;
int32_t audio_ctx = 0;
@ -146,6 +147,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
// (diff fragment) server whisper_print_usage — adds the -slh flag to --help output
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint); // NEW
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
@ -218,6 +220,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
// (diff fragment) server whisper_params_parse — wires up -slh/--seg-len-hint
// NOTE(review): argv[++i] is read without a bounds check here — confirm the
// surrounding loop guards against a flag given as the last argument
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(argv[++i]); } // NEW
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
@ -502,6 +505,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
{
    params.max_len = std::stoi(req.get_file_value("max_len").content);
}
// NEW: accept seg_len_hint as an optional multipart form field from HTTP clients
// NOTE(review): std::stoi throws std::invalid_argument/std::out_of_range on
// non-numeric form data — confirm the request handler catches this
if (req.has_file("seg_len_hint"))
{
    params.seg_len_hint = std::stoi(req.get_file_value("seg_len_hint").content);
}
if (req.has_file("best_of"))
{
    params.best_of = std::stoi(req.get_file_value("best_of").content);
@ -932,6 +939,8 @@ int main(int argc, char ** argv) {
// (diff fragment) server main() — copy per-request params into whisper_full_params
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.seg_len_hint = params.seg_len_hint; // NEW: consumed by whisper_full_with_state
wparams.temperature = params.temperature;
wparams.no_speech_thold = params.no_speech_thold;
wparams.temperature_inc = params.temperature_inc;

View File

@ -508,6 +508,7 @@ extern "C" {
// (diff fragment) whisper.h — whisper_full_params gains the seg_len_hint field
int max_len;        // max segment length in characters
bool split_on_word; // split on word rather than on token (when used with max_len)
int max_tokens;     // max tokens per segment (0 = no limit)
int seg_len_hint;   // NEW: target segment length in ms — thins timestamp tokens kept in
                    // the prompt context to discourage short segments (0 = off)
// [EXPERIMENTAL] speed-up techniques
// note: these can significantly reduce the quality of the output

View File

@ -5936,6 +5936,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
// (diff fragment) whisper_full_default_params — positional initializer list: the
// order here must match the field declaration order in whisper.h exactly
/*.max_len =*/ 0,
/*.split_on_word =*/ false,
/*.max_tokens =*/ 0,
/*.seg_len_hint =*/ 0, // NEW: inserted between max_tokens and debug_mode, matching the header
/*.debug_mode =*/ false,
/*.audio_ctx =*/ 0,
@ -6905,6 +6906,9 @@ int whisper_full_with_state(
// calculate the maximum context budget for prompt history
const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
// track last timestamp kept in prompt context for seg_len_hint thinning
// (units: 20 ms timestamp steps, i.e. token id minus token_beg)
// NOTE(review): this is initialized once and apparently never reset between decode
// windows — confirm the gap computation stays correct after the per-window
// timestamp counter restarts from 0
int last_prompt_ts = 0;
// prepare prompt
{
    std::vector<whisper_token> prompt_tokens;
@ -7594,9 +7598,32 @@ int whisper_full_with_state(
}
// Add newly decoded tokens to the rolling context.
// When seg_len_hint is set, thin out timestamp tokens in the context to prevent
// the model from conditioning on frequent segment breaks (which causes
// progressively shorter segments).
if (!is_no_speech) {
    const whisper_token token_beg = whisper_token_beg(ctx);
    const whisper_token token_eot = whisper_token_eot(ctx);
    // convert seg_len_hint from ms to 20 ms timestamp steps
    // (integer division: values below 20 ms collapse to 0, i.e. thinning off)
    const int min_timestamp_gap = params.seg_len_hint / 20;
    for (int i = 0; i < result_len; ++i) {
        // FIX: removed the unconditional push_back of tokens_cur[i].id that
        // preceded this classification — it duplicated text tokens in the
        // context and made the timestamp thinning below a no-op.
        const whisper_token id = tokens_cur[i].id;
        if (id >= token_eot && id <= token_beg) {
            // special non-timestamp token (eot, sot, etc.) — skip
            // NOTE(review): this range also drops token_beg itself, the
            // <|0.00|> timestamp — confirm that is intended
            continue;
        }
        if (min_timestamp_gap > 0 && id > token_beg) {
            // timestamp token — keep only if far enough from the last kept one
            const int ts = id - token_beg; // 20 ms steps since window start
            if (ts - last_prompt_ts >= min_timestamp_gap) {
                last_prompt_ts = ts;
                prompt_past1.push_back(id);
            }
            continue;
        }
        // regular text token (or any timestamp when seg_len_hint == 0) — always keep
        prompt_past1.push_back(id);
    }
}