Merge 4f2b6ff9ea into fc674574ca
This commit is contained in:
commit
a4e1e03fa2
|
|
@ -41,6 +41,7 @@ struct whisper_params {
|
|||
int32_t progress_step = 5;
|
||||
int32_t max_context = -1;
|
||||
int32_t max_len = 0;
|
||||
int32_t seg_len_hint = 0;
|
||||
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
|
||||
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
|
||||
int32_t audio_ctx = 0;
|
||||
|
|
@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
|||
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); }
|
||||
|
|
@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
|
|||
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||
fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint);
|
||||
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
|
||||
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
||||
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
||||
|
|
@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) {
|
|||
wparams.thold_pt = params.word_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
wparams.split_on_word = params.split_on_word;
|
||||
wparams.seg_len_hint = params.seg_len_hint;
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
|
||||
wparams.debug_mode = params.debug_mode;
|
||||
|
|
|
|||
|
|
@ -78,6 +78,7 @@ struct whisper_params {
|
|||
int32_t progress_step = 5;
|
||||
int32_t max_context = -1;
|
||||
int32_t max_len = 0;
|
||||
int32_t seg_len_hint = 0;
|
||||
int32_t best_of = 2;
|
||||
int32_t beam_size = -1;
|
||||
int32_t audio_ctx = 0;
|
||||
|
|
@ -146,6 +147,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|||
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||
fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint);
|
||||
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
|
||||
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
||||
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
||||
|
|
@ -218,6 +220,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
|||
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
||||
else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(argv[++i]); }
|
||||
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
||||
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
||||
|
|
@ -502,6 +505,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
|
|||
{
|
||||
params.max_len = std::stoi(req.get_file_value("max_len").content);
|
||||
}
|
||||
if (req.has_file("seg_len_hint"))
|
||||
{
|
||||
params.seg_len_hint = std::stoi(req.get_file_value("seg_len_hint").content);
|
||||
}
|
||||
if (req.has_file("best_of"))
|
||||
{
|
||||
params.best_of = std::stoi(req.get_file_value("best_of").content);
|
||||
|
|
@ -932,6 +939,8 @@ int main(int argc, char ** argv) {
|
|||
wparams.greedy.best_of = params.best_of;
|
||||
wparams.beam_search.beam_size = params.beam_size;
|
||||
|
||||
wparams.seg_len_hint = params.seg_len_hint;
|
||||
|
||||
wparams.temperature = params.temperature;
|
||||
wparams.no_speech_thold = params.no_speech_thold;
|
||||
wparams.temperature_inc = params.temperature_inc;
|
||||
|
|
|
|||
|
|
@ -508,6 +508,7 @@ extern "C" {
|
|||
int max_len; // max segment length in characters
|
||||
bool split_on_word; // split on word rather than on token (when used with max_len)
|
||||
int max_tokens; // max tokens per segment (0 = no limit)
|
||||
int seg_len_hint; // target segment length in ms — thins timestamps in context to discourage short segments (0 = off)
|
||||
|
||||
// [EXPERIMENTAL] speed-up techniques
|
||||
// note: these can significantly reduce the quality of the output
|
||||
|
|
|
|||
|
|
@ -5936,6 +5936,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|||
/*.max_len =*/ 0,
|
||||
/*.split_on_word =*/ false,
|
||||
/*.max_tokens =*/ 0,
|
||||
/*.seg_len_hint =*/ 0,
|
||||
|
||||
/*.debug_mode =*/ false,
|
||||
/*.audio_ctx =*/ 0,
|
||||
|
|
@ -6905,6 +6906,9 @@ int whisper_full_with_state(
|
|||
// calculate the maximum context budget for prompt history
|
||||
const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
|
||||
|
||||
// track last timestamp kept in prompt context for seg_len_hint thinning
|
||||
int last_prompt_ts = 0;
|
||||
|
||||
// prepare prompt
|
||||
{
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
|
|
@ -7594,9 +7598,32 @@ int whisper_full_with_state(
|
|||
}
|
||||
|
||||
// Add newly decoded tokens to the rolling context
|
||||
// When seg_len_hint is set, thin out timestamp tokens in the context to prevent
|
||||
// the model from conditioning on frequent segment breaks (which causes
|
||||
// progressively shorter segments)
|
||||
if (!is_no_speech) {
|
||||
const whisper_token token_beg = whisper_token_beg(ctx);
|
||||
const whisper_token token_eot = whisper_token_eot(ctx);
|
||||
// convert seg_len_hint from ms to 20ms timestamp steps (integer division: hints below 20 ms yield 0, which disables thinning)
|
||||
const int min_timestamp_gap = params.seg_len_hint / 20;
|
||||
|
||||
for (int i = 0; i < result_len; ++i) {
|
||||
prompt_past1.push_back(tokens_cur[i].id);
|
||||
const whisper_token id = tokens_cur[i].id;
|
||||
if (id >= token_eot && id <= token_beg) {
|
||||
// special token in the inclusive range [token_eot, token_beg] (eot, sot, etc.) — skipped regardless of seg_len_hint
|
||||
continue;
|
||||
}
|
||||
if (min_timestamp_gap > 0 && id > token_beg) {
|
||||
// timestamp token — only keep if enough time since last one
|
||||
const int ts = id - token_beg;
|
||||
if (ts - last_prompt_ts >= min_timestamp_gap) {
|
||||
last_prompt_ts = ts;
|
||||
prompt_past1.push_back(id);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// regular text token (or timestamp when thinning is off, i.e. seg_len_hint < 20 ms) — always keep
|
||||
prompt_past1.push_back(id);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue