Merge d3aca6a9de into fc674574ca

2026-04-20 10:27:20 +03:00 · 2026-04-20 10:27:20 +03:00 · 09f4775fc4
parent fc674574ca d3aca6a9de
commit 09f4775fc4
1 changed files with 16 additions and 2 deletions
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -8039,11 +8039,25 @@ whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segm
 }
 struct whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token) {
-    return state->result_all[i_segment].tokens[i_token];
+    whisper_token_data token = state->result_all[i_segment].tokens[i_token];
    // `token` is a local copy of the stored entry: the remapping below only
    // rewrites the copy that is returned, not the data kept in
    // state->result_all. i_segment and i_token are used directly as indices;
    // no range check is performed in this function.
    //
    // Map VAD-processed token timestamps back to the original audio timeline.
    // Without this, tokens report timestamps in the VAD-stripped timeline
    // (starting at 0 after whisper_full_with_state sees only the speech-
    // filtered samples), while segment timestamps are already mapped back by
    // whisper_full_get_segment_t0/t1_from_state. The two diverge when VAD
    // strips a non-speech prefix (e.g. music before speech).
    // ref: https://github.com/ggml-org/whisper.cpp/issues/3754
    //
    // The mapping is applied only when VAD segments were recorded and the
    // mapping table is non-empty; otherwise the token is returned as stored.
    //
    // NOTE(review): only t0 and t1 are remapped here. If whisper_token_data
    // carries other time-valued fields, confirm whether they need the same
    // treatment.
    // NOTE(review): if t0/t1 can hold a "not available" placeholder value,
    // verify that map_processed_to_original_time leaves it recognizable.
    if (state->has_vad_segments && !state->vad_mapping_table.empty()) {
        token.t0 = map_processed_to_original_time(token.t0, state->vad_mapping_table);
        token.t1 = map_processed_to_original_time(token.t1, state->vad_mapping_table);
    }
    return token;
 }
 struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token) {
    // Context-based variant: forwards to
    // whisper_full_get_token_data_from_state with ctx->state, so both entry
    // points return identical token data (including any timestamp remapping
    // performed there) instead of reading result_all directly.
    // ctx and ctx->state are not checked for null here.
-    return ctx->state->result_all[i_segment].tokens[i_token];
+    return whisper_full_get_token_data_from_state(ctx->state, i_segment, i_token);
 }
 float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {