From 27101c01dcac1676e2b6422256233cd0f1f9ae28 Mon Sep 17 00:00:00 2001 From: texasich <101962694+texasich@users.noreply.github.com> Date: Mon, 25 May 2026 23:23:41 -0500 Subject: [PATCH] cli : merge tokens split across UTF-8 boundaries in JSON output (#3751) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cli : merge tokens split across UTF-8 boundaries in JSON output When a multi-byte UTF-8 codepoint (most commonly a CJK character, 3 bytes) is split across multiple whisper tokens, the -ojf/--output-json-full writer emitted each token's partial bytes as its own JSON string, producing invalid UTF-8 that chokes downstream parsers. Merge adjacent tokens in output_json whenever the accumulated text still ends on an incomplete UTF-8 sequence. The merged entry keeps the first token's id/p/t_dtw and extends t1 to the last absorbed token, which matches how segment text is assembled elsewhere. Refs #1798 * fix: address review — add braces for consistency, use full issue URL - Add braces to if/else chain for codebase consistency - Use full URL for issue #1798 reference Review: @danbev --------- Co-authored-by: texasich Co-authored-by: texasich --- examples/cli/cli.cpp | 84 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 11 deletions(-) diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp index 4e84c1b27..55cd71b4e 100644 --- a/examples/cli/cli.cpp +++ b/examples/cli/cli.cpp @@ -31,6 +31,39 @@ static void replace_all(std::string & s, const std::string & search, const std:: } } +// Returns the number of trailing continuation bytes still needed for `s` to end +// on a complete UTF-8 codepoint. Returns 0 if the tail of `s` is already a +// complete codepoint (or if the tail looks malformed and we should stop merging). +// Used to merge whisper tokens whose bytes split a multi-byte UTF-8 character +// (e.g. CJK), so the JSON output stays valid UTF-8. See https://github.com/ggml-org/whisper.cpp/issues/1798. +static int utf8_trailing_bytes_needed(const std::string & s) { + const int n = (int) s.size(); + int i = n - 1; + // walk back past continuation bytes (10xxxxxx) + while (i >= 0 && ((unsigned char) s[i] & 0xC0) == 0x80) { + --i; + } + if (i < 0) { + // all continuation bytes, or empty — nothing we can do + return 0; + } + const unsigned char c = (unsigned char) s[i]; + int expected; + if ((c & 0x80) == 0x00) { + expected = 1; // ASCII + } else if ((c & 0xE0) == 0xC0) { + expected = 2; + } else if ((c & 0xF0) == 0xE0) { + expected = 3; + } else if ((c & 0xF8) == 0xF0) { + expected = 4; + } else { + return 0; // malformed lead, give up + } + const int have = n - i; + return have >= expected ? 0 : (expected - have); +} + // command-line parameters struct whisper_params { int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); @@ -738,18 +771,47 @@ static void output_json( if (full) { start_arr("tokens"); const int n = whisper_full_n_tokens(ctx, i); - for (int j = 0; j < n; ++j) { - auto token = whisper_full_get_token_data(ctx, i, j); - start_obj(nullptr); - value_s("text", whisper_token_to_str(ctx, token.id), false); - if(token.t0 > -1 && token.t1 > -1) { - // If we have per-token timestamps, write them out - times_o(token.t0, token.t1, false); + + // Merge adjacent tokens whose bytes together form a + // single UTF-8 codepoint. Multi-byte characters (CJK + // in particular) can end up split across whisper + // tokens, which used to produce invalid UTF-8 in the + // JSON string. Refs issue #1798. + struct merged_token { + std::string text; + whisper_token_data data; + int64_t t1; + }; + std::vector merged; + merged.reserve(n); + for (int j = 0; j < n; ) { + auto tok = whisper_full_get_token_data(ctx, i, j); + merged_token m{ whisper_token_to_str(ctx, tok.id), tok, tok.t1 }; + ++j; + while (j < n && utf8_trailing_bytes_needed(m.text) > 0) { + auto tok_next = whisper_full_get_token_data(ctx, i, j); + m.text += whisper_token_to_str(ctx, tok_next.id); + if (tok_next.t1 > -1) { + m.t1 = tok_next.t1; } - value_i("id", token.id, false); - value_f("p", token.p, false); - value_f("t_dtw", token.t_dtw, true); - end_obj(j == (n - 1)); + ++j; + } + merged.push_back(std::move(m)); + } + + const int nm = (int) merged.size(); + for (int j = 0; j < nm; ++j) { + const auto & mt = merged[j]; + start_obj(nullptr); + value_s("text", mt.text.c_str(), false); + if (mt.data.t0 > -1 && mt.t1 > -1) { + // If we have per-token timestamps, write them out + times_o(mt.data.t0, mt.t1, false); + } + value_i("id", mt.data.id, false); + value_f("p", mt.data.p, false); + value_f("t_dtw", mt.data.t_dtw, true); + end_obj(j == (nm - 1)); } end_arr(!params.diarize && !params.tinydiarize); }