whisper : Fix UTF-8 character boundary issue in segment wrapping (max_len) (#3592)

The current implementation in `whisper_wrap_segment()` uses `strlen()` to count bytes, not UTF-8 characters. When splitting segments at `max_len`, this can break multi-byte UTF-8 characters, resulting in invalid sequences displayed as `�` (U+FFFD replacement character).
2026-01-16 20:16:05 +08:00 · 2026-01-16 20:16:05 +08:00 · f53dc74843
parent 2eeeba56e9
commit f53dc74843
1 changed files with 14 additions and 1 deletions
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@ -6026,6 +6026,19 @@ static inline bool should_split_on_word(const char * txt, bool split_on_word) {
    return txt[0] == ' ';
 }

+// Count UTF-8 characters (not bytes) in a string
+static int utf8_len(const char * str) {
+    int count = 0;
+    while (*str) {
+        // Skip continuation bytes (10xxxxxx)
+        if ((*str & 0xC0) != 0x80) {
+            count++;
+        }
+        str++;
+    }
+    return count;
+}
+
 static void whisper_exp_compute_token_level_timestamps_dtw(
            struct whisper_context * ctx,
              struct whisper_state * state,
@ -6054,7 +6067,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
        }

        const auto txt = whisper_token_to_str(&ctx, token.id);
-        const int cur = strlen(txt);
+        const int cur = utf8_len(txt);  // Use UTF-8 character count instead of byte count

        if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
            state.result_all.back().text = std::move(text);