// Count UTF-8 characters (code points), not bytes, in a NUL-terminated string.
//
// A character is counted once, at its lead byte: every byte whose top two bits
// are not `10` (i.e. every byte that is not a continuation byte 10xxxxxx)
// starts a new character. The input is not validated, so a stray continuation
// byte is simply not counted.
//
// str:     NUL-terminated byte string; may be nullptr.
// returns: number of non-continuation bytes before the terminator, or 0 when
//          str is nullptr or empty.
static int utf8_len(const char * str) {
    // treat a missing string like an empty one instead of dereferencing nullptr
    if (str == nullptr) {
        return 0;
    }

    int count = 0;
    for (const char * p = str; *p != '\0'; ++p) {
        // inspect the byte as unsigned so the mask does not rely on the
        // signedness of plain char
        const unsigned char byte = static_cast<unsigned char>(*p);
        if ((byte & 0xC0u) != 0x80u) {
            ++count;
        }
    }
    return count;
}