whisper : Fix UTF-8 character boundary issue in segment wrapping (max_len) (#3592)
The current implementation in `whisper_wrap_segment()` uses `strlen()` to count bytes, not UTF-8 characters. When splitting segments at `max_len`, this can break multi-byte UTF-8 characters, resulting in invalid sequences displayed as `�` (U+FFFD replacement character).
This commit is contained in:
parent
2eeeba56e9
commit
f53dc74843
|
|
@ -6026,6 +6026,19 @@ static inline bool should_split_on_word(const char * txt, bool split_on_word) {
|
|||
return txt[0] == ' ';
|
||||
}
|
||||
|
||||
// Return the number of UTF-8 code points in a NUL-terminated string.
//
// A code point is started by every byte that is NOT a continuation byte
// (continuation bytes have the bit pattern 10xxxxxx), so only those bytes are
// tallied. For pure ASCII input the result equals the byte length; for
// multi-byte sequences it is smaller. No validation of the encoding is done:
// stray continuation bytes simply contribute nothing to the total.
static int utf8_len(const char * str) {
    int n_chars = 0;

    for (const char * p = str; *p != '\0'; ++p) {
        // top two bits == 10 -> continuation byte, belongs to a previous lead byte
        n_chars += ((*p & 0xC0) == 0x80) ? 0 : 1;
    }

    return n_chars;
}
|
||||
|
||||
static void whisper_exp_compute_token_level_timestamps_dtw(
|
||||
struct whisper_context * ctx,
|
||||
struct whisper_state * state,
|
||||
|
|
@ -6054,7 +6067,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
|
|||
}
|
||||
|
||||
const auto txt = whisper_token_to_str(&ctx, token.id);
|
||||
const int cur = strlen(txt);
|
||||
const int cur = utf8_len(txt); // Use UTF-8 character count instead of byte count
|
||||
|
||||
if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
|
||||
state.result_all.back().text = std::move(text);
|
||||
|
|
|
|||
Loading…
Reference in New Issue