feat(cli): add word-level LRC output with UTF-8 fix

Add a new -olrcw/--output-lrc-word option that writes word-level LRC output
with an inline timestamp in front of each token.

Key changes:
- Add output_lrc_word parameter and CLI option
- Implement output_lrc_word() function with per-token timestamps
- Fix UTF-8 multi-byte character handling (merge continuation bytes)
- Enable token_timestamps when output_lrc_word is set
- Handle diarize speaker prefix without breaking LRC format
- Update README.md with new option

The UTF-8 fix addresses issue #1798 where CJK characters were split
across tokens with timestamps inserted between bytes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
SakuzyPeng 2026-01-22 02:25:53 +08:00
parent 7aa8818647
commit e6edc1415b
2 changed files with 106 additions and 1 deletions

View File

@ -37,6 +37,7 @@ options:
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-olrc, --output-lrc [false ] output result in a lrc file
-olrcw, --output-lrc-word [false ] output result in a word-level lrc file
-owts, --output-words [false ] output script for generating karaoke video
-fp, --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
-ocsv, --output-csv [false ] output result in a CSV file

View File

@ -68,6 +68,7 @@ struct whisper_params {
bool output_jsn = false;
bool output_jsn_full = false;
bool output_lrc = false;
bool output_lrc_word = false; // word-level LRC output
bool no_prints = false;
bool print_special = false;
bool print_colors = false;
@ -179,6 +180,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; }
else if (arg == "-olrcw"|| arg == "--output-lrc-word") { params.output_lrc_word = true; }
else if (arg == "-fp" || arg == "--font-path") { params.font_path = ARGV_NEXT; }
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
@ -260,6 +262,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
fprintf(stderr, " -olrc, --output-lrc [%-7s] output result in a lrc file\n", params.output_lrc ? "true" : "false");
fprintf(stderr, " -olrcw, --output-lrc-word [%-7s] output result in a word-level lrc file\n", params.output_lrc_word ? "true" : "false");
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
@ -922,6 +925,106 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const
}
}
// Returns true when `c` has the bit pattern 10xxxxxx, i.e. it is a UTF-8
// continuation byte and not the first byte of an encoded character.
static bool is_utf8_continuation(unsigned char c) {
    // Only the two most significant bits matter; shift the other six away.
    const unsigned int top_two_bits = c >> 6;
    return top_two_bits == 0x2u;
}
// Appends "[mm:ss.xx]" followed by `text` to `line`.
//
// `timestamp` is interpreted as a count of hundredths of a second.
// Nothing is appended when `text` is empty or `timestamp` is negative.
static void append_lrc_word(std::string & line, int64_t timestamp, const std::string & text) {
    const bool has_content = !text.empty() && timestamp >= 0;
    if (!has_content) {
        return;
    }

    // 6000 hundredths make one minute, 100 hundredths make one second.
    const int64_t minutes    = timestamp / 6000;
    const int64_t in_minute  = timestamp % 6000;
    const int64_t seconds    = in_minute / 100;
    const int64_t hundredths = in_minute % 100;

    char stamp[16];
    snprintf(stamp, sizeof(stamp), "%02d:%02d.%02d", (int) minutes, (int) seconds, (int) hundredths);

    line.append("[").append(stamp).append("]").append(text);
}
// Word-level LRC output with inline timestamps
static void output_lrc_word(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
fout << "[by:whisper.cpp]\n";
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
std::string line = "";
const int n_tokens = whisper_full_n_tokens(ctx, i);
// Get speaker prefix if diarize is enabled (will be prepended to first word)
std::string speaker_prefix = "";
if (params.diarize && pcmf32s.size() == 2) {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
speaker_prefix = estimate_diarization_speaker(pcmf32s, t0, t1);
}
std::string pending_text = "";
int64_t pending_timestamp = -1;
bool is_first_word = true;
for (int j = 0; j < n_tokens; ++j) {
const char * token_text = whisper_full_get_token_text(ctx, i, j);
whisper_token_data token_data = whisper_full_get_token_data(ctx, i, j);
// Skip special tokens (like [BLANK], timestamps, etc.)
if (token_data.id >= whisper_token_eot(ctx)) {
continue;
}
// Skip empty tokens
if (!token_text || !token_text[0]) {
continue;
}
// Use DTW timestamp if available, otherwise use t0
int64_t t = (token_data.t_dtw >= 0) ? token_data.t_dtw : token_data.t0;
if (t < 0) {
// Fallback to segment start time if token timestamp is not available
t = whisper_full_get_segment_t0(ctx, i);
}
// Check if this token starts with a UTF-8 continuation byte
bool is_continuation = is_utf8_continuation((unsigned char)token_text[0]);
if (is_continuation && !pending_text.empty()) {
// This token is a continuation of a multi-byte UTF-8 character
// Append to pending text without adding a new timestamp
pending_text += token_text;
} else {
// Flush pending text with its timestamp
append_lrc_word(line, pending_timestamp, pending_text);
// Start new pending, prepend speaker to first word
if (is_first_word && !speaker_prefix.empty()) {
pending_text = speaker_prefix + token_text;
is_first_word = false;
} else {
pending_text = token_text;
}
pending_timestamp = t;
}
}
// Flush remaining pending text
append_lrc_word(line, pending_timestamp, pending_text);
// Only output if we have actual content (line starts with timestamp)
if (!line.empty()) {
fout << line << "\n";
}
}
}
// Callback with an empty body: all three arguments are ignored.
// NOTE(review): judging by the name, this is installed to silence log output - confirm at the call site.
static void cb_log_disable(enum ggml_log_level, const char *, void *) {
    // intentionally empty
}
@ -1182,7 +1285,7 @@ int main(int argc, char ** argv) {
wparams.offset_ms = params.offset_t_ms;
wparams.duration_ms = params.duration_ms;
wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0;
wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.output_lrc_word || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
@ -1294,6 +1397,7 @@ int main(int argc, char ** argv) {
output_ext(csv, pcmf32s);
output_func(output_json, ".json", params.output_jsn, pcmf32s);
output_ext(lrc, pcmf32s);
output_func(output_lrc_word, ".word.lrc", params.output_lrc_word, pcmf32s);
output_func(output_score, ".score.txt", params.log_score, pcmf32s);
#undef output_ext