From e6edc1415bc45711e763984c3edfebc4ba6b2ef9 Mon Sep 17 00:00:00 2001
From: SakuzyPeng <2467654814@qq.com>
Date: Thu, 22 Jan 2026 02:25:53 +0800
Subject: [PATCH] feat(cli): add word-level LRC output with UTF-8 fix

Add new -olrcw/--output-lrc-word option for word-level LRC output with
inline timestamps per token.

Key changes:
- Add output_lrc_word parameter and CLI option
- Implement output_lrc_word() function with per-token timestamps
- Fix UTF-8 multi-byte character handling (merge continuation bytes)
- Enable token_timestamps when output_lrc_word is set
- Handle diarize speaker prefix without breaking LRC format
- Update README.md with new option

The UTF-8 fix addresses issue #1798 where CJK characters were split
across tokens with timestamps inserted between bytes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 examples/cli/README.md |   1 +
 examples/cli/cli.cpp   | 106 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/examples/cli/README.md b/examples/cli/README.md
index 65285c3c..adb7e4b3 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -37,6 +37,7 @@ options:
   -ovtt,     --output-vtt        [false  ] output result in a vtt file
   -osrt,     --output-srt        [false  ] output result in a srt file
   -olrc,     --output-lrc        [false  ] output result in a lrc file
+  -olrcw,    --output-lrc-word   [false  ] output result in a word-level lrc file
   -owts,     --output-words      [false  ] output script for generating karaoke video
   -fp,       --font-path         [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
   -ocsv,     --output-csv        [false  ] output result in a CSV file
diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp
index 4e84c1b2..d803c64e 100644
--- a/examples/cli/cli.cpp
+++ b/examples/cli/cli.cpp
@@ -68,6 +68,7 @@ struct whisper_params {
     bool output_jsn      = false;
     bool output_jsn_full = false;
     bool output_lrc      = false;
+    bool output_lrc_word = false;  // word-level LRC output
     bool no_prints       = false;
     bool print_special   = false;
     bool print_colors    = false;
@@ -179,6 +180,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-osrt" || arg == "--output-srt")           { params.output_srt      = true; }
         else if (arg == "-owts" || arg == "--output-words")         { params.output_wts      = true; }
         else if (arg == "-olrc" || arg == "--output-lrc")           { params.output_lrc      = true; }
+        else if (arg == "-olrcw"|| arg == "--output-lrc-word")     { params.output_lrc_word = true; }
         else if (arg == "-fp"   || arg == "--font-path")            { params.font_path       = ARGV_NEXT; }
         else if (arg == "-ocsv" || arg == "--output-csv")           { params.output_csv      = true; }
         else if (arg == "-oj"   || arg == "--output-json")          { params.output_jsn      = true; }
@@ -260,6 +262,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
     fprintf(stderr, "  -ovtt,     --output-vtt           [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
     fprintf(stderr, "  -osrt,     --output-srt           [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
     fprintf(stderr, "  -olrc,     --output-lrc           [%-7s] output result in a lrc file\n",                    params.output_lrc ? "true" : "false");
+    fprintf(stderr, "  -olrcw,    --output-lrc-word      [%-7s] output result in a word-level lrc file\n",         params.output_lrc_word ? "true" : "false");
     fprintf(stderr, "  -owts,     --output-words         [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
     fprintf(stderr, "  -fp,       --font-path            [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
     fprintf(stderr, "  -ocsv,     --output-csv           [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
@@ -922,6 +925,106 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const
     }
 }
 
+// Helper: check if byte is a UTF-8 continuation byte (10xxxxxx)
+static bool is_utf8_continuation(unsigned char c) {
+    return (c & 0xC0) == 0x80;
+}
+
+// Helper: format timestamp and append text to line
+static void append_lrc_word(std::string & line, int64_t timestamp, const std::string & text) {
+    if (text.empty() || timestamp < 0) {
+        return;
+    }
+
+    int64_t msec = timestamp * 10;
+    int64_t min = msec / (1000 * 60);
+    msec = msec - min * (1000 * 60);
+    int64_t sec = msec / 1000;
+    msec = msec - sec * 1000;
+
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%02d:%02d.%02d", (int) min, (int) sec, (int) (msec / 10));
+
+    line += "[";
+    line += buf;
+    line += "]";
+    line += text;
+}
+
+// Word-level LRC output with inline timestamps
+static void output_lrc_word(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+    fout << "[by:whisper.cpp]\n";
+
+    const int n_segments = whisper_full_n_segments(ctx);
+    for (int i = 0; i < n_segments; ++i) {
+        std::string line = "";
+        const int n_tokens = whisper_full_n_tokens(ctx, i);
+
+        // Get speaker prefix if diarize is enabled (will be prepended to first word)
+        std::string speaker_prefix = "";
+        if (params.diarize && pcmf32s.size() == 2) {
+            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+            speaker_prefix = estimate_diarization_speaker(pcmf32s, t0, t1);
+        }
+
+        std::string pending_text = "";
+        int64_t pending_timestamp = -1;
+        bool is_first_word = true;
+
+        for (int j = 0; j < n_tokens; ++j) {
+            const char * token_text = whisper_full_get_token_text(ctx, i, j);
+            whisper_token_data token_data = whisper_full_get_token_data(ctx, i, j);
+
+            // Skip special tokens (like [BLANK], timestamps, etc.)
+            if (token_data.id >= whisper_token_eot(ctx)) {
+                continue;
+            }
+
+            // Skip empty tokens
+            if (!token_text || !token_text[0]) {
+                continue;
+            }
+
+            // Use DTW timestamp if available, otherwise use t0
+            int64_t t = (token_data.t_dtw >= 0) ? token_data.t_dtw : token_data.t0;
+            if (t < 0) {
+                // Fallback to segment start time if token timestamp is not available
+                t = whisper_full_get_segment_t0(ctx, i);
+            }
+
+            // Check if this token starts with a UTF-8 continuation byte
+            bool is_continuation = is_utf8_continuation((unsigned char)token_text[0]);
+
+            if (is_continuation && !pending_text.empty()) {
+                // This token is a continuation of a multi-byte UTF-8 character
+                // Append to pending text without adding a new timestamp
+                pending_text += token_text;
+            } else {
+                // Flush pending text with its timestamp
+                append_lrc_word(line, pending_timestamp, pending_text);
+
+                // Start new pending, prepend speaker to first word
+                if (is_first_word && !speaker_prefix.empty()) {
+                    pending_text = speaker_prefix + token_text;
+                    is_first_word = false;
+                } else {
+                    pending_text = token_text;
+                }
+                pending_timestamp = t;
+            }
+        }
+
+        // Flush remaining pending text
+        append_lrc_word(line, pending_timestamp, pending_text);
+
+        // Only output if we have actual content (line starts with timestamp)
+        if (!line.empty()) {
+            fout << line << "\n";
+        }
+    }
+}
+
 
 static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
 
@@ -1182,7 +1285,7 @@ int main(int argc, char ** argv) {
             wparams.offset_ms        = params.offset_t_ms;
             wparams.duration_ms      = params.duration_ms;
 
-            wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0;
+            wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.output_lrc_word || params.max_len > 0;
             wparams.thold_pt         = params.word_thold;
             wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
             wparams.split_on_word    = params.split_on_word;
@@ -1294,6 +1397,7 @@ int main(int argc, char ** argv) {
             output_ext(csv, pcmf32s);
             output_func(output_json, ".json", params.output_jsn, pcmf32s);
             output_ext(lrc, pcmf32s);
+            output_func(output_lrc_word, ".word.lrc", params.output_lrc_word, pcmf32s);
             output_func(output_score, ".score.txt", params.log_score, pcmf32s);
 
 #undef output_ext