From efd35de6ace3cf995058039df76bbecab244bc05 Mon Sep 17 00:00:00 2001
From: Oleg Orlov <orelcokolov@gmail.com>
Date: Wed, 8 Oct 2025 21:48:43 +0300
Subject: [PATCH] Full fix for --no-timestamps

---
 src/whisper.cpp | 82 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 25 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 586650ff..2c1569f0 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6172,14 +6172,10 @@ static void whisper_process_logits(
         // suppress <|notimestamps|> token
         // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
         logits[vocab.token_not] = -INFINITY;
-        // NOTE: no longer suppressing timestamp tokens even when no_timestamps is true
-        // This allows the model to generate timestamps for better transcription quality
-        // The no_timestamps flag now only affects output formatting, not decoding
-        // if (params.no_timestamps) {
-        //     for (int i = vocab.token_beg; i < n_logits; ++i) {
-        //         logits[i] = -INFINITY;
-        //     }
-        // }
+        
+        // NOTE: We do NOT suppress timestamp tokens even when no_timestamps is true
+        // Suppressing them causes the model to lose its ability to segment properly
+        // The model needs timestamps internally for segmentation, even if we hide them in output
 
         // suppress sot and nosp tokens
         logits[vocab.token_sot]  = -INFINITY;
@@ -6931,21 +6927,10 @@ int whisper_full_with_state(
         }
     }
 
-    // first release distilled models require the "no_timestamps" token
-    {
-        const bool is_distil = ctx->model.hparams.n_text_layer == 2 && ctx->model.hparams.n_vocab != 51866;
-        if (is_distil && !params.no_timestamps) {
-            WHISPER_LOG_WARN("%s: using first release distilled models - forcing no_timestamps\n", __func__);
-            params.no_timestamps = true;
-        }
-    }
-
-    // NOTE: no longer adding <|notimestamps|> token even when no_timestamps is true
-    // This allows the model to use timestamp logic for better transcription quality
-    // The no_timestamps flag now only affects output formatting, not decoding
-    // if (params.no_timestamps) {
-    //     prompt_init.push_back(whisper_token_not(ctx));
-    // }
+    // NOTE: We do NOT add <|notimestamps|> token even when no_timestamps is true
+    // Adding it causes the model to hang or terminate early on some models
+    // Instead, we let the model generate timestamps internally for proper segmentation
+    // The no_timestamps flag only affects output formatting (in CLI)
 
     int seek = seek_start;
 
@@ -7324,7 +7309,7 @@ int whisper_full_with_state(
                            (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
                            (has_ts && seek + seek_delta + delta_min >= seek_end)       // end of audio reached (100ms)
                            ) {
-                            if (result_len == 0 && !params.no_timestamps) {
+                            if (result_len == 0) {
                                 if (seek + seek_delta + delta_min >= seek_end) {
                                     result_len = i + 1;
                                 } else {
@@ -7334,7 +7319,7 @@ int whisper_full_with_state(
                                 }
                             }
 
-                            if (params.single_segment || params.no_timestamps) {
+                            if (params.single_segment) {
                                 result_len = i + 1;
                                 seek_delta = 100*WHISPER_CHUNK_SIZE;
                             }
@@ -7359,6 +7344,46 @@ int whisper_full_with_state(
                         failed = true;
                         continue;
                     }
+                    
+                    // Additional repetition detection: check for exact repeating sequences
+                    // This catches stuck loops where the model repeats the same phrase over and over
+                    if (i >= 12) {  // Start checking very early
+                        const auto & tokens = decoder.sequence.tokens;
+                        
+                        // Try odd pattern lengths 3, 5, ..., 29 (the step of 2 skips even lengths)
+                        for (int pattern_len = 3; pattern_len <= 30; pattern_len += 2) {
+                            const int needed_tokens = pattern_len * 2;  // Only need 2 repetitions now
+                            if (i + 1 < needed_tokens) continue;
+                            
+                            bool is_loop = true;
+                            
+                            // Check whether the last pattern_len tokens exactly match the pattern_len tokens before them
+                            for (int k = 0; k < pattern_len && is_loop; ++k) {
+                                const int idx_now = i - k;
+                                const int idx_prev = i - k - pattern_len;
+                                
+                                if (idx_prev < 0) {
+                                    is_loop = false;
+                                    break;
+                                }
+                                
+                                if (tokens[idx_now].id != tokens[idx_prev].id) {
+                                    is_loop = false;
+                                }
+                            }
+                            
+                            if (is_loop) {
+                                // Found 2x repetition - mark as failed to avoid adding more
+                                failed = true;
+                                break;
+                            }
+                        }
+                        
+                        if (failed) {
+                            continue;
+                        }
+                    }
+                    
                 }
 
                 // check if all decoders have finished (i.e. completed or failed)
@@ -7683,6 +7708,13 @@ int whisper_full_with_state(
                 seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100);
             }
 
+            // If best decoder failed (e.g. due to repetition loop), ensure we still move forward
+            // This prevents infinite loops where seek doesn't update
+            if (best_decoder.failed && seek_delta == 0) {
+                WHISPER_LOG_DEBUG("%s: decoder failed with seek_delta = 0, forcing forward progress\n", __func__);
+                seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100);
+            }
+
             // update audio window
             seek += seek_delta;