From 357394b6789d0a2b853509e5469bd89ed2fa20d6 Mon Sep 17 00:00:00 2001 From: tubome Date: Sun, 1 Feb 2026 00:59:01 +0900 Subject: [PATCH 1/2] cli: add --stdin-mode for continuous file processing Add --stdin-mode option that allows whisper-cli to load the model once and process multiple audio files by reading paths from stdin. This is useful for: - Batch processing without model reload overhead - Workaround for whisper-server handle leak issues (#3358) - Integration with external programs via stdin/stdout Output format (JSON, one per line): {"status":"ready"} {"file":"path","status":"success","text":"..."} {"file":"path","status":"error","error":"..."} {"status":"exit"} Usage: whisper-cli --stdin-mode --model model.bin --language ja # sends file paths via stdin, receives JSON results via stdout --- examples/cli/cli.cpp | 209 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 195 insertions(+), 14 deletions(-) diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp index 4e84c1b2..9e17d8a5 100644 --- a/examples/cli/cli.cpp +++ b/examples/cli/cli.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,9 @@ struct whisper_params { float vad_max_speech_duration_s = FLT_MAX; int vad_speech_pad_ms = 30; float vad_samples_overlap = 0.1f; + + // Stdin mode for dynamic file loading + bool stdin_mode = false; }; static void whisper_print_usage(int argc, char ** argv, const whisper_params & params); @@ -217,6 +221,8 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); } else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); } else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); } + // Stdin mode + else if ( arg == "--stdin-mode") { params.stdin_mode = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -302,6 +308,8 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params std::to_string(params.vad_max_speech_duration_s).c_str()); fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms); fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap); + fprintf(stderr, "\nStdin mode options:\n"); + fprintf(stderr, " --stdin-mode [%-7s] read file paths from stdin (one per line, JSON output)\n", params.stdin_mode ? "true" : "false"); fprintf(stderr, "\n"); } @@ -925,6 +933,35 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const static void cb_log_disable(enum ggml_log_level , const char * , void * ) { } +// escape string for JSON output (handles quotes, backslashes, newlines, tabs, etc.) +static std::string escape_json_string(const std::string & s) { + std::string result; + result.reserve(s.size()); + for (size_t i = 0; i < s.size(); ++i) { + char c = s[i]; + switch (c) { + case '"': result += "\\\""; break; + case '\\': result += "\\\\"; break; + case '\n': result += "\\n"; break; + case '\r': result += "\\r"; break; + case '\t': result += "\\t"; break; + case '\b': result += "\\b"; break; + case '\f': result += "\\f"; break; + default: + if (static_cast(c) < 0x20) { + // Control character - use \uXXXX format + char buf[8]; + snprintf(buf, sizeof(buf), "\\u%04x", static_cast(c)); + result += buf; + } else { + result += c; + } + break; + } + } + return result; +} + int main(int argc, char ** argv) { ggml_backend_load_all(); @@ -971,23 +1008,25 @@ int main(int argc, char ** argv) { return 1; } - // remove non-existent files - for (auto it = params.fname_inp.begin(); it != params.fname_inp.end();) { - const auto fname_inp = it->c_str(); + // remove non-existent files (skip in stdin-mode) + if (!params.stdin_mode) { + for (auto it = params.fname_inp.begin(); it != params.fname_inp.end();) { + const auto fname_inp = it->c_str(); - if (*it != "-" && !is_file_exist(fname_inp)) { - fprintf(stderr, "error: input file not found '%s'\n", fname_inp); - it = params.fname_inp.erase(it); - continue; + if (*it != "-" && !is_file_exist(fname_inp)) { + fprintf(stderr, "error: input file not found '%s'\n", fname_inp); + it = params.fname_inp.erase(it); + continue; + } + + it++; } - it++; - } - - if (params.fname_inp.empty()) { - fprintf(stderr, "error: no input files specified\n"); - whisper_print_usage(argc, argv, params); - return 2; + if (params.fname_inp.empty()) { + fprintf(stderr, "error: no input files specified\n"); + whisper_print_usage(argc, argv, params); + return 2; + } } if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) { @@ -1069,6 +1108,148 @@ int main(int argc, char ** argv) { } } + // stdin-mode: read file paths from stdin and process each + if (params.stdin_mode) { + // Output ready signal + printf("{\"status\":\"ready\"}\n"); + fflush(stdout); + + std::string line; + while (std::getline(std::cin, line)) { + // Trim whitespace + while (!line.empty() && (line.back() == '\r' || line.back() == '\n' || line.back() == ' ' || line.back() == '\t')) { + line.pop_back(); + } + while (!line.empty() && (line.front() == ' ' || line.front() == '\t')) { + line.erase(0, 1); + } + + // Skip empty lines + if (line.empty()) { + continue; + } + + // Exit commands + if (line == "exit" || line == "quit") { + printf("{\"status\":\"exit\"}\n"); + fflush(stdout); + break; + } + + std::string escaped_file = escape_json_string(line); + + // Check file existence + if (!is_file_exist(line.c_str())) { + printf("{\"file\":\"%s\",\"status\":\"error\",\"error\":\"file not found\"}\n", escaped_file.c_str()); + fflush(stdout); + continue; + } + + // Read audio data + std::vector pcmf32; + std::vector> pcmf32s; + if (!::read_audio_data(line, pcmf32, pcmf32s, params.diarize)) { + printf("{\"file\":\"%s\",\"status\":\"error\",\"error\":\"failed to read audio\"}\n", escaped_file.c_str()); + fflush(stdout); + continue; + } + + // Run inference + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + + const bool use_grammar = (!params.grammar_parsed.rules.empty() && !params.grammar_rule.empty()); + wparams.strategy = (params.beam_size > 1 || use_grammar) ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY; + + wparams.print_realtime = false; + wparams.print_progress = false; + wparams.print_timestamps = false; + wparams.print_special = false; + wparams.translate = params.translate; + wparams.language = params.language.c_str(); + wparams.detect_language = params.detect_language; + wparams.n_threads = params.n_threads; + wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx; + wparams.offset_ms = params.offset_t_ms; + wparams.duration_ms = params.duration_ms; + + wparams.token_timestamps = params.max_len > 0; + wparams.thold_pt = params.word_thold; + wparams.max_len = params.max_len; + wparams.split_on_word = params.split_on_word; + wparams.audio_ctx = params.audio_ctx; + + wparams.debug_mode = params.debug_mode; + + wparams.tdrz_enable = params.tinydiarize; + + wparams.suppress_regex = params.suppress_regex.empty() ? nullptr : params.suppress_regex.c_str(); + + wparams.initial_prompt = params.prompt.c_str(); + wparams.carry_initial_prompt = params.carry_initial_prompt; + + wparams.greedy.best_of = params.best_of; + wparams.beam_search.beam_size = params.beam_size; + + wparams.temperature_inc = params.no_fallback ? 0.0f : params.temperature_inc; + wparams.temperature = params.temperature; + + wparams.entropy_thold = params.entropy_thold; + wparams.logprob_thold = params.logprob_thold; + wparams.no_speech_thold = params.no_speech_thold; + + wparams.no_timestamps = params.no_timestamps; + + wparams.suppress_nst = params.suppress_nst; + + wparams.vad = params.vad; + wparams.vad_model_path = params.vad_model.c_str(); + + wparams.vad_params.threshold = params.vad_threshold; + wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms; + wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms; + wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s; + wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms; + wparams.vad_params.samples_overlap = params.vad_samples_overlap; + + const auto & grammar_parsed = params.grammar_parsed; + auto grammar_rules = grammar_parsed.c_rules(); + + if (use_grammar) { + if (grammar_parsed.symbol_ids.find(params.grammar_rule) != grammar_parsed.symbol_ids.end()) { + wparams.grammar_rules = grammar_rules.data(); + wparams.n_grammar_rules = grammar_rules.size(); + wparams.i_start_rule = grammar_parsed.symbol_ids.at(params.grammar_rule); + wparams.grammar_penalty = params.grammar_penalty; + } + } + + if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) { + printf("{\"file\":\"%s\",\"status\":\"error\",\"error\":\"failed to process audio\"}\n", escaped_file.c_str()); + fflush(stdout); + continue; + } + + // Collect all text from segments + std::string result_text; + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + if (text) { + result_text += text; + } + } + + // Output success result + std::string escaped_text = escape_json_string(result_text); + printf("{\"file\":\"%s\",\"status\":\"success\",\"text\":\"%s\"}\n", escaped_file.c_str(), escaped_text.c_str()); + fflush(stdout); + } + + whisper_free(ctx); + return 0; + } + + // Normal mode: process files from command line for (int f = 0; f < (int) params.fname_inp.size(); ++f) { const auto & fname_inp = params.fname_inp[f]; struct fout_factory { From c518eca2b081c7b161574c4bcfbe25c2aac7d2ad Mon Sep 17 00:00:00 2001 From: tubome Date: Fri, 6 Feb 2026 01:20:26 +0900 Subject: [PATCH 2/2] cli: disable stdout buffering for real-time piped output When whisper-cli is invoked via subprocess with piped stdout, the C runtime defaults to full buffering, causing segments to appear in batches. Adding setvbuf(stdout, NULL, _IONBF, 0) ensures each segment is flushed immediately. Co-Authored-By: Claude Opus 4.5 --- examples/cli/cli.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp index 9e17d8a5..9d8fb6c5 100644 --- a/examples/cli/cli.cpp +++ b/examples/cli/cli.cpp @@ -965,6 +965,9 @@ static std::string escape_json_string(const std::string & s) { int main(int argc, char ** argv) { ggml_backend_load_all(); + // Disable stdout buffering to ensure real-time output when piped + setvbuf(stdout, NULL, _IONBF, 0); + #if defined(_WIN32) // Set the console output code page to UTF-8, while command line arguments // are still encoded in the system's code page. In this way, we can print