feat: example stream mode for nodejs addon

Kevin 2025-09-27 17:19:58 +08:00
parent 44fa2f647c
commit 5d3f20673b
8 changed files with 607 additions and 0 deletions

examples/CMakeLists.txt

@@ -100,6 +100,7 @@ if (EMSCRIPTEN)
add_subdirectory(bench.wasm)
elseif(CMAKE_JS_VERSION)
add_subdirectory(addon.node)
add_subdirectory(stream.node)
else()
add_subdirectory(cli)
add_subdirectory(bench)

examples/stream.node/CMakeLists.txt

@@ -0,0 +1,44 @@
cmake_minimum_required(VERSION 3.13)
project(stream_addon)
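# Typically built with cmake-js, mirroring the sibling addon.node example;
# an assumed invocation (not part of this commit), run from the repository root:
#   npx cmake-js compile -T stream.node -B Release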
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../cmake ${CMAKE_MODULE_PATH})
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_definitions(-DNAPI_VERSION=4)
include_directories(${CMAKE_JS_INC})
set(TARGET stream.node)
add_library(${TARGET} SHARED
addon.cpp
whisper-stream.cpp
)
set_target_properties(${TARGET} PROPERTIES
PREFIX ""
SUFFIX ".node"
)
include(DefaultTargetOptions)
execute_process(COMMAND node -p "require('node-addon-api').include"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE NODE_ADD_ON_API_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE)
string(REPLACE "\"" "" NODE_ADD_ON_API_DIR ${NODE_ADD_ON_API_DIR})
string(REPLACE "\\\\" "/" NODE_ADD_ON_API_DIR ${NODE_ADD_ON_API_DIR})
target_include_directories(${TARGET} PRIVATE ${NODE_ADD_ON_API_DIR})
target_link_libraries(${TARGET}
whisper
common
${CMAKE_JS_LIB}
${CMAKE_THREAD_LIBS_INIT}
)
if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
# generate node.lib from the .def file that cmake-js provides on Windows
execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
endif()

examples/stream.node/addon.cpp

@@ -0,0 +1,114 @@
#include "addon.h" // Your header file for WhisperStreamWrapper
#include "whisper-stream.h" // Your header file for the WhisperStream class
// NOTE: The N-API wrapper handles errors by throwing JS exceptions, so this macro is not needed.
// #define CHECK_STATUS(env, status, msg) ...
// --- Implementation of the Wrapper ---
Napi::Object WhisperStreamWrapper::Init(Napi::Env env, Napi::Object exports) {
Napi::Function func = DefineClass(env, "WhisperStream", {
InstanceMethod("startModel", &WhisperStreamWrapper::startModel),
InstanceMethod("processChunk", &WhisperStreamWrapper::ProcessChunk),
InstanceMethod("freeModel", &WhisperStreamWrapper::freeModel),
});
exports.Set("WhisperStream", func);
return exports;
}
WhisperStreamWrapper::WhisperStreamWrapper(const Napi::CallbackInfo& info) : Napi::ObjectWrap<WhisperStreamWrapper>(info) {
}
WhisperStreamWrapper::~WhisperStreamWrapper() {
// release the native stream if the JS object is collected without freeModel()
delete whisperStream_;
}
Napi::Value WhisperStreamWrapper::startModel(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (info.Length() < 1 || !info[0].IsObject()) {
Napi::TypeError::New(env, "Expected a configuration object").ThrowAsJavaScriptException();
return env.Null();
}
Napi::Object js_params = info[0].As<Napi::Object>();
StreamParams params;
if (js_params.Has("modelPath")) {
params.model = js_params.Get("modelPath").As<Napi::String>();
} else {
Napi::TypeError::New(env, "Missing required parameter 'model'").ThrowAsJavaScriptException();
return env.Null();
}
if (js_params.Has("language")) params.language = js_params.Get("language").As<Napi::String>();
if (js_params.Has("nThreads")) params.n_threads = js_params.Get("nThreads").As<Napi::Number>();
if (js_params.Has("stepMs")) params.step_ms = js_params.Get("stepMs").As<Napi::Number>();
if (js_params.Has("lengthMs")) params.length_ms = js_params.Get("lengthMs").As<Napi::Number>();
if (js_params.Has("keepMs")) params.keep_ms = js_params.Get("keepMs").As<Napi::Number>();
if (js_params.Has("maxTokens")) params.max_tokens = js_params.Get("maxTokens").As<Napi::Number>();
if (js_params.Has("audioCtx")) params.audio_ctx = js_params.Get("audioCtx").As<Napi::Number>();
if (js_params.Has("vadThold")) params.vad_thold = js_params.Get("vadThold").As<Napi::Number>();
if (js_params.Has("beamSize")) params.beam_size = js_params.Get("beamSize").As<Napi::Number>();
if (js_params.Has("freqThold")) params.freq_thold = js_params.Get("freqThold").As<Napi::Number>();
if (js_params.Has("translate")) params.translate = js_params.Get("translate").As<Napi::Boolean>();
if (js_params.Has("noFallback")) params.no_fallback = js_params.Get("noFallback").As<Napi::Boolean>();
if (js_params.Has("printSpecial")) params.print_special = js_params.Get("printSpecial").As<Napi::Boolean>();
if (js_params.Has("noContext")) params.no_context = js_params.Get("noContext").As<Napi::Boolean>();
if (js_params.Has("noTimestamps")) params.no_timestamps = js_params.Get("noTimestamps").As<Napi::Boolean>();
if (js_params.Has("tinydiarize")) params.tinydiarize = js_params.Get("tinydiarize").As<Napi::Boolean>();
if (js_params.Has("saveAudio")) params.save_audio = js_params.Get("saveAudio").As<Napi::Boolean>();
if (js_params.Has("useGpu")) params.use_gpu = js_params.Get("useGpu").As<Napi::Boolean>();
if (js_params.Has("flashAttn")) params.flash_attn = js_params.Get("flashAttn").As<Napi::Boolean>();
if (this->whisperStream_) {
delete this->whisperStream_;
this->whisperStream_ = nullptr;
}
try {
this->whisperStream_ = new WhisperStream(params);
this->whisperStream_->init();
} catch (const std::exception& e) {
// do not keep a half-initialized stream around
delete this->whisperStream_;
this->whisperStream_ = nullptr;
Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
return env.Null();
}
return env.Undefined();
}
Napi::Value WhisperStreamWrapper::processChunk(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (!this->whisperStream_) {
Napi::Error::New(env, "Model not started. Call startModel() first.").ThrowAsJavaScriptException();
return env.Null();
}
if (info.Length() < 1 || !info[0].IsTypedArray() || info[0].As<Napi::TypedArray>().TypedArrayType() != napi_float32_array) {
Napi::TypeError::New(env, "Argument must be a Float32Array").ThrowAsJavaScriptException();
return env.Null();
}
Napi::Float32Array pcmf32_array = info[0].As<Napi::Float32Array>();
std::vector<float> pcmf32_new(pcmf32_array.Data(), pcmf32_array.Data() + pcmf32_array.ElementLength());
TranscriptionResult result = this->whisperStream_->process(pcmf32_new);
Napi::Object resultObj = Napi::Object::New(env);
resultObj.Set("text", Napi::String::New(env, result.text));
resultObj.Set("isFinal", Napi::Boolean::New(env, result.final));
return resultObj;
}
Napi::Value WhisperStreamWrapper::freeModel(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (this->whisperStream_) {
delete this->whisperStream_;
this->whisperStream_ = nullptr;
}
return env.Undefined();
}
Napi::Object InitAll(Napi::Env env, Napi::Object exports) {
return WhisperStreamWrapper::Init(env, exports);
}
NODE_API_MODULE(whisper, InitAll)
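A minimal sketch of driving the exported class from JavaScript, assuming the addon has been built and a ggml model downloaded (paths here are illustrative):

const { WhisperStream } = require('./build/Release/stream.node');

const ws = new WhisperStream();
ws.startModel({ modelPath: 'models/ggml-base.en.bin', language: 'en' });

// one second of 16 kHz mono silence; real input comes from a capture library
const chunk = new Float32Array(16000);
const { text, isFinal } = ws.processChunk(chunk);
console.log(isFinal, text);

ws.freeModel();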

examples/stream.node/addon.h

@@ -0,0 +1,17 @@
#pragma once
#include <napi.h>
#include "whisper-stream.h"
class WhisperStreamWrapper : public Napi::ObjectWrap<WhisperStreamWrapper> {
public:
static Napi::Object Init(Napi::Env env, Napi::Object exports);
WhisperStreamWrapper(const Napi::CallbackInfo& info);
~WhisperStreamWrapper();
private:
Napi::Value startModel(const Napi::CallbackInfo& info);
Napi::Value processChunk(const Napi::CallbackInfo& info);
Napi::Value freeModel(const Napi::CallbackInfo& info);
WhisperStream* whisperStream_ = nullptr;
};

examples/stream.node/index_naudiodon.js

@@ -0,0 +1,83 @@
const path = require('path');
const portAudio = require('naudiodon2');
const addonPath = path.join(__dirname, '..', '..', 'build', 'Release', 'stream.node');
const { WhisperStream } = require(addonPath);
const modelPath = path.join(__dirname, '..', '..', 'models', 'ggml-base.en.bin');
const SAMPLE_RATE = 16000;
// --- Main Application ---
async function main() {
const whisper = new WhisperStream();
let pendingText = ''; // Buffer for the current unconfirmed text
console.log('Loading model...');
whisper.startModel({
modelPath: modelPath,
language: 'en',
nThreads: 4,
stepMs: 3000,
lengthMs: 10000,
keepMs: 200,
useGpu: true,
});
console.log('Model loaded.');
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: SAMPLE_RATE,
deviceId: -1,
closeOnError: true,
}
});
ai.on('data', (chunk) => {
const floatCount = chunk.length / Float32Array.BYTES_PER_ELEMENT;
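// view the Buffer's bytes in place as 32-bit floats (no copy); valid because
// the stream was opened with SampleFormatFloat32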
const float32 = new Float32Array(chunk.buffer, chunk.byteOffset, floatCount);
try {
const result = whisper.processChunk(float32);
if (!result || !result.text) return;
const { text, isFinal } = result;
if (isFinal) {
process.stdout.write(`\r${text}\n`);
pendingText = ''; // Reset for the next utterance
} else {
pendingText = text;
// '\r' moves cursor to the start, '\x1B[K' clears the rest of the line.
process.stdout.write(`\r${pendingText}\x1B[K`);
}
} catch (err) {
console.error('Error during processing:', err);
}
});
ai.on('error', (err) => console.error('Audio input error:', err));
ai.start();
console.log('Recording from microphone. Speak now.');
process.stdout.write('> ');
const shutdown = () => {
console.log('\nShutting down...');
ai.quit(() => {
whisper.freeModel();
process.exit(0);
});
};
process.on('SIGINT', shutdown);
process.on('SIGTERM', shutdown);
}
main().catch((err) => {
console.error('An unexpected error occurred:', err);
process.exit(1);
});
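The stream above requests 32-bit float samples directly. If a device or backend only delivers 16-bit PCM (naudiodon's SampleFormat16Bit), a conversion helper along these lines would be needed before calling processChunk; a sketch, not part of this commit:

function int16ToFloat32(buf) {
  // reinterpret the Buffer as Int16 samples (assumes 2-byte alignment)
  const int16 = new Int16Array(buf.buffer, buf.byteOffset, buf.length / 2);
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768; // scale to [-1, 1)
  }
  return float32;
}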

examples/stream.node/package.json

@@ -0,0 +1,16 @@
{
"name": "stream.node",
"version": "1.0.0",
"main": "index.js",
"scripts": {
"start": "node index_naudiodon.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"dependencies": {
"naudiodon2": "^2.3.6"
},
"devDependencies": {
"cmake-js": "^7.3.1",
"node-addon-api": "^5.1.0"
}
}

examples/stream.node/whisper-stream.cpp

@@ -0,0 +1,263 @@
// whisper-stream.cpp
#include "whisper-stream.h"
#include "common-whisper.h"
#include "common.h"
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <stdexcept>
WhisperStream::WhisperStream(const StreamParams &stream_params)
: params(stream_params) {}
WhisperStream::~WhisperStream() {
if (ctx) {
whisper_print_timings(ctx);
whisper_free(ctx);
ctx = nullptr;
}
pcmf32_old.clear();
pcmf32_new.clear();
pcmf32.clear();
prompt_tokens.clear();
}
bool WhisperStream::init() {
// ensure keep/length constraints
params.keep_ms = std::min(params.keep_ms, params.step_ms);
params.length_ms = std::max(params.length_ms, params.step_ms);
// store sample counts as members (SAMPLES, not bytes)
n_samples_step = int((1e-3 * params.step_ms) * WHISPER_SAMPLE_RATE);
n_samples_len = int((1e-3 * params.length_ms) * WHISPER_SAMPLE_RATE);
n_samples_keep = int((1e-3 * params.keep_ms) * WHISPER_SAMPLE_RATE);
n_samples_30s = int((1e-3 * 30000.0) * WHISPER_SAMPLE_RATE);
use_vad = (n_samples_step <= 0);
n_new_line =
!use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1;
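// e.g. with the StreamParams defaults (step 3000 ms, length 10000 ms, keep 200 ms)
// at 16 kHz: n_samples_step = 48000, n_samples_len = 160000, n_samples_keep = 3200,
// and n_new_line = max(1, 10000/3000 - 1) = 2, i.e. a commit every 2nd window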
params.no_timestamps = !use_vad;
params.no_context |= use_vad;
// language check
if (params.language != "auto" &&
whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
throw std::runtime_error("unknown language");
}
struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
// assign member ctx
ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
throw std::runtime_error("failed to initialize whisper context");
}
// reserve buffers
pcmf32_new.clear();
pcmf32_new.reserve(n_samples_30s);
pcmf32.clear();
pcmf32_old.clear();
prompt_tokens.clear();
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr,
"%s: WARNING: model is not multilingual, ignoring language and "
"translation options\n",
__func__);
}
}
fprintf(
stderr,
"%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = "
"%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__, n_samples_step, float(n_samples_step) / WHISPER_SAMPLE_RATE,
float(n_samples_len) / WHISPER_SAMPLE_RATE,
float(n_samples_keep) / WHISPER_SAMPLE_RATE, params.n_threads,
params.language.c_str(), params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
if (!use_vad) {
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__,
n_new_line, params.no_context);
} else {
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n",
__func__);
}
fprintf(stderr, "\n");
}
// reference point for stream-relative timestamps
t_start = std::chrono::high_resolution_clock::now();
t_last = t_start;
n_iter = 0;
return true;
}
TranscriptionResult WhisperStream::process(const std::vector<float> &pcmf32_chunk) {
// append incoming samples
pcmf32_new.insert(pcmf32_new.end(), pcmf32_chunk.begin(), pcmf32_chunk.end());
// Not VAD mode: require at least one step worth of samples
if (!use_vad) {
if ((int)pcmf32_new.size() < n_samples_step) {
return TranscriptionResult(); // not enough samples yet
}
const int n_samples_new = (int)pcmf32_new.size();
// take up to params.length_ms audio from previous iteration
const int n_samples_take =
std::min((int)pcmf32_old.size(),
std::max(0, n_samples_keep + n_samples_len - n_samples_new));
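// e.g. on the very first window pcmf32_old is empty, so n_samples_take = 0;
// afterwards up to keep_ms + length_ms worth of samples, minus the fresh ones,
// is replayed from the previous window to give the model overlapping context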
pcmf32.resize(n_samples_new + n_samples_take);
// copy tail of old
for (int i = 0; i < n_samples_take; ++i) {
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
}
// copy new samples
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(),
n_samples_new * sizeof(float));
// consume new buffer for next iteration
pcmf32_old = pcmf32;
pcmf32_new.clear();
} else {
const auto t_now = std::chrono::high_resolution_clock::now();
// VAD mode: wait until at least 2 seconds of audio have accumulated; callers can tune this
if ((int)pcmf32_new.size() < 2 * WHISPER_SAMPLE_RATE) {
return TranscriptionResult();
}
if (!::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold,
params.freq_thold, false)) {
pcmf32_new.clear();
return TranscriptionResult(); // no speech detected
}
// take last length_ms worth of samples
const int take = std::min((int)pcmf32_new.size(), n_samples_len);
pcmf32.assign(pcmf32_new.end() - take, pcmf32_new.end());
pcmf32_new.clear();
t_last = t_now;
}
// run the inference
whisper_full_params wparams = whisper_full_default_params(
params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH
: WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.single_segment = !use_vad;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.beam_search.beam_size = params.beam_size;
wparams.audio_ctx = params.audio_ctx;
wparams.tdrz_enable = params.tinydiarize;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
wparams.prompt_tokens =
params.no_context
? nullptr
: (prompt_tokens.empty() ? nullptr : prompt_tokens.data());
wparams.prompt_n_tokens = params.no_context ? 0 : (int)prompt_tokens.size();
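// prompt_tokens carry the tokens of the last committed window so the decoder
// keeps textual context across windows; disabled when no_context is set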
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", __func__);
return TranscriptionResult();
}
// Assemble the result as one concatenated string; a richer wrapper could
// return structured segments (JSON or an array of objects) instead
std::string plain;
if (use_vad) {
const int64_t t1 = std::chrono::duration_cast<std::chrono::milliseconds>(t_last - t_start).count();
const int64_t t0 = std::max<int64_t>(0, t1 - (int64_t)(pcmf32.size() * 1000 / WHISPER_SAMPLE_RATE));
plain += "\n";
plain += "### Transcription " + std::to_string(n_iter) +
" START | t0 = " + std::to_string(t0) +
" ms | t1 = " + std::to_string(t1) + " ms\n";
plain += "\n";
}
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char *text = whisper_full_get_segment_text(ctx, i);
if (params.no_timestamps) {
plain += text;
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
// build the segment line piece by piece
plain += "[";
plain += to_timestamp(t0, false);
plain += " --> ";
plain += to_timestamp(t1, false);
plain += "] ";
plain += text;
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
plain += " [SPEAKER_TURN]";
}
plain += "\n";
}
}
if (use_vad) {
plain += "\n";
plain += "### Transcription n_iter END\n";
}
++n_iter;
bool will_commit = false;
if (!use_vad && (n_iter % n_new_line) == 0) {
plain += "\n";
will_commit = true;
// guard slicing: ensure pcmf32 has enough samples
if ((int)pcmf32.size() >= n_samples_keep && n_samples_keep > 0) {
pcmf32_old.assign(pcmf32.end() - n_samples_keep, pcmf32.end());
} else {
pcmf32_old = pcmf32;
}
// update prompt tokens safely
if (!params.no_context) {
prompt_tokens.clear();
const int n_segments_after = whisper_full_n_segments(ctx);
for (int si = 0; si < n_segments_after; ++si) {
const int token_count = whisper_full_n_tokens(ctx, si);
for (int ti = 0; ti < token_count; ++ti) {
prompt_tokens.push_back(whisper_full_get_token_id(ctx, si, ti));
}
}
}
}
const bool is_final = use_vad || will_commit;
return TranscriptionResult{ plain, is_final};
}
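Since process() buffers input until a full step has accumulated, the caller may feed chunks of any size; intermediate calls simply return an empty result. A sketch of the resulting cadence from the JS side, using the defaults (illustrative numbers, not from the commit):

// with stepMs = 3000 at 16 kHz, one step is 48000 samples
const CHUNK = 512; // assumed capture callback size
const callsPerStep = Math.ceil(48000 / CHUNK); // = 94
// calls 1..93 to processChunk() return { text: '', isFinal: false };
// call 94 crosses n_samples_step and triggers inference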

examples/stream.node/whisper-stream.h

@@ -0,0 +1,69 @@
// whisper-stream.h
#pragma once
#include <chrono>
#include <vector>
#include <string>
#include "whisper.h"
#include <thread>
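// Field names map one-to-one onto the camelCase keys accepted by startModel()
// in addon.cpp (n_threads <-> nThreads, step_ms <-> stepMs, and so on)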
struct StreamParams {
int n_threads = std::min(4, (int)std::thread::hardware_concurrency());
int step_ms = 3000;
int length_ms = 10000;
int keep_ms = 200;
int max_tokens = 32;
int audio_ctx = 0;
int beam_size = -1;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool translate = false;
bool no_fallback = false;
bool print_special = false;
bool no_context = true;
bool no_timestamps = false;
bool tinydiarize = false;
bool save_audio = false;
bool use_gpu = true;
bool flash_attn = false;
std::string language = "en";
std::string model;
};
struct TranscriptionResult {
std::string text;
bool final;
};
class WhisperStream {
public:
WhisperStream(const StreamParams &stream_params);
~WhisperStream();
bool init();
TranscriptionResult process(const std::vector<float> &pcmf32_chunk);
private:
StreamParams params;
// whisper context
struct whisper_context *ctx = nullptr;
// buffers (samples, not bytes)
std::vector<float> pcmf32; // assembled input for inference
std::vector<float> pcmf32_new; // appended incoming samples buffer
std::vector<float> pcmf32_old; // overlap kept for next chunk
std::vector<whisper_token> prompt_tokens;
// sample counts and flags
int n_samples_step = 0;
int n_samples_len = 0;
int n_samples_keep = 0;
int n_samples_30s = 0;
bool use_vad = false;
int n_new_line = 1;
int n_iter = 0;
std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
std::chrono::time_point<std::chrono::high_resolution_clock> t_last;
};