feat: example stream mode for nodejs addon

Kevin 2025-09-27 17:19:58 +08:00
parent 44fa2f647c
commit 5d3f20673b
8 changed files with 607 additions and 0 deletions

examples/CMakeLists.txt

@@ -100,6 +100,7 @@ if (EMSCRIPTEN)
add_subdirectory(bench.wasm)
elseif(CMAKE_JS_VERSION)
add_subdirectory(addon.node)
add_subdirectory(stream.node)
else()
add_subdirectory(cli)
add_subdirectory(bench)

examples/stream.node/CMakeLists.txt

@@ -0,0 +1,44 @@
cmake_minimum_required(VERSION 3.13)
project(stream_addon)
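# Typically built with cmake-js, mirroring the sibling addon.node example;
# an assumed invocation (not part of this commit), run from the repository root:
#   npx cmake-js compile -T stream.node -B Release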
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../cmake ${CMAKE_MODULE_PATH})
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_definitions(-DNAPI_VERSION=4)
include_directories(${CMAKE_JS_INC})
set(TARGET stream.node)
add_library(${TARGET} SHARED
addon.cpp
whisper-stream.cpp
)
set_target_properties(${TARGET} PROPERTIES
PREFIX ""
SUFFIX ".node"
)
include(DefaultTargetOptions)
execute_process(COMMAND node -p "require('node-addon-api').include"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE NODE_ADD_ON_API_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE)
string(REPLACE "\"" "" NODE_ADD_ON_API_DIR ${NODE_ADD_ON_API_DIR})
string(REPLACE "\\\\" "/" NODE_ADD_ON_API_DIR ${NODE_ADD_ON_API_DIR})
target_include_directories(${TARGET} PRIVATE ${NODE_ADD_ON_API_DIR})
target_link_libraries(${TARGET}
whisper
common
${CMAKE_JS_LIB}
${CMAKE_THREAD_LIBS_INIT}
)
if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
# generate node.lib from the .def file that cmake-js provides on Windows
execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
endif()

examples/stream.node/addon.cpp

@@ -0,0 +1,114 @@
#include "addon.h" // Your header file for WhisperStreamWrapper
#include "whisper-stream.h" // Your header file for the WhisperStream class
// NOTE: The N-API wrapper handles errors by throwing JS exceptions, so this macro is not needed.
// #define CHECK_STATUS(env, status, msg) ...
// --- Implementation of the Wrapper ---
Napi::Object WhisperStreamWrapper::Init(Napi::Env env, Napi::Object exports) {
Napi::Function func = DefineClass(env, "WhisperStream", {
InstanceMethod("startModel", &WhisperStreamWrapper::startModel),
InstanceMethod("processChunk", &WhisperStreamWrapper::ProcessChunk),
InstanceMethod("freeModel", &WhisperStreamWrapper::freeModel),
});
exports.Set("WhisperStream", func);
return exports;
}
WhisperStreamWrapper::WhisperStreamWrapper(const Napi::CallbackInfo& info) : Napi::ObjectWrap<WhisperStreamWrapper>(info) {
}
WhisperStreamWrapper::~WhisperStreamWrapper() {
// release the native stream if the JS object is collected without freeModel()
delete whisperStream_;
}
Napi::Value WhisperStreamWrapper::startModel(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (info.Length() < 1 || !info[0].IsObject()) {
Napi::TypeError::New(env, "Expected a configuration object").ThrowAsJavaScriptException();
return env.Null();
}
Napi::Object js_params = info[0].As<Napi::Object>();
StreamParams params;
if (js_params.Has("modelPath")) {
params.model = js_params.Get("modelPath").As<Napi::String>();
} else {
Napi::TypeError::New(env, "Missing required parameter 'model'").ThrowAsJavaScriptException();
return env.Null();
}
if (js_params.Has("language")) params.language = js_params.Get("language").As<Napi::String>();
if (js_params.Has("nThreads")) params.n_threads = js_params.Get("nThreads").As<Napi::Number>();
if (js_params.Has("stepMs")) params.step_ms = js_params.Get("stepMs").As<Napi::Number>();
if (js_params.Has("lengthMs")) params.length_ms = js_params.Get("lengthMs").As<Napi::Number>();
if (js_params.Has("keepMs")) params.keep_ms = js_params.Get("keepMs").As<Napi::Number>();
if (js_params.Has("maxTokens")) params.max_tokens = js_params.Get("maxTokens").As<Napi::Number>();
if (js_params.Has("audioCtx")) params.audio_ctx = js_params.Get("audioCtx").As<Napi::Number>();
if (js_params.Has("vadThold")) params.vad_thold = js_params.Get("vadThold").As<Napi::Number>();
if (js_params.Has("beamSize")) params.beam_size = js_params.Get("beamSize").As<Napi::Number>();
if (js_params.Has("freqThold")) params.freq_thold = js_params.Get("freqThold").As<Napi::Number>();
if (js_params.Has("translate")) params.translate = js_params.Get("translate").As<Napi::Boolean>();
if (js_params.Has("noFallback")) params.no_fallback = js_params.Get("noFallback").As<Napi::Boolean>();
if (js_params.Has("printSpecial")) params.print_special = js_params.Get("printSpecial").As<Napi::Boolean>();
if (js_params.Has("noContext")) params.no_context = js_params.Get("noContext").As<Napi::Boolean>();
if (js_params.Has("noTimestamps")) params.no_timestamps = js_params.Get("noTimestamps").As<Napi::Boolean>();
if (js_params.Has("tinydiarize")) params.tinydiarize = js_params.Get("tinydiarize").As<Napi::Boolean>();
if (js_params.Has("saveAudio")) params.save_audio = js_params.Get("saveAudio").As<Napi::Boolean>();
if (js_params.Has("useGpu")) params.use_gpu = js_params.Get("useGpu").As<Napi::Boolean>();
if (js_params.Has("flashAttn")) params.flash_attn = js_params.Get("flashAttn").As<Napi::Boolean>();
if (this->whisperStream_) {
delete this->whisperStream_;
this->whisperStream_ = nullptr;
}
try {
this->whisperStream_ = new WhisperStream(params);
this->whisperStream_->init();
} catch (const std::exception& e) {
// do not keep a half-initialized stream around
delete this->whisperStream_;
this->whisperStream_ = nullptr;
Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
return env.Null();
}
return env.Undefined();
}
Napi::Value WhisperStreamWrapper::processChunk(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (!this->whisperStream_) {
Napi::Error::New(env, "Model not started. Call startModel() first.").ThrowAsJavaScriptException();
return env.Null();
}
if (info.Length() < 1 || !info[0].IsTypedArray() || info[0].As<Napi::TypedArray>().TypedArrayType() != napi_float32_array) {
Napi::TypeError::New(env, "Argument must be a Float32Array").ThrowAsJavaScriptException();
return env.Null();
}
Napi::Float32Array pcmf32_array = info[0].As<Napi::Float32Array>();
std::vector<float> pcmf32_new(pcmf32_array.Data(), pcmf32_array.Data() + pcmf32_array.ElementLength());
TranscriptionResult result = this->whisperStream_->process(pcmf32_new);
Napi::Object resultObj = Napi::Object::New(env);
resultObj.Set("text", Napi::String::New(env, result.text));
resultObj.Set("isFinal", Napi::Boolean::New(env, result.final));
return resultObj;
}
Napi::Value WhisperStreamWrapper::freeModel(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (this->whisperStream_) {
delete this->whisperStream_;
this->whisperStream_ = nullptr;
}
return env.Undefined();
}
Napi::Object InitAll(Napi::Env env, Napi::Object exports) {
return WhisperStreamWrapper::Init(env, exports);
}
NODE_API_MODULE(whisper, InitAll)
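A minimal sketch of driving the exported class from JavaScript, assuming the addon has been built and a ggml model downloaded (paths here are illustrative):

const { WhisperStream } = require('./build/Release/stream.node');

const ws = new WhisperStream();
ws.startModel({ modelPath: 'models/ggml-base.en.bin', language: 'en' });

// one second of 16 kHz mono silence; real input comes from a capture library
const chunk = new Float32Array(16000);
const { text, isFinal } = ws.processChunk(chunk);
console.log(isFinal, text);

ws.freeModel();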

examples/stream.node/addon.h

@@ -0,0 +1,17 @@
#pragma once
#include <napi.h>
#include "whisper-stream.h"
class WhisperStreamWrapper : public Napi::ObjectWrap<WhisperStreamWrapper> {
public:
static Napi::Object Init(Napi::Env env, Napi::Object exports);
WhisperStreamWrapper(const Napi::CallbackInfo& info);
~WhisperStreamWrapper();
private:
Napi::Value startModel(const Napi::CallbackInfo& info);
Napi::Value processChunk(const Napi::CallbackInfo& info);
Napi::Value freeModel(const Napi::CallbackInfo& info);
WhisperStream* whisperStream_ = nullptr;
};

examples/stream.node/index_naudiodon.js

@@ -0,0 +1,83 @@
const path = require('path');
const portAudio = require('naudiodon2');
const addonPath = path.join(__dirname, '..', '..', 'build', 'Release', 'stream.node');
const { WhisperStream } = require(addonPath);
const modelPath = path.join(__dirname, '..', '..', 'models', 'ggml-base.en.bin');
const SAMPLE_RATE = 16000;
// --- Main Application ---
async function main() {
const whisper = new WhisperStream();
let pendingText = ''; // Buffer for the current unconfirmed text
console.log('Loading model...');
whisper.startModel({
modelPath: modelPath,
language: 'en',
nThreads: 4,
stepMs: 3000,
lengthMs: 10000,
keepMs: 200,
useGpu: true,
});
console.log('Model loaded.');
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: SAMPLE_RATE,
deviceId: -1,
closeOnError: true,
}
});
ai.on('data', (chunk) => {
const floatCount = chunk.length / Float32Array.BYTES_PER_ELEMENT;
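// view the Buffer's bytes in place as 32-bit floats (no copy); valid because
// the stream was opened with SampleFormatFloat32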
const float32 = new Float32Array(chunk.buffer, chunk.byteOffset, floatCount);
try {
const result = whisper.processChunk(float32);
if (!result || !result.text) return;
const { text, isFinal } = result;
if (isFinal) {
process.stdout.write(`\r${text}\n`);
pendingText = ''; // Reset for the next utterance
} else {
pendingText = text;
// '\r' moves cursor to the start, '\x1B[K' clears the rest of the line.
process.stdout.write(`\r${pendingText}\x1B[K`);
}
} catch (err) {
console.error('Error during processing:', err);
}
});
ai.on('error', (err) => console.error('Audio input error:', err));
ai.start();
console.log('Recording from microphone. Speak now.');
process.stdout.write('> ');
const shutdown = () => {
console.log('\nShutting down...');
ai.quit(() => {
whisper.freeModel();
process.exit(0);
});
};
process.on('SIGINT', shutdown);
process.on('SIGTERM', shutdown);
}
main().catch((err) => {
console.error('An unexpected error occurred:', err);
process.exit(1);
});
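The stream above requests 32-bit float samples directly. If a device or backend only delivers 16-bit PCM (naudiodon's SampleFormat16Bit), a conversion helper along these lines would be needed before calling processChunk; a sketch, not part of this commit:

function int16ToFloat32(buf) {
  // reinterpret the Buffer as Int16 samples (assumes 2-byte alignment)
  const int16 = new Int16Array(buf.buffer, buf.byteOffset, buf.length / 2);
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768; // scale to [-1, 1)
  }
  return float32;
}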

examples/stream.node/package.json

@@ -0,0 +1,16 @@
{
"name": "stream.node",
"version": "1.0.0",
"main": "index.js",
"scripts": {
"start": "node index_naudiodon.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"dependencies": {
"naudiodon2": "^2.3.6"
},
"devDependencies": {
"cmake-js": "^7.3.1",
"node-addon-api": "^5.1.0"
}
}

examples/stream.node/whisper-stream.cpp

@@ -0,0 +1,263 @@
// whisper-stream.cpp
#include "whisper-stream.h"
#include "common-whisper.h"
#include "common.h"
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <stdexcept>
WhisperStream::WhisperStream(const StreamParams &stream_params)
: params(stream_params) {}
WhisperStream::~WhisperStream() {
if (ctx) {
whisper_print_timings(ctx);
whisper_free(ctx);
ctx = nullptr;
}
pcmf32_old.clear();
pcmf32_new.clear();
pcmf32.clear();
prompt_tokens.clear();
}
bool WhisperStream::init() {
// ensure keep/length constraints
params.keep_ms = std::min(params.keep_ms, params.step_ms);
params.length_ms = std::max(params.length_ms, params.step_ms);
// store sample counts as members (SAMPLES, not bytes)
n_samples_step = int((1e-3 * params.step_ms) * WHISPER_SAMPLE_RATE);
n_samples_len = int((1e-3 * params.length_ms) * WHISPER_SAMPLE_RATE);
n_samples_keep = int((1e-3 * params.keep_ms) * WHISPER_SAMPLE_RATE);
n_samples_30s = int((1e-3 * 30000.0) * WHISPER_SAMPLE_RATE);
use_vad = (n_samples_step <= 0);
n_new_line =
!use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1;
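// e.g. with the StreamParams defaults (step 3000 ms, length 10000 ms, keep 200 ms)
// at 16 kHz: n_samples_step = 48000, n_samples_len = 160000, n_samples_keep = 3200,
// and n_new_line = max(1, 10000/3000 - 1) = 2, i.e. a commit every 2nd window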
params.no_timestamps = !use_vad;
params.no_context |= use_vad;
// language check
if (params.language != "auto" &&
whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
throw std::runtime_error("unknown language");
}
struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
// assign member ctx
ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
throw std::runtime_error("failed to initialize whisper context");
}
// reserve buffers
pcmf32_new.clear();
pcmf32_new.reserve(n_samples_30s);
pcmf32.clear();
pcmf32_old.clear();
prompt_tokens.clear();
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr,
"%s: WARNING: model is not multilingual, ignoring language and "
"translation options\n",
__func__);
}
}
fprintf(
stderr,
"%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = "
"%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__, n_samples_step, float(n_samples_step) / WHISPER_SAMPLE_RATE,
float(n_samples_len) / WHISPER_SAMPLE_RATE,
float(n_samples_keep) / WHISPER_SAMPLE_RATE, params.n_threads,
params.language.c_str(), params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
if (!use_vad) {
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__,
n_new_line, params.no_context);
} else {
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n",
__func__);
}
fprintf(stderr, "\n");
}
// reference point for stream-relative timestamps
t_start = std::chrono::high_resolution_clock::now();
t_last = t_start;
n_iter = 0;
return true;
}
TranscriptionResult WhisperStream::process(const std::vector<float> &pcmf32_chunk) {
// append incoming samples
pcmf32_new.insert(pcmf32_new.end(), pcmf32_chunk.begin(), pcmf32_chunk.end());
// Not VAD mode: require at least one step worth of samples
if (!use_vad) {
if ((int)pcmf32_new.size() < n_samples_step) {
return TranscriptionResult(); // not enough samples yet
}
const int n_samples_new = (int)pcmf32_new.size();
// take up to params.length_ms audio from previous iteration
const int n_samples_take =
std::min((int)pcmf32_old.size(),
std::max(0, n_samples_keep + n_samples_len - n_samples_new));
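// e.g. on the very first window pcmf32_old is empty, so n_samples_take = 0;
// afterwards up to keep_ms + length_ms worth of samples, minus the fresh ones,
// is replayed from the previous window to give the model overlapping context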
pcmf32.resize(n_samples_new + n_samples_take);
// copy tail of old
for (int i = 0; i < n_samples_take; ++i) {
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
}
// copy new samples
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(),
n_samples_new * sizeof(float));
// consume new buffer for next iteration
pcmf32_old = pcmf32;
pcmf32_new.clear();
} else {
const auto t_now = std::chrono::high_resolution_clock::now();
// VAD mode: wait until at least 2 seconds of audio have accumulated; callers can tune this
if ((int)pcmf32_new.size() < 2 * WHISPER_SAMPLE_RATE) {
return TranscriptionResult();
}
if (!::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold,
params.freq_thold, false)) {
pcmf32_new.clear();
return TranscriptionResult(); // no speech detected
}
// take last length_ms worth of samples
const int take = std::min((int)pcmf32_new.size(), n_samples_len);
pcmf32.assign(pcmf32_new.end() - take, pcmf32_new.end());
pcmf32_new.clear();
t_last = t_now;
}
// run the inference
whisper_full_params wparams = whisper_full_default_params(
params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH
: WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.single_segment = !use_vad;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.beam_search.beam_size = params.beam_size;
wparams.audio_ctx = params.audio_ctx;
wparams.tdrz_enable = params.tinydiarize;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
wparams.prompt_tokens =
params.no_context
? nullptr
: (prompt_tokens.empty() ? nullptr : prompt_tokens.data());
wparams.prompt_n_tokens = params.no_context ? 0 : (int)prompt_tokens.size();
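// prompt_tokens carry the tokens of the last committed window so the decoder
// keeps textual context across windows; disabled when no_context is set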
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", __func__);
return TranscriptionResult();
}
// Assemble the result as one concatenated string; a richer wrapper could
// return structured segments (JSON or an array of objects) instead
std::string plain;
if (use_vad) {
const int64_t t1 = std::chrono::duration_cast<std::chrono::milliseconds>(t_last - t_start).count();
const int64_t t0 = std::max<int64_t>(0, t1 - (int64_t)(pcmf32.size() * 1000 / WHISPER_SAMPLE_RATE));
plain += "\n";
plain += "### Transcription " + std::to_string(n_iter) +
" START | t0 = " + std::to_string(t0) +
" ms | t1 = " + std::to_string(t1) + " ms\n";
plain += "\n";
}
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char *text = whisper_full_get_segment_text(ctx, i);
if (params.no_timestamps) {
plain += text;
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
// build the segment line piece by piece
plain += "[";
plain += to_timestamp(t0, false);
plain += " --> ";
plain += to_timestamp(t1, false);
plain += "] ";
plain += text;
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
plain += " [SPEAKER_TURN]";
}
plain += "\n";
}
}
if (use_vad) {
plain += "\n";
plain += "### Transcription n_iter END\n";
}
++n_iter;
bool will_commit = false;
if (!use_vad && (n_iter % n_new_line) == 0) {
plain += "\n";
will_commit = true;
// guard slicing: ensure pcmf32 has enough samples
if ((int)pcmf32.size() >= n_samples_keep && n_samples_keep > 0) {
pcmf32_old.assign(pcmf32.end() - n_samples_keep, pcmf32.end());
} else {
pcmf32_old = pcmf32;
}
// update prompt tokens safely
if (!params.no_context) {
prompt_tokens.clear();
const int n_segments_after = whisper_full_n_segments(ctx);
for (int si = 0; si < n_segments_after; ++si) {
const int token_count = whisper_full_n_tokens(ctx, si);
for (int ti = 0; ti < token_count; ++ti) {
prompt_tokens.push_back(whisper_full_get_token_id(ctx, si, ti));
}
}
}
}
const bool is_final = use_vad || will_commit;
return TranscriptionResult{ plain, is_final};
}
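Since process() buffers input until a full step has accumulated, the caller may feed chunks of any size; intermediate calls simply return an empty result. A sketch of the resulting cadence from the JS side, using the defaults (illustrative numbers, not from the commit):

// with stepMs = 3000 at 16 kHz, one step is 48000 samples
const CHUNK = 512; // assumed capture callback size
const callsPerStep = Math.ceil(48000 / CHUNK); // = 94
// calls 1..93 to processChunk() return { text: '', isFinal: false };
// call 94 crosses n_samples_step and triggers inference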

examples/stream.node/whisper-stream.h

@@ -0,0 +1,69 @@
// whisper-stream.h
#pragma once
#include <chrono>
#include <vector>
#include <string>
#include "whisper.h"
#include <thread>
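// Field names map one-to-one onto the camelCase keys accepted by startModel()
// in addon.cpp (n_threads <-> nThreads, step_ms <-> stepMs, and so on)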
struct StreamParams {
int n_threads = std::min(4, (int)std::thread::hardware_concurrency());
int step_ms = 3000;
int length_ms = 10000;
int keep_ms = 200;
int max_tokens = 32;
int audio_ctx = 0;
int beam_size = -1;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool translate = false;
bool no_fallback = false;
bool print_special = false;
bool no_context = true;
bool no_timestamps = false;
bool tinydiarize = false;
bool save_audio = false;
bool use_gpu = true;
bool flash_attn = false;
std::string language = "en";
std::string model;
};
struct TranscriptionResult {
std::string text;
bool final;
};
class WhisperStream {
public:
WhisperStream(const StreamParams &stream_params);
~WhisperStream();
bool init();
TranscriptionResult process(const std::vector<float> &pcmf32_chunk);
private:
StreamParams params;
// whisper context
struct whisper_context *ctx = nullptr;
// buffers (samples, not bytes)
std::vector<float> pcmf32; // assembled input for inference
std::vector<float> pcmf32_new; // appended incoming samples buffer
std::vector<float> pcmf32_old; // overlap kept for next chunk
std::vector<whisper_token> prompt_tokens;
// sample counts and flags
int n_samples_step = 0;
int n_samples_len = 0;
int n_samples_keep = 0;
int n_samples_30s = 0;
bool use_vad = false;
int n_new_line = 1;
int n_iter = 0;
std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
std::chrono::time_point<std::chrono::high_resolution_clock> t_last;
};