feat: example stream mode for nodejs addon
This commit is contained in:
parent
44fa2f647c
commit
5d3f20673b
|
|
@ -100,6 +100,7 @@ if (EMSCRIPTEN)
|
|||
add_subdirectory(bench.wasm)
|
||||
elseif(CMAKE_JS_VERSION)
|
||||
add_subdirectory(addon.node)
|
||||
add_subdirectory(stream.node)
|
||||
else()
|
||||
add_subdirectory(cli)
|
||||
add_subdirectory(bench)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,44 @@
|
|||
cmake_minimum_required(VERSION 3.13)
project(stream_addon)

# Locate the shared whisper.cpp CMake helpers (DefaultTargetOptions, ...).
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../cmake ${CMAKE_MODULE_PATH})

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Target N-API version 4 and pick up the Node headers provided by cmake-js.
add_definitions(-DNAPI_VERSION=4)
include_directories(${CMAKE_JS_INC})

set(TARGET stream.node)

add_library(${TARGET} SHARED
    addon.cpp
    whisper-stream.cpp
)

# Node requires the module file to be named exactly "stream.node":
# no "lib" prefix, ".node" suffix.
set_target_properties(${TARGET} PROPERTIES
    PREFIX ""
    SUFFIX ".node"
)
include(DefaultTargetOptions)

# Resolve the node-addon-api header directory from the installed npm package.
execute_process(COMMAND node -p "require('node-addon-api').include"
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    OUTPUT_VARIABLE NODE_ADD_ON_API_DIR
    OUTPUT_STRIP_TRAILING_WHITESPACE)
# The npm helper prints a quoted path (with backslashes on Windows) — normalize it.
string(REPLACE "\"" "" NODE_ADD_ON_API_DIR ${NODE_ADD_ON_API_DIR})
string(REPLACE "\\\\" "/" NODE_ADD_ON_API_DIR ${NODE_ADD_ON_API_DIR})
target_include_directories(${TARGET} PRIVATE ${NODE_ADD_ON_API_DIR})

target_link_libraries(${TARGET}
    whisper
    common
    ${CMAKE_THREAD_LIBS_INIT}
)

# MSVC: generate the node.exe import library from the .def file supplied by
# cmake-js and link against it; otherwise link the library cmake-js hands us.
# Fixed: the original tested the misspelled variables CMAKE_NODEJS_DEF /
# CMAKE_JS_NODEJS_TARGET and copied to an undefined CMAKE_NODEJS_LIB_TARGET,
# so neither branch could ever take effect. The variables below are the ones
# cmake-js actually defines (CMAKE_JS_NODELIB_DEF / CMAKE_JS_NODELIB_TARGET).
if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
    execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
    target_link_libraries(${TARGET} ${CMAKE_JS_NODELIB_TARGET})
elseif(CMAKE_JS_LIB)
    target_link_libraries(${TARGET} ${CMAKE_JS_LIB})
endif()
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
#include "addon.h"           // WhisperStreamWrapper declaration
#include "whisper-stream.h"  // WhisperStream engine class

#include <exception>         // std::exception (caught in startModel)
|
||||
|
||||
// NOTE: The N-API wrapper handles errors by throwing JS exceptions, so this macro is not needed.
|
||||
// #define CHECK_STATUS(env, status, msg) ...
|
||||
|
||||
// --- Implementation of the Wrapper ---
|
||||
|
||||
// Registers the WhisperStream class on the module exports object.
// JS-visible methods: startModel(config), processChunk(Float32Array),
// freeModel().
Napi::Object WhisperStreamWrapper::Init(Napi::Env env, Napi::Object exports) {
    const char* kClassName = "WhisperStream";

    Napi::Function ctor = DefineClass(
        env, kClassName,
        {
            InstanceMethod("startModel", &WhisperStreamWrapper::startModel),
            InstanceMethod("processChunk", &WhisperStreamWrapper::ProcessChunk),
            InstanceMethod("freeModel", &WhisperStreamWrapper::freeModel),
        });

    exports.Set(kClassName, ctor);
    return exports;
}
|
||||
|
||||
// Constructed from JS via `new WhisperStream()`. No work is done here; the
// native engine is created lazily in startModel().
WhisperStreamWrapper::WhisperStreamWrapper(const Napi::CallbackInfo& info) : Napi::ObjectWrap<WhisperStreamWrapper>(info) {
}
|
||||
|
||||
// Loads a whisper model and (re)initializes the native streaming engine from
// a JS configuration object.
//
// Required key: `modelPath` (path to a ggml model file). Every other key is
// an optional override of a StreamParams default (language, nThreads, stepMs,
// lengthMs, keepMs, maxTokens, audioCtx, vadThold, beamSize, freqThold,
// translate, noFallback, printSpecial, noContext, noTimestamps, tinydiarize,
// saveAudio, useGpu, flashAttn).
//
// Throws a JS TypeError on bad arguments and a JS Error when model loading
// fails; returns undefined on success.
Napi::Value WhisperStreamWrapper::startModel(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    if (info.Length() < 1 || !info[0].IsObject()) {
        Napi::TypeError::New(env, "Expected a configuration object").ThrowAsJavaScriptException();
        return env.Null();
    }

    Napi::Object js_params = info[0].As<Napi::Object>();
    StreamParams params;

    if (js_params.Has("modelPath")) {
        params.model = js_params.Get("modelPath").As<Napi::String>();
    } else {
        // Fixed: the message now names the JS-facing key 'modelPath'
        // (it previously said 'model', which is not a key callers pass).
        Napi::TypeError::New(env, "Missing required parameter 'modelPath'").ThrowAsJavaScriptException();
        return env.Null();
    }

    if (js_params.Has("language")) params.language = js_params.Get("language").As<Napi::String>();
    if (js_params.Has("nThreads")) params.n_threads = js_params.Get("nThreads").As<Napi::Number>();
    if (js_params.Has("stepMs")) params.step_ms = js_params.Get("stepMs").As<Napi::Number>();
    if (js_params.Has("lengthMs")) params.length_ms = js_params.Get("lengthMs").As<Napi::Number>();
    if (js_params.Has("keepMs")) params.keep_ms = js_params.Get("keepMs").As<Napi::Number>();
    if (js_params.Has("maxTokens")) params.max_tokens = js_params.Get("maxTokens").As<Napi::Number>();
    if (js_params.Has("audioCtx")) params.audio_ctx = js_params.Get("audioCtx").As<Napi::Number>();
    if (js_params.Has("vadThold")) params.vad_thold = js_params.Get("vadThold").As<Napi::Number>();
    if (js_params.Has("beamSize")) params.beam_size = js_params.Get("beamSize").As<Napi::Number>();
    if (js_params.Has("freqThold")) params.freq_thold = js_params.Get("freqThold").As<Napi::Number>();
    if (js_params.Has("translate")) params.translate = js_params.Get("translate").As<Napi::Boolean>();
    if (js_params.Has("noFallback")) params.no_fallback = js_params.Get("noFallback").As<Napi::Boolean>();
    if (js_params.Has("printSpecial")) params.print_special = js_params.Get("printSpecial").As<Napi::Boolean>();
    if (js_params.Has("noContext")) params.no_context = js_params.Get("noContext").As<Napi::Boolean>();
    if (js_params.Has("noTimestamps")) params.no_timestamps = js_params.Get("noTimestamps").As<Napi::Boolean>();
    if (js_params.Has("tinydiarize")) params.tinydiarize = js_params.Get("tinydiarize").As<Napi::Boolean>();
    if (js_params.Has("saveAudio")) params.save_audio = js_params.Get("saveAudio").As<Napi::Boolean>();
    if (js_params.Has("useGpu")) params.use_gpu = js_params.Get("useGpu").As<Napi::Boolean>();
    if (js_params.Has("flashAttn")) params.flash_attn = js_params.Get("flashAttn").As<Napi::Boolean>();

    // Release any previously loaded model. Null the pointer immediately so a
    // throwing `new`/init() below cannot leave a dangling pointer behind
    // (which freeModel() / the destructor would then double-delete).
    if (this->whisperStream_) {
        delete this->whisperStream_;
        this->whisperStream_ = nullptr;
    }

    try {
        this->whisperStream_ = new WhisperStream(params);
        this->whisperStream_->init();
    } catch (const std::exception& e) {
        // std::exception covers both std::runtime_error thrown by init() and
        // std::bad_alloc from `new`; previously bad_alloc escaped the catch
        // and would terminate the process. Clean up the half-built engine.
        delete this->whisperStream_;
        this->whisperStream_ = nullptr;
        Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
        return env.Null();
    }

    return env.Undefined();
}
|
||||
|
||||
// Feeds one Float32Array of mono PCM into the stream and returns an object
// `{ text, isFinal }`. Throws a JS Error if startModel() has not been called,
// and a JS TypeError if the argument is not a Float32Array.
Napi::Value WhisperStreamWrapper::ProcessChunk(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    if (this->whisperStream_ == nullptr) {
        Napi::Error::New(env, "Model not started. Call startModel() first.").ThrowAsJavaScriptException();
        return env.Null();
    }

    const bool has_float32_arg =
        info.Length() >= 1 &&
        info[0].IsTypedArray() &&
        info[0].As<Napi::TypedArray>().TypedArrayType() == napi_float32_array;

    if (!has_float32_arg) {
        Napi::TypeError::New(env, "Argument must be a Float32Array").ThrowAsJavaScriptException();
        return env.Null();
    }

    // Copy the samples out of the JS-owned buffer into a std::vector for the
    // native engine.
    Napi::Float32Array samples = info[0].As<Napi::Float32Array>();
    const float* first = samples.Data();
    std::vector<float> chunk(first, first + samples.ElementLength());

    const TranscriptionResult result = this->whisperStream_->process(chunk);

    Napi::Object out = Napi::Object::New(env);
    out.Set("text", Napi::String::New(env, result.text));
    out.Set("isFinal", Napi::Boolean::New(env, result.final));
    return out;
}
|
||||
|
||||
// Releases the native engine (whisper context and buffers). Safe to call more
// than once; after this, processChunk() throws until startModel() runs again.
Napi::Value WhisperStreamWrapper::freeModel(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    WhisperStream* engine = this->whisperStream_;
    this->whisperStream_ = nullptr;
    delete engine;  // deleting nullptr is a harmless no-op

    return env.Undefined();
}
|
||||
|
||||
// Module initialization entry point: attaches the WhisperStream class to
// `exports`.
Napi::Object InitAll(Napi::Env env, Napi::Object exports) {
    return WhisperStreamWrapper::Init(env, exports);
}

// Registers the addon under the module name "whisper".
NODE_API_MODULE(whisper, InitAll)
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
#pragma once
|
||||
|
||||
#include <napi.h>
|
||||
#include "whisper-stream.h"
|
||||
|
||||
// N-API wrapper that exposes the native WhisperStream engine to JavaScript
// as the `WhisperStream` class (methods: startModel, processChunk, freeModel).
class WhisperStreamWrapper : public Napi::ObjectWrap<WhisperStreamWrapper> {
public:
    static Napi::Object Init(Napi::Env env, Napi::Object exports);
    WhisperStreamWrapper(const Napi::CallbackInfo& info);
    // Release the native engine if the JS object is garbage-collected without
    // an explicit freeModel() call (fixes a whisper-context leak: the class
    // previously had no destructor for the owned pointer below).
    ~WhisperStreamWrapper() { delete whisperStream_; }

private:
    // JS: startModel(config) -> undefined; loads the model described by `config`.
    Napi::Value startModel(const Napi::CallbackInfo& info);
    // JS: processChunk(Float32Array) -> { text, isFinal }.
    Napi::Value ProcessChunk(const Napi::CallbackInfo& info);
    // JS: freeModel() -> undefined; releases the native engine.
    Napi::Value freeModel(const Napi::CallbackInfo& info);

    // Owned engine instance: created by startModel, destroyed by freeModel,
    // by a subsequent startModel, or by the destructor above.
    WhisperStream* whisperStream_ = nullptr;
};
|
||||
|
|
@ -0,0 +1,83 @@
|
|||
// Live microphone transcription example for the stream.node addon.
// Captures mono float32 audio with naudiodon2 and feeds it to the native
// WhisperStream, rewriting the current console line with partial results and
// committing a new line when the engine marks a result final.
const path = require('path');
const os = require('os'); // NOTE(review): unused in this script — confirm before removing
const portAudio = require('naudiodon2');

// Compiled addon produced by cmake-js (build/Release at the project root).
const addonPath = path.join(__dirname, '..', '..', 'build', 'Release', 'stream.node');

const { WhisperStream } = require(addonPath);

const modelPath = path.join(__dirname, '..', '..', 'models', 'ggml-base.en.bin');
const SAMPLE_RATE = 16000; // whisper expects 16 kHz mono PCM

// --- Main Application ---
async function main() {
  const whisper = new WhisperStream();
  let pendingText = ''; // Buffer for the current unconfirmed text

  console.log('Loading model...');
  // startModel throws a JS exception on failure (e.g. missing model file);
  // that propagates to the main().catch handler below.
  whisper.startModel({
    modelPath: modelPath,
    language: 'en',
    nThreads: 4,
    stepMs: 3000,    // emit a partial result roughly every 3 s of audio
    lengthMs: 10000, // sliding analysis window length
    keepMs: 200,     // audio overlap carried between windows
    useGpu: true,
  });
  console.log('Model loaded.');

  // Microphone capture stream; deviceId -1 selects the default input device.
  const ai = new portAudio.AudioIO({
    inOptions: {
      channelCount: 1,
      sampleFormat: portAudio.SampleFormatFloat32,
      sampleRate: SAMPLE_RATE,
      deviceId: -1,
      closeOnError: true,
    }
  });

  ai.on('data', (chunk) => {
    // `chunk` is a Node Buffer of raw float32 samples; create a typed-array
    // view over the same memory (no copy) to hand to the addon.
    const floatCount = chunk.length / Float32Array.BYTES_PER_ELEMENT;
    const float32 = new Float32Array(chunk.buffer, chunk.byteOffset, floatCount);

    try {
      const result = whisper.processChunk(float32);
      if (!result || !result.text) return;

      const { text, isFinal } = result;

      if (isFinal) {
        // Commit the finished line and start a fresh one.
        process.stdout.write(`\r${text}\n`);
        pendingText = ''; // Reset for the next utterance
      } else {
        pendingText = text;
        // '\r' moves cursor to the start, '\x1B[K' clears the rest of the line.
        process.stdout.write(`\r${pendingText}\x1B[K`);
      }
    } catch (err) {
      console.error('Error during processing:', err);
    }
  });

  ai.on('error', (err) => console.error('Audio input error:', err));

  ai.start();
  console.log('Recording from microphone. Speak now.');
  process.stdout.write('> ');

  // On Ctrl-C / kill: stop audio capture first, then free the native model.
  const shutdown = () => {
    console.log('\nShutting down...');
    ai.quit(() => {
      whisper.freeModel();
      process.exit(0);
    });
  };

  process.on('SIGINT', shutdown);
  process.on('SIGTERM', shutdown);
}

main().catch((err) => {
  console.error('An unexpected error occurred:', err);
  process.exit(1);
});
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"name": "stream.node",
|
||||
"version": "1.0.0",
|
||||
"main": "index_naudiodon.js",
|
||||
"scripts": {
|
||||
"start": "node index_naudiodon.js",
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"dependencies": {
|
||||
"naudiodon2": "^2.3.6"
|
||||
},
|
||||
"devDependencies": {
|
||||
"cmake-js": "^7.3.1",
|
||||
"node-addon-api": "^5.1.0"
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,263 @@
|
|||
// whisper-stream.cpp
|
||||
#include "whisper-stream.h"
|
||||
#include "common-whisper.h"
|
||||
#include "common.h"
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
// Stores the configuration only; no whisper state is created until init().
WhisperStream::WhisperStream(const StreamParams &stream_params)
    : params(stream_params) {}
|
||||
|
||||
// Prints the whisper timing summary, releases the context, and drops all
// buffered audio / prompt state.
WhisperStream::~WhisperStream() {
    if (ctx != nullptr) {
        whisper_print_timings(ctx);
        whisper_free(ctx);
        ctx = nullptr;
    }
    // The vectors would be destroyed anyway; clearing keeps the teardown
    // explicit and mirrors init().
    prompt_tokens.clear();
    pcmf32.clear();
    pcmf32_new.clear();
    pcmf32_old.clear();
}
|
||||
|
||||
// Validates and derives the streaming parameters, loads the whisper model,
// and resets all buffers. Returns true on success; throws std::runtime_error
// on an unknown language or a failed model load.
bool WhisperStream::init() {

  // ensure keep/length constraints
  params.keep_ms = std::min(params.keep_ms, params.step_ms);
  params.length_ms = std::max(params.length_ms, params.step_ms);

  // store sample counts as members (SAMPLES, not bytes)
  n_samples_step = int((1e-3 * params.step_ms) * WHISPER_SAMPLE_RATE);
  n_samples_len = int((1e-3 * params.length_ms) * WHISPER_SAMPLE_RATE);
  n_samples_keep = int((1e-3 * params.keep_ms) * WHISPER_SAMPLE_RATE);
  n_samples_30s = int((1e-3 * 30000.0) * WHISPER_SAMPLE_RATE);

  // VAD mode is selected by a non-positive step (step_ms <= 0), matching the
  // whisper.cpp stream example's convention.
  use_vad = (n_samples_step <= 0);

  // In step mode, commit a line every (length/step - 1) iterations; in VAD
  // mode every result is a committed line.
  n_new_line =
      !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1;

  // Step mode disables timestamps; VAD mode forces no_context.
  params.no_timestamps = !use_vad;
  params.no_context |= use_vad;

  // language check
  if (params.language != "auto" &&
      whisper_lang_id(params.language.c_str()) == -1) {
    fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
    throw std::runtime_error("unknown language");
  }

  struct whisper_context_params cparams = whisper_context_default_params();
  cparams.use_gpu = params.use_gpu;
  cparams.flash_attn = params.flash_attn;

  // assign member ctx
  ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
  if (ctx == nullptr) {
    fprintf(stderr, "error: failed to initialize whisper context\n");
    throw std::runtime_error("failed to initialize whisper context");
  }

  // reserve buffers (30 s capacity hint avoids reallocations while streaming)
  pcmf32_new.clear();
  pcmf32_new.reserve(n_samples_30s);
  pcmf32.clear();
  pcmf32_old.clear();
  prompt_tokens.clear();

  // Print a startup banner to stderr describing the effective configuration.
  {
    fprintf(stderr, "\n");
    if (!whisper_is_multilingual(ctx)) {
      // English-only models cannot translate or use another language; fall
      // back silently but warn.
      if (params.language != "en" || params.translate) {
        params.language = "en";
        params.translate = false;
        fprintf(stderr,
                "%s: WARNING: model is not multilingual, ignoring language and "
                "translation options\n",
                __func__);
      }
    }
    fprintf(
        stderr,
        "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = "
        "%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
        __func__, n_samples_step, float(n_samples_step) / WHISPER_SAMPLE_RATE,
        float(n_samples_len) / WHISPER_SAMPLE_RATE,
        float(n_samples_keep) / WHISPER_SAMPLE_RATE, params.n_threads,
        params.language.c_str(), params.translate ? "translate" : "transcribe",
        params.no_timestamps ? 0 : 1);

    if (!use_vad) {
      fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__,
              n_new_line, params.no_context);
    } else {
      fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n",
              __func__);
    }
    fprintf(stderr, "\n");
  }

  n_iter = 0;
  return true;
}
|
||||
|
||||
// Appends one chunk of 16 kHz mono float PCM to the internal buffer and, once
// enough audio has accumulated (one step in step mode, or detected speech in
// VAD mode), runs whisper inference and returns the transcribed text.
// Returns a value-initialized (empty, non-final) result when there is not yet
// enough audio, no speech is detected, or inference fails.
TranscriptionResult WhisperStream::process(const std::vector<float> &pcmf32_chunk) {

  // NOTE(review): t_start is reset to "now" on EVERY call, so the t0/t1
  // values printed in the VAD header below are always ~0. The stream example
  // sets t_start once at startup — consider initializing it in init() instead.
  t_last = std::chrono::high_resolution_clock::now();
  t_start = t_last;
  // append incoming samples
  pcmf32_new.insert(pcmf32_new.end(), pcmf32_chunk.begin(), pcmf32_chunk.end());

  // Not VAD mode: require at least one step worth of samples
  if (!use_vad) {
    if ((int)pcmf32_new.size() < n_samples_step) {
      return TranscriptionResult(); // not enough samples yet
    }

    const int n_samples_new = (int)pcmf32_new.size();

    // take up to params.length_ms audio from previous iteration
    const int n_samples_take =
        std::min((int)pcmf32_old.size(),
                 std::max(0, n_samples_keep + n_samples_len - n_samples_new));

    pcmf32.resize(n_samples_new + n_samples_take);

    // copy tail of old
    for (int i = 0; i < n_samples_take; ++i) {
      pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
    }

    // copy new samples
    memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(),
           n_samples_new * sizeof(float));

    // consume new buffer for next iteration
    pcmf32_old = pcmf32;
    pcmf32_new.clear();

  } else {
    const auto t_now = std::chrono::high_resolution_clock::now();
    // VAD mode: require at least 2 seconds of audio (example); caller can tune
    if ((int)pcmf32_new.size() < 2 * WHISPER_SAMPLE_RATE) {
      return TranscriptionResult();
    }

    // Energy/frequency-based VAD over the buffered audio (last 1000 ms).
    if (!::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold,
                      params.freq_thold, false)) {
      pcmf32_new.clear();
      return TranscriptionResult(); // no speech detected
    }

    // take last length_ms worth of samples
    const int take = std::min((int)pcmf32_new.size(), n_samples_len);
    pcmf32.assign(pcmf32_new.end() - take, pcmf32_new.end());
    pcmf32_new.clear();
    t_last = t_now;
  }

  // run the inference
  whisper_full_params wparams = whisper_full_default_params(
      params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH
                           : WHISPER_SAMPLING_GREEDY);

  wparams.print_progress = false;
  wparams.print_special = params.print_special;
  wparams.print_realtime = false;
  wparams.print_timestamps = !params.no_timestamps;
  wparams.translate = params.translate;
  // Step mode decodes one rolling segment; VAD mode may yield several.
  wparams.single_segment = !use_vad;
  wparams.max_tokens = params.max_tokens;
  wparams.language = params.language.c_str();
  wparams.n_threads = params.n_threads;
  wparams.beam_search.beam_size = params.beam_size;

  wparams.audio_ctx = params.audio_ctx;
  wparams.tdrz_enable = params.tinydiarize;
  // Disabling the temperature increment disables decoder fallbacks.
  wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;

  // Feed tokens from the previous iteration as the prompt (keeps context
  // across windows) unless no_context is set or there are none yet.
  wparams.prompt_tokens =
      params.no_context
          ? nullptr
          : (prompt_tokens.empty() ? nullptr : prompt_tokens.data());
  wparams.prompt_n_tokens = params.no_context ? 0 : (int)prompt_tokens.size();

  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
    fprintf(stderr, "%s: failed to process audio\n", __func__);
    return TranscriptionResult();
  }

  // Build result as structured segments (we return a simple concatenated string
  // here; you can change it to JSON or an array of structs for the JS wrapper)
  std::string plain;
  if (use_vad) {
    // duration.count() is divided by 1e6 assuming a nanosecond tick;
    // high_resolution_clock's period is implementation-defined — TODO confirm.
    const int64_t t1 = (t_last - t_start).count() / 1000000;
    const int64_t t0 =
        std::max(0.0, t1 - pcmf32.size() * 1000.0 / WHISPER_SAMPLE_RATE);

    plain += "\n";
    plain += "### Transcription " + std::to_string(n_iter) +
             " START | t0 = " + std::to_string(t0) +
             " ms | t1 = " + std::to_string(t1) + " ms\n";
    plain += "\n";
  }

  const int n_segments = whisper_full_n_segments(ctx);
  for (int i = 0; i < n_segments; ++i) {
    const char *text = whisper_full_get_segment_text(ctx, i);

    if (params.no_timestamps) {
      plain += text;
    } else {
      // Segment times are in whisper's native units (not ms); to_timestamp
      // formats them for display.
      const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
      const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
      // append in safe steps to avoid operator precedence issues
      plain += "[";
      plain += to_timestamp(t0, false);
      plain += " --> ";
      plain += to_timestamp(t1, false);
      plain += "] ";
      plain += text;
      if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
        plain += " [SPEAKER_TURN]";
      }
      plain += "\n";
    }
  }

  if (use_vad) {
    plain += "\n";
    // NOTE(review): "n_iter" here is a literal string, unlike the START
    // banner above — presumably std::to_string(n_iter) was intended.
    plain += "### Transcription n_iter END\n";
  }

  ++n_iter;

  // In step mode, every n_new_line-th iteration commits the line: keep only
  // the trailing keep_ms of audio and refresh the prompt tokens.
  bool will_commit = false;
  if (!use_vad && (n_iter % n_new_line) == 0) {
    plain += "\n";
    will_commit = true;
    // guard slicing: ensure pcmf32 has enough samples
    if ((int)pcmf32.size() >= n_samples_keep && n_samples_keep > 0) {
      pcmf32_old.assign(pcmf32.end() - n_samples_keep, pcmf32.end());
    } else {
      pcmf32_old = pcmf32;
    }

    // update prompt tokens safely
    if (!params.no_context) {
      prompt_tokens.clear();
      const int n_segments_after = whisper_full_n_segments(ctx);
      for (int si = 0; si < n_segments_after; ++si) {
        const int token_count = whisper_full_n_tokens(ctx, si);
        for (int ti = 0; ti < token_count; ++ti) {
          prompt_tokens.push_back(whisper_full_get_token_id(ctx, si, ti));
        }
      }
    }
  }

  // VAD results are always final; step-mode results are final on commit.
  const bool is_final = use_vad || will_commit;

  return TranscriptionResult{ plain, is_final};
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
// whisper-stream.h
|
||||
#pragma once
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "whisper.h"
|
||||
#include <thread>
|
||||
|
||||
// Configuration for the streaming transcriber; mirrors the CLI options of the
// whisper.cpp `stream` example. Every field has a usable default except
// `model`, which must be set to a ggml model path before init().
struct StreamParams {
  int n_threads = std::min(4, (int)std::thread::hardware_concurrency());
  int step_ms = 3000;        // audio step between partial results; a value <= 0 selects VAD mode in init()
  int length_ms = 10000;     // sliding analysis-window length
  int keep_ms = 200;         // audio overlap carried into the next window (clamped to step_ms)
  int max_tokens = 32;       // per-chunk decoded-token cap
  int audio_ctx = 0;         // audio context size passed to whisper (0 = default)
  int beam_size = -1;        // > 1 enables beam-search sampling; otherwise greedy
  float vad_thold = 0.6f;    // VAD energy threshold
  float freq_thold = 100.0f; // high-pass cutoff used by the VAD
  bool translate = false;    // translate to English instead of transcribing
  bool no_fallback = false;  // disable temperature fallback during decoding
  bool print_special = false; // include special tokens in output
  bool no_context = true;    // do not feed previous tokens as the next prompt
  bool no_timestamps = false; // overwritten in init() based on VAD mode
  bool tinydiarize = false;  // enable tinydiarize speaker-turn markers
  bool save_audio = false;   // NOTE(review): not referenced by WhisperStream in the visible code
  bool use_gpu = true;
  bool flash_attn = false;
  std::string language = "en"; // "auto" enables language detection
  std::string model;           // path to the ggml model file (required)
};
|
||||
|
||||
// Result of one WhisperStream::process() call.
struct TranscriptionResult {
  std::string text; // concatenated segment text; empty when nothing was transcribed
  bool final;       // true when `text` is committed and the caller should start a new line
                    // NOTE(review): no default initializer (adding one would make this a
                    // non-aggregate under the project's C++11, breaking the brace-init in
                    // process()); safe only because call sites value-initialize via
                    // TranscriptionResult() — a plain `TranscriptionResult r;` would
                    // leave this indeterminate.
};
|
||||
|
||||
// Incremental transcription engine adapted from the whisper.cpp `stream`
// example: callers push PCM chunks via process() and receive partial or
// committed text back.
class WhisperStream {
public:
  // Stores the configuration only; call init() before process().
  WhisperStream(const StreamParams &stream_params);
  ~WhisperStream();

  // Loads the model and derives per-window sample counts. Returns true on
  // success; throws std::runtime_error on an unknown language or load failure.
  bool init();
  // Appends `pcmf32_chunk` (16 kHz mono float PCM) and runs inference once
  // enough audio has accumulated; otherwise returns an empty result.
  TranscriptionResult process(const std::vector<float> &pcmf32_chunk);
  void free(); // optional explicit free
               // NOTE(review): declared but not defined in the accompanying
               // whisper-stream.cpp — any caller will fail at link time;
               // implement it or remove the declaration.

private:
  StreamParams params;

  // whisper context
  struct whisper_context *ctx = nullptr;

  // buffers (samples, not bytes)
  std::vector<float> pcmf32; // assembled input for inference
  std::vector<float> pcmf32_new; // appended incoming samples buffer
  std::vector<float> pcmf32_old; // overlap kept for next chunk

  std::vector<whisper_token> prompt_tokens; // tokens fed as the next decode's prompt

  // sample counts and flags
  int n_samples_step = 0;
  int n_samples_len = 0;
  int n_samples_keep = 0;
  int n_samples_30s = 0; // capacity hint for the incoming buffer
  bool use_vad = false;  // set by init(): true when step_ms <= 0
  int n_new_line = 1;    // iterations between committed lines (step mode)
  int n_iter = 0;        // completed inference iterations
  // Timing anchors used for the VAD-mode transcription banner.
  std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
  std::chrono::time_point<std::chrono::high_resolution_clock> t_last;
};
|
||||
Loading…
Reference in New Issue