Add Silero VAD support, remove --vad-mode
This commit is contained in:
parent
5bbfb82f4a
commit
2757fd7665
|
|
@ -62,6 +62,8 @@ struct whisper_params {
|
|||
int32_t vad_probe_ms = 200;
|
||||
int32_t vad_silence_ms = 800;
|
||||
int32_t vad_pre_roll_ms = 300;
|
||||
|
||||
std::string vad_model = ""; // path to silero .bin model
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
|
|
@ -108,6 +110,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
|||
else if (arg == "--vad-probe-ms") { params.vad_probe_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "--vad-silence-ms") { params.vad_silence_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "--vad-pre-roll-ms") { params.vad_pre_roll_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
|
||||
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
|
|
@ -154,6 +157,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|||
fprintf(stderr, " --vad-probe-ms N [%-5d] VAD probe chunk size (ms)\n", params.vad_probe_ms);
|
||||
fprintf(stderr, " --vad-silence-ms N [%-5d] silence duration to end segment (ms)\n", params.vad_silence_ms);
|
||||
fprintf(stderr, " --vad-pre-roll-ms N[%-5d] audio prepended before VAD trigger (ms)\n", params.vad_pre_roll_ms);
|
||||
fprintf(stderr, " -vm PATH, --vad-model PATH [%-5s] path to Silero VAD model (enables silero VAD)\n", params.vad_model.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
|
|
@ -519,6 +523,20 @@ int main(int argc, char ** argv) {
|
|||
return 2;
|
||||
}
|
||||
|
||||
// init Silero VAD context (if requested)
|
||||
struct whisper_vad_context * vad_ctx = nullptr;
|
||||
if (use_vad && !params.vad_model.empty()) {
|
||||
struct whisper_vad_context_params vparams = whisper_vad_default_context_params();
|
||||
vparams.n_threads = params.n_threads;
|
||||
vparams.use_gpu = false; // Silero VAD is tiny, always runs on CPU
|
||||
vad_ctx = whisper_vad_init_from_file_with_params(params.vad_model.c_str(), vparams);
|
||||
if (!vad_ctx) {
|
||||
fprintf(stderr, "error: failed to init VAD model '%s'\n", params.vad_model.c_str());
|
||||
whisper_free(ctx);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
|
||||
std::vector<float> pcmf32_old;
|
||||
std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
|
||||
|
|
@ -549,7 +567,7 @@ int main(int argc, char ** argv) {
|
|||
if (!use_vad) {
|
||||
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
|
||||
} else {
|
||||
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
|
||||
fprintf(stderr, "%s: using VAD (%s), will transcribe on speech activity\n", __func__, vad_ctx ? "silero" : "simple");
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
|
@ -806,8 +824,28 @@ int main(int argc, char ** argv) {
|
|||
debug_log("debug: save_audio wrote %zu samples (probe)\n", pcmf32_new.size());
|
||||
}
|
||||
|
||||
const bool silence = ::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, vad_last_ms, params.vad_thold, params.freq_thold, false);
|
||||
debug_log("debug: vad silence=%d (last_ms=%d)\n", silence ? 1 : 0, vad_last_ms);
|
||||
bool silence;
|
||||
if (vad_ctx) {
|
||||
// Silero VAD: run model on probe chunk, average probs
|
||||
if (whisper_vad_detect_speech(vad_ctx, pcmf32_new.data(), pcmf32_new.size())) {
|
||||
const int n_probs = whisper_vad_n_probs(vad_ctx);
|
||||
const float * probs = whisper_vad_probs(vad_ctx);
|
||||
float avg = 0.0f;
|
||||
for (int i = 0; i < n_probs; ++i) {
|
||||
avg += probs[i];
|
||||
}
|
||||
avg = (n_probs > 0) ? avg / n_probs : 0.0f;
|
||||
silence = (avg < params.vad_thold);
|
||||
debug_log("debug: silero vad avg=%.3f thold=%.2f silence=%d\n", avg, params.vad_thold, silence ? 1 : 0);
|
||||
} else {
|
||||
// detect_speech failed — treat as silence to avoid false triggers
|
||||
silence = true;
|
||||
debug_log("debug: silero vad detect_speech failed, treating as silence\n");
|
||||
}
|
||||
} else {
|
||||
silence = ::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, vad_last_ms, params.vad_thold, params.freq_thold, false);
|
||||
debug_log("debug: vad silence=%d (last_ms=%d)\n", silence ? 1 : 0, vad_last_ms);
|
||||
}
|
||||
|
||||
if (!in_speech) {
|
||||
if (!silence) {
|
||||
|
|
@ -855,5 +893,9 @@ int main(int argc, char ** argv) {
|
|||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
if (vad_ctx) {
|
||||
whisper_vad_free(vad_ctx);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue