server : Return speaker information in JSON (#3782)

This commit is contained in:
Andreas Lubbe 2026-05-18 09:18:04 +02:00 committed by GitHub
parent 968eebe772
commit 6227a0ef73
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 13 additions and 5 deletions

View File

@ -315,10 +315,10 @@ std::string generate_temp_filename(const std::string &path, const std::string &p
return ss.str();
}
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp, bool stereo) {
std::ostringstream cmd_stream;
std::string converted_filename_temp = temp_filename + "_temp.wav";
cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac " << (stereo ? 2 : 1) << " -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
std::string cmd = cmd_stream.str();
int status = std::system(cmd.c_str());
@ -341,7 +341,7 @@ bool convert_to_wav(const std::string & temp_filename, std::string & error_resp)
return true;
}
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
std::string estimate_diarization_speaker(const std::vector<std::vector<float>> & pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
std::string speaker = "";
const int64_t n_samples = pcmf32s[0].size();
@ -451,7 +451,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
}
}
std::string output_str(struct whisper_context * ctx, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::string output_str(struct whisper_context * ctx, const whisper_params & params, const std::vector<std::vector<float>> & pcmf32s) {
std::stringstream result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
@ -848,7 +848,7 @@ int main(int argc, char ** argv) {
temp_file.close();
std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
const bool is_converted = convert_to_wav(temp_filename, error_resp);
const bool is_converted = convert_to_wav(temp_filename, error_resp, params.diarize);
if (!is_converted) {
res.status = 500;
res.set_content(error_resp, "application/json");
@ -1091,6 +1091,14 @@ int main(int argc, char ** argv) {
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
}
if (params.diarize && pcmf32s.size() == 2) {
segment["speaker"] = estimate_diarization_speaker(
pcmf32s,
whisper_full_get_segment_t0(ctx, i),
whisper_full_get_segment_t1(ctx, i),
true);
}
float total_logprob = 0;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {