server : Return speaker information in JSON (#3782)

2026-05-18 09:18:04 +02:00 · 2026-05-18 09:18:04 +02:00 · 6227a0ef73
parent 968eebe772
commit 6227a0ef73
1 changed files with 13 additions and 5 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -315,10 +315,10 @@ std::string generate_temp_filename(const std::string &path, const std::string &p
    return ss.str();
 }

-bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
+bool convert_to_wav(const std::string & temp_filename, std::string & error_resp, bool stereo) {
    std::ostringstream cmd_stream;
    std::string converted_filename_temp = temp_filename + "_temp.wav";
-    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
+    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac " << (stereo ? 2 : 1) << " -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
    std::string cmd = cmd_stream.str();

    int status = std::system(cmd.c_str());
@ -341,7 +341,7 @@ bool convert_to_wav(const std::string & temp_filename, std::string & error_resp)
    return true;
 }

-std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
+std::string estimate_diarization_speaker(const std::vector<std::vector<float>> & pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
    std::string speaker = "";
    const int64_t n_samples = pcmf32s[0].size();

@ -451,7 +451,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
    }
 }

-std::string output_str(struct whisper_context * ctx, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+std::string output_str(struct whisper_context * ctx, const whisper_params & params, const std::vector<std::vector<float>> & pcmf32s) {
    std::stringstream result;
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
@ -848,7 +848,7 @@ int main(int argc, char ** argv) {
            temp_file.close();

            std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
-            const bool is_converted = convert_to_wav(temp_filename, error_resp);
+            const bool is_converted = convert_to_wav(temp_filename, error_resp, params.diarize);
            if (!is_converted) {
                res.status = 500;
                res.set_content(error_resp, "application/json");
@ -1091,6 +1091,14 @@ int main(int argc, char ** argv) {
                    segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
                }

+                if (params.diarize && pcmf32s.size() == 2) {
+                    segment["speaker"] = estimate_diarization_speaker(
+                        pcmf32s,
+                        whisper_full_get_segment_t0(ctx, i),
+                        whisper_full_get_segment_t1(ctx, i),
+                        true);
+                }
+
                float total_logprob = 0;
                const int n_tokens = whisper_full_n_tokens(ctx, i);
                for (int j = 0; j < n_tokens; ++j) {