diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index afc95176e..590378b72 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -315,10 +315,10 @@ std::string generate_temp_filename(const std::string &path, const std::string &p
     return ss.str();
 }
 
-bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
+bool convert_to_wav(const std::string & temp_filename, std::string & error_resp, bool stereo) {
     std::ostringstream cmd_stream;
     std::string converted_filename_temp = temp_filename + "_temp.wav";
-    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
+    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac " << (stereo ? 2 : 1) << " -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
     std::string cmd = cmd_stream.str();
 
     int status = std::system(cmd.c_str());
@@ -341,7 +341,7 @@ bool convert_to_wav(const std::string & temp_filename, std::string & error_resp)
     return true;
 }
 
-std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
+std::string estimate_diarization_speaker(const std::vector<std::vector<float>> & pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
     std::string speaker = "";
     const int64_t n_samples = pcmf32s[0].size();
 
@@ -451,7 +451,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
     }
 }
 
-std::string output_str(struct whisper_context * ctx, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+std::string output_str(struct whisper_context * ctx, const whisper_params & params, const std::vector<std::vector<float>> & pcmf32s) {
     std::stringstream result;
     const int n_segments = whisper_full_n_segments(ctx);
     for (int i = 0; i < n_segments; ++i) {
@@ -848,7 +848,7 @@ int main(int argc, char ** argv) {
             temp_file.close();
 
             std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
-            const bool is_converted = convert_to_wav(temp_filename, error_resp);
+            const bool is_converted = convert_to_wav(temp_filename, error_resp, params.diarize);
             if (!is_converted) {
                 res.status = 500;
                 res.set_content(error_resp, "application/json");
@@ -1091,6 +1091,14 @@ int main(int argc, char ** argv) {
                         segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
                     }
 
+                    if (params.diarize && pcmf32s.size() == 2) {
+                        segment["speaker"] = estimate_diarization_speaker(
+                                pcmf32s,
+                                whisper_full_get_segment_t0(ctx, i),
+                                whisper_full_get_segment_t1(ctx, i),
+                                true);
+                    }
+
                     float total_logprob = 0;
                     const int n_tokens = whisper_full_n_tokens(ctx, i);
                     for (int j = 0; j < n_tokens; ++j) {