From 0ccd896f5b882628e1c077f9769735ef4ce52860 Mon Sep 17 00:00:00 2001 From: Pascal Date: Fri, 22 May 2026 08:27:35 +0200 Subject: [PATCH] common : fix server /inference fails to decode in-memory audio (regression) (#3818) * common: add memory buffer overload of read_audio_data whisper-server /inference without --convert passed the uploaded file bytes to read_audio_data as a filename, so ma_decoder_init_file tried to open a path starting with "RIFF" and failed. every request returned HTTP 400 "Invalid request" on builds without WHISPER_FFMPEG, which is the default. factor the PCM extraction into a shared helper and add an overload that decodes straight from a memory buffer via ma_decoder_init_memory, which the function already used for the stdin path. server now calls it with the upload content. the filename overload behavior is unchanged. --- examples/common-whisper.cpp | 79 ++++++++++++++++++++++--------------- examples/common-whisper.h | 8 ++++ examples/server/server.cpp | 3 +- 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/examples/common-whisper.cpp b/examples/common-whisper.cpp index 977527a0c..d29166b50 100644 --- a/examples/common-whisper.cpp +++ b/examples/common-whisper.cpp @@ -39,6 +39,42 @@ extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector & wav_data); #endif +// extract f32 PCM frames from an initialized decoder, downmix to mono and keep the stereo split +static bool read_audio_from_decoder(ma_decoder & decoder, std::vector & pcmf32, std::vector> & pcmf32s, bool stereo) { + ma_result result; + ma_uint64 frame_count; + ma_uint64 frames_read; + + if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) { + fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result)); + return false; + } + + pcmf32.resize(stereo ? frame_count*2 : frame_count); + + if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) { + fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result)); + return false; + } + + if (stereo) { + std::vector stereo_data = pcmf32; + pcmf32.resize(frame_count); + for (uint64_t i = 0; i < frame_count; i++) { + pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]); + } + pcmf32s.resize(2); + pcmf32s[0].resize(frame_count); + pcmf32s[1].resize(frame_count); + for (uint64_t i = 0; i < frame_count; i++) { + pcmf32s[0][i] = stereo_data[2*i]; + pcmf32s[1][i] = stereo_data[2*i + 1]; + } + } + + return true; +} + bool read_audio_data(const std::string & fname, std::vector& pcmf32, std::vector>& pcmf32s, bool stereo) { std::vector audio_data; // used for pipe input from stdin or ffmpeg decoding output @@ -109,41 +145,22 @@ bool read_audio_data(const std::string & fname, std::vector& pcmf32, std: #endif } - ma_uint64 frame_count; - ma_uint64 frames_read; + return read_audio_from_decoder(decoder.decoder, pcmf32, pcmf32s, stereo); +} - if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) { - fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result)); +// decode audio bytes already held in memory +bool read_audio_data(const char * buffer, size_t buffer_size, std::vector & pcmf32, std::vector> & pcmf32s, bool stereo) { + ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE); + ma_decoder decoder; - return false; + if (ma_decoder_init_memory(buffer, buffer_size, &decoder_config, &decoder) != MA_SUCCESS) { + fprintf(stderr, "error: failed to decode audio data from memory buffer\n"); + return false; } - pcmf32.resize(stereo ? frame_count*2 : frame_count); - - if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) { - fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result)); - - return false; - } - - if (stereo) { - std::vector stereo_data = pcmf32; - pcmf32.resize(frame_count); - - for (uint64_t i = 0; i < frame_count; i++) { - pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]); - } - - pcmf32s.resize(2); - pcmf32s[0].resize(frame_count); - pcmf32s[1].resize(frame_count); - for (uint64_t i = 0; i < frame_count; i++) { - pcmf32s[0][i] = stereo_data[2*i]; - pcmf32s[1][i] = stereo_data[2*i + 1]; - } - } - - return true; + bool ok = read_audio_from_decoder(decoder, pcmf32, pcmf32s, stereo); + ma_decoder_uninit(&decoder); + return ok; } // 500 -> 00:05.000 diff --git a/examples/common-whisper.h b/examples/common-whisper.h index 413436215..8714c3810 100644 --- a/examples/common-whisper.h +++ b/examples/common-whisper.h @@ -14,6 +14,14 @@ bool read_audio_data( std::vector> & pcmf32s, bool stereo); +// decode audio bytes already held in memory (uploaded file, network buffer) +bool read_audio_data( + const char * buffer, + size_t buffer_size, + std::vector & pcmf32, + std::vector> & pcmf32s, + bool stereo); + // convert timestamp to string, 6000 -> 01:00.000 std::string to_timestamp(int64_t t, bool comma = false); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 590378b72..aae74c3d8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -868,8 +868,7 @@ int main(int argc, char ** argv) { // remove temp file std::remove(temp_filename.c_str()); } else { - if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize)) - { + if (!::read_audio_data(audio_file.content.data(), audio_file.content.size(), pcmf32, pcmf32s, params.diarize)) { fprintf(stderr, "error: failed to read audio data\n"); const std::string error_resp = "{\"error\":\"failed to read audio data\"}"; res.status = 400;