common : fix server /inference fails to decode in-memory audio (regression) (#3818)

* common: add memory buffer overload of read_audio_data. whisper-server /inference without --convert passed the uploaded file bytes to read_audio_data as a filename, so ma_decoder_init_file tried to open a path starting with "RIFF" and failed. Every request returned HTTP 400 "Invalid request" on builds without WHISPER_FFMPEG, which is the default. Factor the PCM extraction into a shared helper and add an overload that decodes straight from a memory buffer via ma_decoder_init_memory, which the function already used for the stdin path. The server now calls it with the upload content. The filename overload behavior is unchanged.
2026-05-22 08:27:35 +02:00 · 2026-05-22 08:27:35 +02:00 · 0ccd896f5b
parent 8443cf05e3
commit 0ccd896f5b
3 changed files with 57 additions and 33 deletions
--- a/examples/common-whisper.cpp
+++ b/examples/common-whisper.cpp
@ -39,6 +39,42 @@
 extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
 #endif

+// Read every PCM frame from an already-initialized decoder as 32-bit float samples.
+//
+//   decoder - initialized miniaudio decoder; the indexing below expects 2 interleaved channels
+//             when `stereo` is true and 1 channel otherwise
+//   pcmf32  - receives the mono signal (for stereo input: the average of left and right)
+//   pcmf32s - when `stereo` is true, receives the left [0] and right [1] channels;
+//             left untouched otherwise
+//   stereo  - whether the decoder output is interleaved 2-channel audio
+//
+// Returns false, after printing the reason to stderr, when the length cannot be queried or the
+// read fails. The decoder is not uninitialized here; that stays with the caller.
+static bool read_audio_from_decoder(ma_decoder & decoder, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
+    ma_result result;
+    ma_uint64 frame_count = 0;
+    ma_uint64 frames_read = 0;
+
+    if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
+        fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
+        return false;
+    }
+
+    const size_t n_channels = stereo ? 2 : 1;
+    pcmf32.resize(n_channels*frame_count);
+
+    if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
+        fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
+        return false;
+    }
+
+    // the decoder may deliver fewer frames than it reported; drop the zero-filled tail so the
+    // output only contains samples that were actually decoded
+    const size_t n_frames = static_cast<size_t>(frames_read < frame_count ? frames_read : frame_count);
+    pcmf32.resize(n_channels*n_frames);
+
+    if (stereo) {
+        // take over the interleaved samples instead of copying them
+        std::vector<float> interleaved;
+        interleaved.swap(pcmf32);
+
+        pcmf32.resize(n_frames);
+        pcmf32s.resize(2);
+        pcmf32s[0].resize(n_frames);
+        pcmf32s[1].resize(n_frames);
+
+        for (size_t i = 0; i < n_frames; i++) {
+            const float left  = interleaved[2*i];
+            const float right = interleaved[2*i + 1];
+
+            // average rather than sum: a plain sum doubles the amplitude of identical channels
+            // and can push samples outside [-1, 1]
+            pcmf32[i]     = 0.5f*(left + right);
+            pcmf32s[0][i] = left;
+            pcmf32s[1][i] = right;
+        }
+    }
+
+    return true;
+}
+
 bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output

@ -109,41 +145,22 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
 #endif
    }

-    ma_uint64 frame_count;
-    ma_uint64 frames_read;
+    return read_audio_from_decoder(decoder.decoder, pcmf32, pcmf32s, stereo);
+}

-    if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
-		fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
+// decode audio bytes already held in memory
+//   buffer, buffer_size - the encoded audio bytes and their length
+//   pcmf32              - receives the mono f32 samples
+//   pcmf32s             - receives the two separate channels when stereo is true
+//   stereo              - decode as 2 channels instead of 1
+// returns false (after printing an error to stderr) when the decoder cannot be initialized
+// from the buffer or the frames cannot be read
+bool read_audio_data(const char * buffer, size_t buffer_size, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
+    // request f32 output at WHISPER_SAMPLE_RATE, 2 channels when stereo is set, otherwise 1
+    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE);
+    ma_decoder decoder;

-		return false;
+    if (ma_decoder_init_memory(buffer, buffer_size, &decoder_config, &decoder) != MA_SUCCESS) {
+        fprintf(stderr, "error: failed to decode audio data from memory buffer\n");
+        return false;
    }

-    pcmf32.resize(stereo ? frame_count*2 : frame_count);
-
-    if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
-		fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
-
-		return false;
-    }
-
-    if (stereo) {
-        std::vector<float> stereo_data = pcmf32;
-        pcmf32.resize(frame_count);
-
-        for (uint64_t i = 0; i < frame_count; i++) {
-            pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
-        }
-
-        pcmf32s.resize(2);
-        pcmf32s[0].resize(frame_count);
-        pcmf32s[1].resize(frame_count);
-        for (uint64_t i = 0; i < frame_count; i++) {
-            pcmf32s[0][i] = stereo_data[2*i];
-            pcmf32s[1][i] = stereo_data[2*i + 1];
-        }
-    }
-
-    return true;
+    // keep the helper's result so the decoder is released on both the success and failure path
+    bool ok = read_audio_from_decoder(decoder, pcmf32, pcmf32s, stereo);
+    ma_decoder_uninit(&decoder);
+    return ok;
 }

 //  500 -> 00:05.000
--- a/examples/common-whisper.h
+++ b/examples/common-whisper.h
@ -14,6 +14,14 @@ bool read_audio_data(
        std::vector<std::vector<float>> & pcmf32s,
        bool stereo);

+// decode audio bytes already held in memory (uploaded file, network buffer)
+//   buffer, buffer_size - the encoded audio bytes and their length
+//   pcmf32              - receives the mono f32 samples
+//   pcmf32s             - receives the two separate channels when stereo is true
+//   stereo              - decode as 2 channels instead of 1
+// returns false when the buffer cannot be decoded
+bool read_audio_data(
+        const char * buffer,
+        size_t buffer_size,
+        std::vector<float> & pcmf32,
+        std::vector<std::vector<float>> & pcmf32s,
+        bool stereo);
+
 // convert timestamp to string, 6000 -> 01:00.000
 std::string to_timestamp(int64_t t, bool comma = false);

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -868,8 +868,7 @@ int main(int argc, char ** argv) {
            // remove temp file
            std::remove(temp_filename.c_str());
        } else {
-            if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
-            {
+            if (!::read_audio_data(audio_file.content.data(), audio_file.content.size(), pcmf32, pcmf32s, params.diarize)) {
                fprintf(stderr, "error: failed to read audio data\n");
                const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
                res.status = 400;