common : fix server /inference fails to decode in-memory audio (regression) (#3818)
* common: add memory buffer overload of read_audio_data whisper-server /inference without --convert passed the uploaded file bytes to read_audio_data as a filename, so ma_decoder_init_file tried to open a path starting with "RIFF" and failed. every request returned HTTP 400 "Invalid request" on builds without WHISPER_FFMPEG, which is the default. factor the PCM extraction into a shared helper and add an overload that decodes straight from a memory buffer via ma_decoder_init_memory, which the function already used for the stdin path. server now calls it with the upload content. the filename overload behavior is unchanged.
This commit is contained in:
parent
8443cf05e3
commit
0ccd896f5b
|
|
@ -39,6 +39,42 @@
|
|||
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
|
||||
#endif
|
||||
|
||||
// extract f32 PCM frames from an initialized decoder, downmix to mono and keep the stereo split
|
||||
static bool read_audio_from_decoder(ma_decoder & decoder, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
|
||||
ma_result result;
|
||||
ma_uint64 frame_count;
|
||||
ma_uint64 frames_read;
|
||||
|
||||
if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
|
||||
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
pcmf32.resize(stereo ? frame_count*2 : frame_count);
|
||||
|
||||
if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
|
||||
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (stereo) {
|
||||
std::vector<float> stereo_data = pcmf32;
|
||||
pcmf32.resize(frame_count);
|
||||
for (uint64_t i = 0; i < frame_count; i++) {
|
||||
pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
|
||||
}
|
||||
pcmf32s.resize(2);
|
||||
pcmf32s[0].resize(frame_count);
|
||||
pcmf32s[1].resize(frame_count);
|
||||
for (uint64_t i = 0; i < frame_count; i++) {
|
||||
pcmf32s[0][i] = stereo_data[2*i];
|
||||
pcmf32s[1][i] = stereo_data[2*i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
|
||||
std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output
|
||||
|
||||
|
|
@ -109,41 +145,22 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
|
|||
#endif
|
||||
}
|
||||
|
||||
ma_uint64 frame_count;
|
||||
ma_uint64 frames_read;
|
||||
return read_audio_from_decoder(decoder.decoder, pcmf32, pcmf32s, stereo);
|
||||
}
|
||||
|
||||
if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
|
||||
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
|
||||
// decode audio bytes already held in memory
|
||||
bool read_audio_data(const char * buffer, size_t buffer_size, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
|
||||
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE);
|
||||
ma_decoder decoder;
|
||||
|
||||
return false;
|
||||
if (ma_decoder_init_memory(buffer, buffer_size, &decoder_config, &decoder) != MA_SUCCESS) {
|
||||
fprintf(stderr, "error: failed to decode audio data from memory buffer\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
pcmf32.resize(stereo ? frame_count*2 : frame_count);
|
||||
|
||||
if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
|
||||
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
if (stereo) {
|
||||
std::vector<float> stereo_data = pcmf32;
|
||||
pcmf32.resize(frame_count);
|
||||
|
||||
for (uint64_t i = 0; i < frame_count; i++) {
|
||||
pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
|
||||
}
|
||||
|
||||
pcmf32s.resize(2);
|
||||
pcmf32s[0].resize(frame_count);
|
||||
pcmf32s[1].resize(frame_count);
|
||||
for (uint64_t i = 0; i < frame_count; i++) {
|
||||
pcmf32s[0][i] = stereo_data[2*i];
|
||||
pcmf32s[1][i] = stereo_data[2*i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
bool ok = read_audio_from_decoder(decoder, pcmf32, pcmf32s, stereo);
|
||||
ma_decoder_uninit(&decoder);
|
||||
return ok;
|
||||
}
|
||||
|
||||
// 500 -> 00:05.000
|
||||
|
|
|
|||
|
|
@ -14,6 +14,14 @@ bool read_audio_data(
|
|||
std::vector<std::vector<float>> & pcmf32s,
|
||||
bool stereo);
|
||||
|
||||
// decode audio bytes already held in memory (uploaded file, network buffer)
|
||||
bool read_audio_data(
|
||||
const char * buffer,
|
||||
size_t buffer_size,
|
||||
std::vector<float> & pcmf32,
|
||||
std::vector<std::vector<float>> & pcmf32s,
|
||||
bool stereo);
|
||||
|
||||
// convert timestamp to string, 6000 -> 01:00.000
|
||||
std::string to_timestamp(int64_t t, bool comma = false);
|
||||
|
||||
|
|
|
|||
|
|
@ -868,8 +868,7 @@ int main(int argc, char ** argv) {
|
|||
// remove temp file
|
||||
std::remove(temp_filename.c_str());
|
||||
} else {
|
||||
if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
|
||||
{
|
||||
if (!::read_audio_data(audio_file.content.data(), audio_file.content.size(), pcmf32, pcmf32s, params.diarize)) {
|
||||
fprintf(stderr, "error: failed to read audio data\n");
|
||||
const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
|
||||
res.status = 400;
|
||||
|
|
|
|||
Loading…
Reference in New Issue