feat(cli): allow ffmpeg decoding from stdin

This commit is contained in:
Marco Manino 2026-02-02 15:44:59 +01:00
parent 47af2fb70f
commit 8a73e99137
No known key found for this signature in database
GPG Key ID: 75EF7986F8678F74
2 changed files with 35 additions and 26 deletions

View File

@ -35,8 +35,9 @@
#include <fstream>
#ifdef WHISPER_FFMPEG
// as implemented in ffmpeg_trancode.cpp only embedded in common lib if whisper built with ffmpeg support
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
// Implemented in ffmpeg-transcode.cpp, which is only embedded in the common lib if whisper is built with ffmpeg support.
extern bool ffmpeg_decode_audio(uint8_t * idata, size_t isize, std::vector<uint8_t> & wav_data);
extern bool ffmpeg_decode_audio_file(const std::string & ifname, std::vector<uint8_t> & wav_data);
#endif
bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
@ -62,7 +63,13 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
}
audio_data.insert(audio_data.end(), buf, buf + n);
}
#if defined(WHISPER_FFMPEG)
if (ffmpeg_decode_audio(audio_data.data(), audio_data.size(), audio_data) != 0) {
fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str());
return false;
}
#endif
if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result));
@ -74,7 +81,7 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
}
else if (((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS)) {
#if defined(WHISPER_FFMPEG)
if (ffmpeg_decode_audio(fname, audio_data) != 0) {
if (ffmpeg_decode_audio_file(fname, audio_data) != 0) {
fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str());
return false;

View File

@ -187,9 +187,13 @@ static bool is_audio_stream(const AVStream *stream)
// idata: input memory (encoded audio)
// isize: size of the input memory
// wav_data: decoded output audio data (in-memory wav file: header followed by samples)
static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
int ffmpeg_decode_audio(u8 *idata, size_t isize, std::vector<uint8_t> &wav_data)
{
LOG("decode_audio: input size: %d\n", audio_buf->size);
struct audio_buffer inaudio_buf;
inaudio_buf.ptr = idata;
inaudio_buf.size = isize;
LOG("ffmpeg_decode_audio: input size: %d\n", inaudio_buf.size);
AVFormatContext *fmt_ctx;
AVIOContext *avio_ctx;
AVStream *stream;
@ -207,7 +211,7 @@ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
fmt_ctx = avformat_alloc_context();
avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);
avio_ctx = avio_alloc_context(avio_ctx_buffer, AVIO_CTX_BUF_SZ, 0, audio_buf, &read_packet, NULL, NULL);
avio_ctx = avio_alloc_context(avio_ctx_buffer, AVIO_CTX_BUF_SZ, 0, &inaudio_buf, &read_packet, NULL, NULL);
fmt_ctx->pb = avio_ctx;
// open the input stream and read header
@ -291,8 +295,8 @@ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
}
/* iterate through frames */
*data = NULL;
*size = 0;
s16 *odata = NULL;
int osize = 0;
while (av_read_frame(fmt_ctx, packet) >= 0) {
avcodec_send_packet(codec, packet);
@ -300,10 +304,10 @@ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
if (err == AVERROR(EAGAIN))
continue;
convert_frame(swr, codec, frame, data, size, false);
convert_frame(swr, codec, frame, &odata, &osize, false);
}
/* Flush any remaining conversion buffers... */
convert_frame(swr, codec, frame, data, size, true);
convert_frame(swr, codec, frame, &odata, &osize, true);
av_packet_free(&packet);
av_frame_free(&frame);
@ -318,6 +322,17 @@ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
av_freep(&avio_ctx);
}
wave_hdr wh;
const size_t outdatasize = osize * sizeof(s16);
set_wave_hdr(wh, outdatasize);
wav_data.resize(sizeof(wave_hdr) + outdatasize);
// header:
memcpy(wav_data.data(), &wh, sizeof(wave_hdr));
// the data:
memcpy(wav_data.data() + sizeof(wave_hdr), odata, osize* sizeof(s16));
free(odata);
return 0;
}
@ -325,8 +340,8 @@ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
// ifname: input file path
// owav_data: in-memory wav file. Can be forwarded as is to whisper/drwav
// return 0 on success
int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_data) {
LOG("ffmpeg_decode_audio: %s\n", ifname.c_str());
int ffmpeg_decode_audio_file(const std::string &ifname, std::vector<uint8_t>& owav_data) {
LOG("ffmpeg_decode_audio_file: %s\n", ifname.c_str());
int ifd = open(ifname.c_str(), O_RDONLY);
if (ifd == -1) {
fprintf(stderr, "Couldn't open input file %s\n", ifname.c_str());
@ -340,14 +355,10 @@ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_da
return err;
}
LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size);
struct audio_buffer inaudio_buf;
inaudio_buf.ptr = ibuf;
inaudio_buf.size = ibuf_size;
s16 *odata=NULL;
int osize=0;
err = decode_audio(&inaudio_buf, &odata, &osize);
err = ffmpeg_decode_audio(ibuf, ibuf_size, owav_data);
LOG("decode_audio returned %d \n", err);
if (err != 0) {
LOG("decode_audio failed\n");
@ -355,14 +366,5 @@ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_da
}
LOG("decode_audio output size: %d\n", osize);
wave_hdr wh;
const size_t outdatasize = osize * sizeof(s16);
set_wave_hdr(wh, outdatasize);
owav_data.resize(sizeof(wave_hdr) + outdatasize);
// header:
memcpy(owav_data.data(), &wh, sizeof(wave_hdr));
// the data:
memcpy(owav_data.data() + sizeof(wave_hdr), odata, osize* sizeof(s16));
return 0;
}