From f39cc7128295ff5c67bbedb73161bed549f96e96 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 31 May 2026 15:44:07 +0300 Subject: [PATCH] common : re-implement `ffmpeg-transcode.cpp` + clarify ffmpeg usage (#3846) * examples : remove ffmpeg-transcode.cpp * examples : implement ffmpeg-transcode.cpp Assisted-by: llama.cpp:local pi * common : switch from WHISPER_FFMPEG -> WHISPER_COMMON_FFMPEG --- CMakeLists.txt | 3 +- README.md | 7 +- examples/CMakeLists.txt | 4 +- examples/common-whisper.cpp | 80 +++-- examples/ffmpeg-transcode.cpp | 565 +++++++++++++--------------------- tests/CMakeLists.txt | 2 +- 6 files changed, 275 insertions(+), 386 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2200673d0..35c867472 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,7 +85,7 @@ option(WHISPER_CURL "whisper: use libcurl to download model from an URL" OFF) option(WHISPER_SDL2 "whisper: support for libSDL2" OFF) if (CMAKE_SYSTEM_NAME MATCHES "Linux") - option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF) + option(WHISPER_COMMON_FFMPEG "whisper: examples link with ffmpeg libs in order to decode more audio formats" OFF) endif() option(WHISPER_COREML "whisper: enable Core ML framework" OFF) @@ -121,6 +121,7 @@ whisper_option_depr(WARNING WHISPER_RPC GGML_RPC) whisper_option_depr(WARNING WHISPER_SYCL GGML_SYCL) whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16) whisper_option_depr(WARNING WHISPER_CCACHE GGML_CCACHE) +whisper_option_depr(WARNING WHISPER_FFMPEG WHISPER_COMMON_FFMPEG) if (GGML_CUDA AND NOT MSVC) #GGML_CUDA enabled, add the necessary compile options -Wno-deprecated-gpu-targets diff --git a/README.md b/README.md index 050a35be2..d1680e99b 100644 --- a/README.md +++ b/README.md @@ -425,9 +425,10 @@ cmake -B build -DGGML_MUSA=1 -DMUSA_ARCHITECTURES="21" cmake --build build -j --config Release ``` -## FFmpeg support (Linux only) +## FFmpeg support (examples only) -If you want to 
support more audio formats (such as Opus and AAC), you can turn on the `WHISPER_FFMPEG` build flag to enable FFmpeg integration. +By default, the examples in this repo use the [miniaudio](https://github.com/mackron/miniaudio) library to decode audio files. +Some of the examples can also use FFmpeg for decoding and broader format support. To enable that, build with `WHISPER_COMMON_FFMPEG`. First, you need to install required libraries: @@ -442,7 +443,7 @@ sudo dnf install libavcodec-free-devel libavformat-free-devel libavutil-free-dev Then you can build the project as follows: ```bash -cmake -B build -D WHISPER_FFMPEG=yes +cmake -B build -D WHISPER_COMMON_FFMPEG=yes cmake --build build ``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b202ca00b..0bb54cec4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,7 +20,7 @@ set(TARGET common) unset(COMMON_EXTRA_LIBS) -if (WHISPER_FFMPEG) +if (WHISPER_COMMON_FFMPEG) # As of cmake 3.27, there is no official cmake support for FindFFmpeg. 
# Consequnelty we added a FindFFmpeg.cmake script the cmake subfolder: # whisper.cpp does not need the full ffmpeg libs, just AVFORMAT AVCODEC AVUTIL SWRESAMPLE @@ -39,7 +39,7 @@ if (WHISPER_FFMPEG) message(STATUS "Found avformat ${AVFORMAT_VERSION}") include_directories(${FFMPEG_INCLUDE_DIRS}) - add_compile_definitions(WHISPER_FFMPEG) + add_compile_definitions(WHISPER_COMMON_FFMPEG) list(APPEND COMMON_EXTRA_LIBS ${FFMPEG_LIBRARIES}) diff --git a/examples/common-whisper.cpp b/examples/common-whisper.cpp index d29166b50..8cdd2320c 100644 --- a/examples/common-whisper.cpp +++ b/examples/common-whisper.cpp @@ -34,8 +34,8 @@ #include #include -#ifdef WHISPER_FFMPEG -// as implemented in ffmpeg_trancode.cpp only embedded in common lib if whisper built with ffmpeg support +#ifdef WHISPER_COMMON_FFMPEG +// as implemented in ffmpeg-transcode.cpp only embedded in common lib if whisper built with ffmpeg support extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector & wav_data); #endif @@ -75,7 +75,7 @@ static bool read_audio_from_decoder(ma_decoder & decoder, std::vector & p return true; } -bool read_audio_data(const std::string & fname, std::vector& pcmf32, std::vector>& pcmf32s, bool stereo) { +bool read_audio_data(const std::string & fname, std::vector & pcmf32, std::vector> & pcmf32s, bool stereo) { std::vector audio_data; // used for pipe input from stdin or ffmpeg decoding output ma_result result; @@ -96,53 +96,67 @@ bool read_audio_data(const std::string & fname, std::vector& pcmf32, std: decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 
2 : 1, WHISPER_SAMPLE_RATE); if (fname == "-") { - #ifdef _WIN32 - _setmode(_fileno(stdin), _O_BINARY); - #endif +#ifdef _WIN32 + _setmode(_fileno(stdin), _O_BINARY); +#endif - uint8_t buf[1024]; - while (true) - { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - audio_data.insert(audio_data.end(), buf, buf + n); - } + uint8_t buf[1024]; + while (true) + { + const size_t n = fread(buf, 1, sizeof(buf), stdin); + if (n == 0) { + break; + } + audio_data.insert(audio_data.end(), buf, buf + n); + } - result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder); + result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder); if (result != MA_SUCCESS) { - fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result)); - return false; - } + fprintf(stderr, "%s: failed to open audio data from stdin (%s)\n", __func__, ma_result_description(result)); + return false; + } decoder.initialized = true; - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size()); - } - else { - result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder); - if (result == MA_SUCCESS) { - decoder.initialized = true; + fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size()); + } else { + fprintf(stderr, "%s: reading audio data from '%s' ...\n", __func__, fname.c_str()); + + // first try miniaudio. 
if it fails (or skipped) - try ffmpeg + { + const char * skip = getenv("WHISPER_COMMON_MINIAUDIO_SKIP"); + if (!skip || strlen(skip) == 0 || strcmp(skip, "0") == 0) { + fprintf(stderr, "%s: trying to decode with miniaudio\n", __func__); + + result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder); + if (result == MA_SUCCESS) { + decoder.initialized = true; + } + } else { + fprintf(stderr, "%s: skipping miniaudio\n", __func__); + } } -#if defined(WHISPER_FFMPEG) + +#if defined(WHISPER_COMMON_FFMPEG) if (!decoder.initialized) { + fprintf(stderr, "%s: trying to decode with ffmpeg\n", __func__); + if (ffmpeg_decode_audio(fname, audio_data) != 0) { - fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str()); + fprintf(stderr, "%s: failed to ffmpeg decode\n", __func__); return false; } result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder); if (result != MA_SUCCESS) { - fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result)); + fprintf(stderr, "%s: failed to read audio data as wav (%s)\n", __func__, ma_result_description(result)); return false; } decoder.initialized = true; } -#else - if (!decoder.initialized) { - fprintf(stderr, "error: failed to read audio data from (%s)\n", fname.c_str()); - return false; - } #endif + + if (!decoder.initialized) { + fprintf(stderr, "%s: failed to read audio data\n", __func__); + return false; + } } return read_audio_from_decoder(decoder.decoder, pcmf32, pcmf32s, stereo); diff --git a/examples/ffmpeg-transcode.cpp b/examples/ffmpeg-transcode.cpp index 1fae58a4f..dc57fe745 100644 --- a/examples/ffmpeg-transcode.cpp +++ b/examples/ffmpeg-transcode.cpp @@ -1,368 +1,241 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +#ifdef WHISPER_COMMON_FFMPEG -/* - * transcode.c - convert audio file to WAVE - * - * Copyright (C) 2019 Andrew Clayton - * Copyright (C) 2024 William Tambellini - */ +#include "whisper.h" -// Just for conveninent C++ 
API -#include #include - -// C -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include extern "C" { -#include -#include #include +#include #include } -typedef uint64_t u64; -typedef int64_t s64; -typedef uint32_t u32; -typedef int32_t s32; -typedef uint16_t u16; -typedef int16_t s16; -typedef uint8_t u8; -typedef int8_t s8; +// Write a minimal WAV header into the output buffer. +// Returns the number of bytes written (44 for a standard PCM WAV header). +static size_t wav_header_write(uint8_t * buf, int num_channels, int sample_rate, int bits_per_sample, uint32_t data_size) { + // RIFF header + memcpy(buf, "RIFF", 4); + uint32_t chunk_size = 36 + data_size; + memcpy(buf + 4, &chunk_size, 4); + memcpy(buf + 8, "WAVE", 4); -#define WAVE_SAMPLE_RATE 16000 -#define AVIO_CTX_BUF_SZ 4096 + // fmt subchunk + memcpy(buf + 12, "fmt ", 4); + uint32_t subchunk1_size = 16; + memcpy(buf + 16, &subchunk1_size, 4); + uint16_t audio_format = 1; // PCM + memcpy(buf + 20, &audio_format, 2); + memcpy(buf + 22, &num_channels, 2); + memcpy(buf + 24, &sample_rate, 4); -static const char* ffmpegLog = getenv("FFMPEG_LOG"); -// Todo: add __FILE__ __LINE__ -#define LOG(...) \ - do { if (ffmpegLog) fprintf(stderr, __VA_ARGS__); } while(0) // C99 + int bytes_per_sample = (bits_per_sample / 8) * num_channels; + int byte_rate = sample_rate * bytes_per_sample; + memcpy(buf + 28, &byte_rate, 4); + memcpy(buf + 32, &bytes_per_sample, 2); + memcpy(buf + 34, &bits_per_sample, 2); -/* - * WAVE file header based on definition from - * https://gist.github.com/Jon-Schneider/8b7c53d27a7a13346a643dac9c19d34f - * - * We must ensure this structure doesn't have any holes or - * padding so we can just map it straight to the WAVE data. 
- */ -struct wave_hdr { - /* RIFF Header: "RIFF" */ - char riff_header[4]; - /* size of audio data + sizeof(struct wave_hdr) - 8 */ - int wav_size; - /* "WAVE" */ - char wav_header[4]; + // data subchunk + memcpy(buf + 36, "data", 4); + memcpy(buf + 40, &data_size, 4); - /* Format Header */ - /* "fmt " (includes trailing space) */ - char fmt_header[4]; - /* Should be 16 for PCM */ - int fmt_chunk_size; - /* Should be 1 for PCM. 3 for IEEE Float */ - s16 audio_format; - s16 num_channels; - int sample_rate; - /* - * Number of bytes per second - * sample_rate * num_channels * bit_depth/8 - */ - int byte_rate; - /* num_channels * bytes per sample */ - s16 sample_alignment; - /* bits per sample */ - s16 bit_depth; - - /* Data Header */ - /* "data" */ - char data_header[4]; - /* - * size of audio - * number of samples * num_channels * bit_depth/8 - */ - int data_bytes; -} __attribute__((__packed__)); - -struct audio_buffer { - u8 *ptr; - int size; /* size left in the buffer */ -}; - -static void set_wave_hdr(wave_hdr& wh, size_t size) { - memcpy(&wh.riff_header, "RIFF", 4); - wh.wav_size = size + sizeof(struct wave_hdr) - 8; - memcpy(&wh.wav_header, "WAVE", 4); - memcpy(&wh.fmt_header, "fmt ", 4); - wh.fmt_chunk_size = 16; - wh.audio_format = 1; - wh.num_channels = 1; - wh.sample_rate = WAVE_SAMPLE_RATE; - wh.sample_alignment = 2; - wh.bit_depth = 16; - wh.byte_rate = wh.sample_rate * wh.sample_alignment; - memcpy(&wh.data_header, "data", 4); - wh.data_bytes = size; + return 44; } -static void write_wave_hdr(int fd, size_t size) { - struct wave_hdr wh; - set_wave_hdr(wh, size); - write(fd, &wh, sizeof(struct wave_hdr)); -} - -static int map_file(int fd, u8 **ptr, size_t *size) -{ - struct stat sb; - - fstat(fd, &sb); - *size = sb.st_size; - - *ptr = (u8*)mmap(NULL, *size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); - if (*ptr == MAP_FAILED) { - perror("mmap"); - return -1; - } - - return 0; -} - -static int read_packet(void *opaque, u8 *buf, int buf_size) -{ - struct 
audio_buffer *audio_buf = (audio_buffer*)opaque; - - buf_size = FFMIN(buf_size, audio_buf->size); - - /* copy internal buffer data to buf */ - memcpy(buf, audio_buf->ptr, buf_size); - audio_buf->ptr += buf_size; - audio_buf->size -= buf_size; - - return buf_size; -} - -static void convert_frame(struct SwrContext *swr, AVCodecContext *codec, - AVFrame *frame, s16 **data, int *size, bool flush) -{ - int nr_samples; - s64 delay; - u8 *buffer; - - delay = swr_get_delay(swr, codec->sample_rate); - nr_samples = av_rescale_rnd(delay + frame->nb_samples, - WAVE_SAMPLE_RATE, codec->sample_rate, - AV_ROUND_UP); - av_samples_alloc(&buffer, NULL, 1, nr_samples, AV_SAMPLE_FMT_S16, 0); - - /* - * !flush is used to check if we are flushing any remaining - * conversion buffers... - */ - nr_samples = swr_convert(swr, &buffer, nr_samples, - !flush ? (const u8 **)frame->data : NULL, - !flush ? frame->nb_samples : 0); - - *data = (s16*)realloc(*data, (*size + nr_samples) * sizeof(s16)); - memcpy(*data + *size, buffer, nr_samples * sizeof(s16)); - *size += nr_samples; - av_freep(&buffer); -} - -static bool is_audio_stream(const AVStream *stream) -{ - if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) - return true; - - return false; -} - -// Return non zero on error, 0 on success -// audio_buffer: input memory -// data: decoded output audio data (wav file) -// size: size of output data -static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size) -{ - LOG("decode_audio: input size: %d\n", audio_buf->size); - AVFormatContext *fmt_ctx; - AVIOContext *avio_ctx; - AVStream *stream; - AVCodecContext *codec; - AVPacket *packet; - AVFrame *frame; - struct SwrContext *swr; - u8 *avio_ctx_buffer; - unsigned int i; - int stream_index = -1; - int err; - const size_t errbuffsize = 1024; - char errbuff[errbuffsize]; - - fmt_ctx = avformat_alloc_context(); - avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ); - LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", 
AVIO_CTX_BUF_SZ); - avio_ctx = avio_alloc_context(avio_ctx_buffer, AVIO_CTX_BUF_SZ, 0, audio_buf, &read_packet, NULL, NULL); - fmt_ctx->pb = avio_ctx; - - // open the input stream and read header - err = avformat_open_input(&fmt_ctx, NULL, NULL, NULL); - if (err) { - LOG("Could not read audio buffer: %d: %s\n", err, av_make_error_string(errbuff, errbuffsize, err)); - return err; - } - - err = avformat_find_stream_info(fmt_ctx, NULL); - if (err < 0) { - LOG("Could not retrieve stream info from audio buffer: %d\n", err); - return err; - } - - for (i = 0; i < fmt_ctx->nb_streams; i++) { - if (is_audio_stream(fmt_ctx->streams[i])) { - stream_index = i; - break; - } - } - - if (stream_index == -1) { - LOG("Could not retrieve audio stream from buffer\n"); - return -1; - } - - stream = fmt_ctx->streams[stream_index]; - codec = avcodec_alloc_context3( - avcodec_find_decoder(stream->codecpar->codec_id)); - avcodec_parameters_to_context(codec, stream->codecpar); - err = avcodec_open2(codec, avcodec_find_decoder(codec->codec_id), - NULL); - if (err) { - LOG("Failed to open decoder for stream #%d in audio buffer\n", stream_index); - return err; - } - - /* prepare resampler */ - swr = swr_alloc(); - -#if LIBAVCODEC_VERSION_MAJOR > 60 - AVChannelLayout in_ch_layout = codec->ch_layout; - AVChannelLayout out_ch_layout = AV_CHANNEL_LAYOUT_MONO; - - /* Set the source audio layout as-is */ - av_opt_set_chlayout(swr, "in_chlayout", &in_ch_layout, 0); - av_opt_set_int(swr, "in_sample_rate", codec->sample_rate, 0); - av_opt_set_sample_fmt(swr, "in_sample_fmt", codec->sample_fmt, 0); - - /* Convert it into 16khz Mono */ - av_opt_set_chlayout(swr, "out_chlayout", &out_ch_layout, 0); - av_opt_set_int(swr, "out_sample_rate", WAVE_SAMPLE_RATE, 0); - av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0); -#else - av_opt_set_int(swr, "in_channel_count", codec->channels, 0); - av_opt_set_int(swr, "out_channel_count", 1, 0); - av_opt_set_int(swr, "in_channel_layout", 
codec->channel_layout, 0); - av_opt_set_int(swr, "out_channel_layout", AV_CH_LAYOUT_MONO, 0); - av_opt_set_int(swr, "in_sample_rate", codec->sample_rate, 0); - av_opt_set_int(swr, "out_sample_rate", WAVE_SAMPLE_RATE, 0); - av_opt_set_sample_fmt(swr, "in_sample_fmt", codec->sample_fmt, 0); - av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0); -#endif - - swr_init(swr); - if (!swr_is_initialized(swr)) { - LOG("Resampler has not been properly initialized\n"); - return -1; - } - - packet=av_packet_alloc(); - if (!packet) { - LOG("Error allocating the packet\n"); - return -1; - } - frame = av_frame_alloc(); - if (!frame) { - LOG("Error allocating the frame\n"); - return -1; - } - - /* iterate through frames */ - *data = NULL; - *size = 0; - while (av_read_frame(fmt_ctx, packet) >= 0) { - avcodec_send_packet(codec, packet); - - err = avcodec_receive_frame(codec, frame); - if (err == AVERROR(EAGAIN)) - continue; - - convert_frame(swr, codec, frame, data, size, false); - } - /* Flush any remaining conversion buffers... */ - convert_frame(swr, codec, frame, data, size, true); - - av_packet_free(&packet); - av_frame_free(&frame); - swr_free(&swr); - //avio_context_free(); // todo? - avcodec_free_context(&codec); - avformat_close_input(&fmt_ctx); - avformat_free_context(fmt_ctx); - - if (avio_ctx) { - av_freep(&avio_ctx->buffer); - av_freep(&avio_ctx); - } - - return 0; -} - -// in mem decoding/conversion/resampling: -// ifname: input file path -// owav_data: in mem wav file. 
Can be forwarded as it to whisper/drwav -// return 0 on success -int ffmpeg_decode_audio(const std::string &ifname, std::vector& owav_data) { - LOG("ffmpeg_decode_audio: %s\n", ifname.c_str()); - int ifd = open(ifname.c_str(), O_RDONLY); - if (ifd == -1) { - fprintf(stderr, "Couldn't open input file %s\n", ifname.c_str()); - return -1; +bool ffmpeg_decode_audio(const std::string & ifname, std::vector & wav_data) { + { + const char * verbose = getenv("WHISPER_COMMON_FFMPEG_VERBOSE"); + if (verbose && strcmp(verbose, "2") == 0) { + av_log_set_level(AV_LOG_DEBUG); + } else if (verbose && strcmp(verbose, "1") == 0) { + av_log_set_level(AV_LOG_VERBOSE); + } else { + av_log_set_level(AV_LOG_WARNING); + } } - u8 *ibuf = NULL; - size_t ibuf_size; - int err = map_file(ifd, &ibuf, &ibuf_size); - if (err) { - LOG("Couldn't map input file %s\n", ifname.c_str()); - return err; + + AVFormatContext * fmt_ctx = nullptr; + if (avformat_open_input(&fmt_ctx, ifname.c_str(), nullptr, nullptr) != 0) { + fprintf(stderr, "error: failed to open input file '%s'\n", ifname.c_str()); + return true; } - LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size); - struct audio_buffer inaudio_buf; - inaudio_buf.ptr = ibuf; - inaudio_buf.size = ibuf_size; - s16 *odata=NULL; - int osize=0; - - err = decode_audio(&inaudio_buf, &odata, &osize); - LOG("decode_audio returned %d \n", err); - if (err != 0) { - LOG("decode_audio failed\n"); - return err; + if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) { + fprintf(stderr, "error: failed to find stream information\n"); + avformat_close_input(&fmt_ctx); + return true; } - LOG("decode_audio output size: %d\n", osize); - wave_hdr wh; - const size_t outdatasize = osize * sizeof(s16); - set_wave_hdr(wh, outdatasize); - owav_data.resize(sizeof(wave_hdr) + outdatasize); - // header: - memcpy(owav_data.data(), &wh, sizeof(wave_hdr)); - // the data: - memcpy(owav_data.data() + sizeof(wave_hdr), odata, osize* sizeof(s16)); + // Find the first audio 
stream + int audio_stream_idx = -1; + for (unsigned int i = 0; i < fmt_ctx->nb_streams; i++) { + if (fmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) { + audio_stream_idx = i; + break; + } + } - return 0; + if (audio_stream_idx == -1) { + fprintf(stderr, "error: failed to find an audio stream in '%s'\n", ifname.c_str()); + avformat_close_input(&fmt_ctx); + return true; + } + + AVStream * audio_stream = fmt_ctx->streams[audio_stream_idx]; + + // Open the decoder + const AVCodec * codec = avcodec_find_decoder(audio_stream->codecpar->codec_id); + if (!codec) { + fprintf(stderr, "error: failed to find decoder for codec id %d\n", audio_stream->codecpar->codec_id); + avformat_close_input(&fmt_ctx); + return true; + } + + AVCodecContext * codec_ctx = avcodec_alloc_context3(codec); + if (!codec_ctx) { + fprintf(stderr, "error: failed to allocate codec context\n"); + avformat_close_input(&fmt_ctx); + return true; + } + + if (avcodec_parameters_to_context(codec_ctx, audio_stream->codecpar) < 0) { + fprintf(stderr, "error: failed to copy codec parameters to context\n"); + avcodec_free_context(&codec_ctx); + avformat_close_input(&fmt_ctx); + return true; + } + + if (avcodec_open2(codec_ctx, codec, nullptr) < 0) { + fprintf(stderr, "error: failed to open codec\n"); + avcodec_free_context(&codec_ctx); + avformat_close_input(&fmt_ctx); + return true; + } + + // Setup resampler: convert to 16-bit signed PCM, mono, 16000 Hz + const enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_S16; + const int out_sample_rate = WHISPER_SAMPLE_RATE; + + AVChannelLayout out_ch_layout = AV_CHANNEL_LAYOUT_MONO; + + SwrContext * swr_ctx = nullptr; + if (swr_alloc_set_opts2(&swr_ctx, &out_ch_layout, out_sample_fmt, out_sample_rate, + &codec_ctx->ch_layout, codec_ctx->sample_fmt, codec_ctx->sample_rate, + 0, nullptr) < 0) { + fprintf(stderr, "error: failed to allocate swr context\n"); + avcodec_free_context(&codec_ctx); + avformat_close_input(&fmt_ctx); + return true; + } + + if 
(swr_init(swr_ctx) < 0) { + fprintf(stderr, "error: failed to initialize swr context\n"); + swr_free(&swr_ctx); + avcodec_free_context(&codec_ctx); + avformat_close_input(&fmt_ctx); + return true; + } + + // Decode and resample + AVPacket * packet = av_packet_alloc(); + AVFrame * frame = av_frame_alloc(); + + // Buffer to collect resampled output + std::vector pcm_data; + + // Max output samples per swr_convert call + const int max_out_samples = 16 * 1024; + std::vector out_buffer(max_out_samples); + + while (av_read_frame(fmt_ctx, packet) >= 0) { + if (packet->stream_index != audio_stream_idx) { + av_packet_unref(packet); + continue; + } + + int ret = avcodec_send_packet(codec_ctx, packet); + av_packet_unref(packet); + + if (ret < 0) { + continue; + } + + while (ret >= 0) { + ret = avcodec_receive_frame(codec_ctx, frame); + if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { + break; + } + if (ret < 0) { + break; + } + + // Resample + int out_samples = av_rescale_rnd(swr_get_delay(swr_ctx, out_sample_rate) + frame->nb_samples, + out_sample_rate, out_sample_rate, AV_ROUND_UP); + if (out_samples > (int)out_buffer.size()) { + out_buffer.resize(out_samples); + } + + const uint8_t * in_data[16] = {0}; + for (int p = 0; p < (int)codec_ctx->ch_layout.nb_channels && p < 16; p++) { + in_data[p] = frame->data[p]; + } + uint8_t * out_data[16] = {0}; + out_data[0] = (uint8_t *)out_buffer.data(); + + int got_samples = swr_convert(swr_ctx, out_data, out_samples, in_data, frame->nb_samples); + if (got_samples > 0) { + pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + got_samples); + } + } + } + + // Flush the decoder + avcodec_send_packet(codec_ctx, nullptr); + while (avcodec_receive_frame(codec_ctx, frame) >= 0) { + int out_samples = av_rescale_rnd(swr_get_delay(swr_ctx, out_sample_rate) + frame->nb_samples, + out_sample_rate, out_sample_rate, AV_ROUND_UP); + if (out_samples > (int)out_buffer.size()) { + out_buffer.resize(out_samples); + } + const uint8_t 
* in_data[16] = {0}; + for (int p = 0; p < (int)codec_ctx->ch_layout.nb_channels && p < 16; p++) { + in_data[p] = frame->data[p]; + } + uint8_t * out_data[16] = {0}; + out_data[0] = (uint8_t *)out_buffer.data(); + + int got_samples = swr_convert(swr_ctx, out_data, out_samples, in_data, frame->nb_samples); + if (got_samples > 0) { + pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + got_samples); + } + } + + // Flush the resampler + uint8_t * out_data[16] = {0}; + out_data[0] = (uint8_t *)out_buffer.data(); + int flush_samples = swr_convert(swr_ctx, out_data, max_out_samples, nullptr, 0); + if (flush_samples > 0) { + pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + flush_samples); + } + + // Build WAV output + uint32_t data_size = pcm_data.size() * sizeof(int16_t); + wav_data.resize(44 + data_size); + + wav_header_write(wav_data.data(), 1, out_sample_rate, 16, data_size); + memcpy(wav_data.data() + 44, pcm_data.data(), data_size); + + // Cleanup + av_frame_free(&frame); + av_packet_free(&packet); + swr_free(&swr_ctx); + avcodec_free_context(&codec_ctx); + avformat_close_input(&fmt_ctx); + + return false; // success } + +#endif // WHISPER_COMMON_FFMPEG diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 09e77ea89..0593b748d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -78,7 +78,7 @@ add_test(NAME ${TEST_TARGET} -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav) set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large") -if (WHISPER_FFMPEG) +if (WHISPER_COMMON_FFMPEG) set(TEST_TARGET test-whisper-cli-tiny-mp3) # Check with reviewers: any way to check the output transcription via ctest (diff, ...)? add_test(NAME ${TEST_TARGET}