common : re-implement `ffmpeg-transcode.cpp` + clarify ffmpeg usage (#3846)
* examples : remove ffmpeg-transcode.cpp * examples : implement ffmpeg-transcode.cpp Assisted-by: llama.cpp:local pi * common : switch from WHISPER_FFMPEG -> WHISPER_COMMON_FFMPEG
This commit is contained in:
parent
f24588a272
commit
f39cc71282
|
|
@ -85,7 +85,7 @@ option(WHISPER_CURL "whisper: use libcurl to download model from an URL" OFF)
|
|||
option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
|
||||
option(WHISPER_COMMON_FFMPEG "whisper: examples link with ffmpeg libs in order to decode more audio formats" OFF)
|
||||
endif()
|
||||
|
||||
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
|
||||
|
|
@ -121,6 +121,7 @@ whisper_option_depr(WARNING WHISPER_RPC GGML_RPC)
|
|||
whisper_option_depr(WARNING WHISPER_SYCL GGML_SYCL)
|
||||
whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
|
||||
whisper_option_depr(WARNING WHISPER_CCACHE GGML_CCACHE)
|
||||
whisper_option_depr(WARNING WHISPER_FFMPEG WHISPER_COMMON_FFMPEG)
|
||||
|
||||
if (GGML_CUDA AND NOT MSVC)
|
||||
#GGML_CUDA enabled, add the necessary compile options -Wno-deprecated-gpu-targets
|
||||
|
|
|
|||
|
|
@ -425,9 +425,10 @@ cmake -B build -DGGML_MUSA=1 -DMUSA_ARCHITECTURES="21"
|
|||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
## FFmpeg support (Linux only)
|
||||
## FFmpeg support (examples only)
|
||||
|
||||
If you want to support more audio formats (such as Opus and AAC), you can turn on the `WHISPER_FFMPEG` build flag to enable FFmpeg integration.
|
||||
By default, the examples in this repo use the [miniaudio](https://github.com/mackron/miniaudio) library to decode audio files.
|
||||
Some of the examples can also use FFmpeg for decoding, which adds support for more audio formats (such as Opus and AAC). To enable this, build with the `WHISPER_COMMON_FFMPEG` option turned on.
|
||||
|
||||
First, you need to install required libraries:
|
||||
|
||||
|
|
@ -442,7 +443,7 @@ sudo dnf install libavcodec-free-devel libavformat-free-devel libavutil-free-dev
|
|||
Then you can build the project as follows:
|
||||
|
||||
```bash
|
||||
cmake -B build -D WHISPER_FFMPEG=yes
|
||||
cmake -B build -D WHISPER_COMMON_FFMPEG=yes
|
||||
cmake --build build
|
||||
```
|
||||
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ set(TARGET common)
|
|||
|
||||
unset(COMMON_EXTRA_LIBS)
|
||||
|
||||
if (WHISPER_FFMPEG)
|
||||
if (WHISPER_COMMON_FFMPEG)
|
||||
# As of cmake 3.27, there is no official cmake support for FindFFmpeg.
|
||||
# Consequently we added a FindFFmpeg.cmake script to the cmake subfolder:
|
||||
# whisper.cpp does not need the full ffmpeg libs, just AVFORMAT AVCODEC AVUTIL SWRESAMPLE
|
||||
|
|
@ -39,7 +39,7 @@ if (WHISPER_FFMPEG)
|
|||
message(STATUS "Found avformat ${AVFORMAT_VERSION}")
|
||||
|
||||
include_directories(${FFMPEG_INCLUDE_DIRS})
|
||||
add_compile_definitions(WHISPER_FFMPEG)
|
||||
add_compile_definitions(WHISPER_COMMON_FFMPEG)
|
||||
|
||||
list(APPEND COMMON_EXTRA_LIBS ${FFMPEG_LIBRARIES})
|
||||
|
||||
|
|
|
|||
|
|
@ -34,8 +34,8 @@
|
|||
#include <cstring>
|
||||
#include <fstream>
|
||||
|
||||
#ifdef WHISPER_FFMPEG
|
||||
// as implemented in ffmpeg_trancode.cpp only embedded in common lib if whisper built with ffmpeg support
|
||||
#ifdef WHISPER_COMMON_FFMPEG
|
||||
// implemented in ffmpeg-transcode.cpp, which is only compiled into the common lib when WHISPER_COMMON_FFMPEG is defined
|
||||
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
|
||||
#endif
|
||||
|
||||
|
|
@ -75,7 +75,7 @@ static bool read_audio_from_decoder(ma_decoder & decoder, std::vector<float> & p
|
|||
return true;
|
||||
}
|
||||
|
||||
bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
|
||||
bool read_audio_data(const std::string & fname, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
|
||||
std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output
|
||||
|
||||
ma_result result;
|
||||
|
|
@ -96,53 +96,67 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
|
|||
decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE);
|
||||
|
||||
if (fname == "-") {
|
||||
#ifdef _WIN32
|
||||
_setmode(_fileno(stdin), _O_BINARY);
|
||||
#endif
|
||||
#ifdef _WIN32
|
||||
_setmode(_fileno(stdin), _O_BINARY);
|
||||
#endif
|
||||
|
||||
uint8_t buf[1024];
|
||||
while (true)
|
||||
{
|
||||
const size_t n = fread(buf, 1, sizeof(buf), stdin);
|
||||
if (n == 0) {
|
||||
break;
|
||||
}
|
||||
audio_data.insert(audio_data.end(), buf, buf + n);
|
||||
}
|
||||
uint8_t buf[1024];
|
||||
while (true)
|
||||
{
|
||||
const size_t n = fread(buf, 1, sizeof(buf), stdin);
|
||||
if (n == 0) {
|
||||
break;
|
||||
}
|
||||
audio_data.insert(audio_data.end(), buf, buf + n);
|
||||
}
|
||||
|
||||
result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder);
|
||||
result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder);
|
||||
if (result != MA_SUCCESS) {
|
||||
fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result));
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "%s: failed to open audio data from stdin (%s)\n", __func__, ma_result_description(result));
|
||||
return false;
|
||||
}
|
||||
decoder.initialized = true;
|
||||
|
||||
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size());
|
||||
}
|
||||
else {
|
||||
result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder);
|
||||
if (result == MA_SUCCESS) {
|
||||
decoder.initialized = true;
|
||||
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size());
|
||||
} else {
|
||||
fprintf(stderr, "%s: reading audio data from '%s' ...\n", __func__, fname.c_str());
|
||||
|
||||
// first try miniaudio. if it fails (or skipped) - try ffmpeg
|
||||
{
|
||||
const char * skip = getenv("WHISPER_COMMON_MINIAUDIO_SKIP");
|
||||
if (!skip || strlen(skip) == 0 || strcmp(skip, "0") == 0) {
|
||||
fprintf(stderr, "%s: trying to decode with miniaudio\n", __func__);
|
||||
|
||||
result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder);
|
||||
if (result == MA_SUCCESS) {
|
||||
decoder.initialized = true;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "%s: skipping miniaudio\n", __func__);
|
||||
}
|
||||
}
|
||||
#if defined(WHISPER_FFMPEG)
|
||||
|
||||
#if defined(WHISPER_COMMON_FFMPEG)
|
||||
if (!decoder.initialized) {
|
||||
fprintf(stderr, "%s: trying to decode with ffmpeg\n", __func__);
|
||||
|
||||
if (ffmpeg_decode_audio(fname, audio_data) != 0) {
|
||||
fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str());
|
||||
fprintf(stderr, "%s: failed to ffmpeg decode\n", __func__);
|
||||
return false;
|
||||
}
|
||||
result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder);
|
||||
if (result != MA_SUCCESS) {
|
||||
fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result));
|
||||
fprintf(stderr, "%s: failed to read audio data as wav (%s)\n", __func__, ma_result_description(result));
|
||||
return false;
|
||||
}
|
||||
decoder.initialized = true;
|
||||
}
|
||||
#else
|
||||
if (!decoder.initialized) {
|
||||
fprintf(stderr, "error: failed to read audio data from (%s)\n", fname.c_str());
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!decoder.initialized) {
|
||||
fprintf(stderr, "%s: failed to read audio data\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return read_audio_from_decoder(decoder.decoder, pcmf32, pcmf32s, stereo);
|
||||
|
|
|
|||
|
|
@ -1,368 +1,241 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifdef WHISPER_COMMON_FFMPEG
|
||||
|
||||
/*
|
||||
* transcode.c - convert audio file to WAVE
|
||||
*
|
||||
* Copyright (C) 2019 Andrew Clayton <andrew@digital-domain.net>
|
||||
* Copyright (C) 2024 William Tambellini <william.tambellini@gmail.com>
|
||||
*/
|
||||
#include "whisper.h"
|
||||
|
||||
// Just for conveninent C++ API
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
// C
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
extern "C" {
|
||||
#include <libavutil/opt.h>
|
||||
#include <libavcodec/avcodec.h>
|
||||
#include <libavformat/avformat.h>
|
||||
#include <libavcodec/avcodec.h>
|
||||
#include <libswresample/swresample.h>
|
||||
}
|
||||
|
||||
typedef uint64_t u64;
|
||||
typedef int64_t s64;
|
||||
typedef uint32_t u32;
|
||||
typedef int32_t s32;
|
||||
typedef uint16_t u16;
|
||||
typedef int16_t s16;
|
||||
typedef uint8_t u8;
|
||||
typedef int8_t s8;
|
||||
// Write a minimal WAV header into the output buffer.
|
||||
// Returns the number of bytes written (44 for a standard PCM WAV header).
|
||||
static size_t wav_header_write(uint8_t * buf, int num_channels, int sample_rate, int bits_per_sample, uint32_t data_size) {
|
||||
// RIFF header
|
||||
memcpy(buf, "RIFF", 4);
|
||||
uint32_t chunk_size = 36 + data_size;
|
||||
memcpy(buf + 4, &chunk_size, 4);
|
||||
memcpy(buf + 8, "WAVE", 4);
|
||||
|
||||
#define WAVE_SAMPLE_RATE 16000
|
||||
#define AVIO_CTX_BUF_SZ 4096
|
||||
// fmt subchunk
|
||||
memcpy(buf + 12, "fmt ", 4);
|
||||
uint32_t subchunk1_size = 16;
|
||||
memcpy(buf + 16, &subchunk1_size, 4);
|
||||
uint16_t audio_format = 1; // PCM
|
||||
memcpy(buf + 20, &audio_format, 2);
|
||||
memcpy(buf + 22, &num_channels, 2);
|
||||
memcpy(buf + 24, &sample_rate, 4);
|
||||
|
||||
static const char* ffmpegLog = getenv("FFMPEG_LOG");
|
||||
// Todo: add __FILE__ __LINE__
|
||||
#define LOG(...) \
|
||||
do { if (ffmpegLog) fprintf(stderr, __VA_ARGS__); } while(0) // C99
|
||||
int bytes_per_sample = (bits_per_sample / 8) * num_channels;
|
||||
int byte_rate = sample_rate * bytes_per_sample;
|
||||
memcpy(buf + 28, &byte_rate, 4);
|
||||
memcpy(buf + 32, &bytes_per_sample, 2);
|
||||
memcpy(buf + 34, &bits_per_sample, 2);
|
||||
|
||||
/*
|
||||
* WAVE file header based on definition from
|
||||
* https://gist.github.com/Jon-Schneider/8b7c53d27a7a13346a643dac9c19d34f
|
||||
*
|
||||
* We must ensure this structure doesn't have any holes or
|
||||
* padding so we can just map it straight to the WAVE data.
|
||||
*/
|
||||
struct wave_hdr {
|
||||
/* RIFF Header: "RIFF" */
|
||||
char riff_header[4];
|
||||
/* size of audio data + sizeof(struct wave_hdr) - 8 */
|
||||
int wav_size;
|
||||
/* "WAVE" */
|
||||
char wav_header[4];
|
||||
// data subchunk
|
||||
memcpy(buf + 36, "data", 4);
|
||||
memcpy(buf + 40, &data_size, 4);
|
||||
|
||||
/* Format Header */
|
||||
/* "fmt " (includes trailing space) */
|
||||
char fmt_header[4];
|
||||
/* Should be 16 for PCM */
|
||||
int fmt_chunk_size;
|
||||
/* Should be 1 for PCM. 3 for IEEE Float */
|
||||
s16 audio_format;
|
||||
s16 num_channels;
|
||||
int sample_rate;
|
||||
/*
|
||||
* Number of bytes per second
|
||||
* sample_rate * num_channels * bit_depth/8
|
||||
*/
|
||||
int byte_rate;
|
||||
/* num_channels * bytes per sample */
|
||||
s16 sample_alignment;
|
||||
/* bits per sample */
|
||||
s16 bit_depth;
|
||||
|
||||
/* Data Header */
|
||||
/* "data" */
|
||||
char data_header[4];
|
||||
/*
|
||||
* size of audio
|
||||
* number of samples * num_channels * bit_depth/8
|
||||
*/
|
||||
int data_bytes;
|
||||
} __attribute__((__packed__));
|
||||
|
||||
struct audio_buffer {
|
||||
u8 *ptr;
|
||||
int size; /* size left in the buffer */
|
||||
};
|
||||
|
||||
static void set_wave_hdr(wave_hdr& wh, size_t size) {
|
||||
memcpy(&wh.riff_header, "RIFF", 4);
|
||||
wh.wav_size = size + sizeof(struct wave_hdr) - 8;
|
||||
memcpy(&wh.wav_header, "WAVE", 4);
|
||||
memcpy(&wh.fmt_header, "fmt ", 4);
|
||||
wh.fmt_chunk_size = 16;
|
||||
wh.audio_format = 1;
|
||||
wh.num_channels = 1;
|
||||
wh.sample_rate = WAVE_SAMPLE_RATE;
|
||||
wh.sample_alignment = 2;
|
||||
wh.bit_depth = 16;
|
||||
wh.byte_rate = wh.sample_rate * wh.sample_alignment;
|
||||
memcpy(&wh.data_header, "data", 4);
|
||||
wh.data_bytes = size;
|
||||
return 44;
|
||||
}
|
||||
|
||||
static void write_wave_hdr(int fd, size_t size) {
|
||||
struct wave_hdr wh;
|
||||
set_wave_hdr(wh, size);
|
||||
write(fd, &wh, sizeof(struct wave_hdr));
|
||||
}
|
||||
|
||||
static int map_file(int fd, u8 **ptr, size_t *size)
|
||||
{
|
||||
struct stat sb;
|
||||
|
||||
fstat(fd, &sb);
|
||||
*size = sb.st_size;
|
||||
|
||||
*ptr = (u8*)mmap(NULL, *size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
|
||||
if (*ptr == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int read_packet(void *opaque, u8 *buf, int buf_size)
|
||||
{
|
||||
struct audio_buffer *audio_buf = (audio_buffer*)opaque;
|
||||
|
||||
buf_size = FFMIN(buf_size, audio_buf->size);
|
||||
|
||||
/* copy internal buffer data to buf */
|
||||
memcpy(buf, audio_buf->ptr, buf_size);
|
||||
audio_buf->ptr += buf_size;
|
||||
audio_buf->size -= buf_size;
|
||||
|
||||
return buf_size;
|
||||
}
|
||||
|
||||
static void convert_frame(struct SwrContext *swr, AVCodecContext *codec,
|
||||
AVFrame *frame, s16 **data, int *size, bool flush)
|
||||
{
|
||||
int nr_samples;
|
||||
s64 delay;
|
||||
u8 *buffer;
|
||||
|
||||
delay = swr_get_delay(swr, codec->sample_rate);
|
||||
nr_samples = av_rescale_rnd(delay + frame->nb_samples,
|
||||
WAVE_SAMPLE_RATE, codec->sample_rate,
|
||||
AV_ROUND_UP);
|
||||
av_samples_alloc(&buffer, NULL, 1, nr_samples, AV_SAMPLE_FMT_S16, 0);
|
||||
|
||||
/*
|
||||
* !flush is used to check if we are flushing any remaining
|
||||
* conversion buffers...
|
||||
*/
|
||||
nr_samples = swr_convert(swr, &buffer, nr_samples,
|
||||
!flush ? (const u8 **)frame->data : NULL,
|
||||
!flush ? frame->nb_samples : 0);
|
||||
|
||||
*data = (s16*)realloc(*data, (*size + nr_samples) * sizeof(s16));
|
||||
memcpy(*data + *size, buffer, nr_samples * sizeof(s16));
|
||||
*size += nr_samples;
|
||||
av_freep(&buffer);
|
||||
}
|
||||
|
||||
static bool is_audio_stream(const AVStream *stream)
|
||||
{
|
||||
if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Return non zero on error, 0 on success
|
||||
// audio_buffer: input memory
|
||||
// data: decoded output audio data (wav file)
|
||||
// size: size of output data
|
||||
static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
|
||||
{
|
||||
LOG("decode_audio: input size: %d\n", audio_buf->size);
|
||||
AVFormatContext *fmt_ctx;
|
||||
AVIOContext *avio_ctx;
|
||||
AVStream *stream;
|
||||
AVCodecContext *codec;
|
||||
AVPacket *packet;
|
||||
AVFrame *frame;
|
||||
struct SwrContext *swr;
|
||||
u8 *avio_ctx_buffer;
|
||||
unsigned int i;
|
||||
int stream_index = -1;
|
||||
int err;
|
||||
const size_t errbuffsize = 1024;
|
||||
char errbuff[errbuffsize];
|
||||
|
||||
fmt_ctx = avformat_alloc_context();
|
||||
avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
|
||||
LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);
|
||||
avio_ctx = avio_alloc_context(avio_ctx_buffer, AVIO_CTX_BUF_SZ, 0, audio_buf, &read_packet, NULL, NULL);
|
||||
fmt_ctx->pb = avio_ctx;
|
||||
|
||||
// open the input stream and read header
|
||||
err = avformat_open_input(&fmt_ctx, NULL, NULL, NULL);
|
||||
if (err) {
|
||||
LOG("Could not read audio buffer: %d: %s\n", err, av_make_error_string(errbuff, errbuffsize, err));
|
||||
return err;
|
||||
}
|
||||
|
||||
err = avformat_find_stream_info(fmt_ctx, NULL);
|
||||
if (err < 0) {
|
||||
LOG("Could not retrieve stream info from audio buffer: %d\n", err);
|
||||
return err;
|
||||
}
|
||||
|
||||
for (i = 0; i < fmt_ctx->nb_streams; i++) {
|
||||
if (is_audio_stream(fmt_ctx->streams[i])) {
|
||||
stream_index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (stream_index == -1) {
|
||||
LOG("Could not retrieve audio stream from buffer\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
stream = fmt_ctx->streams[stream_index];
|
||||
codec = avcodec_alloc_context3(
|
||||
avcodec_find_decoder(stream->codecpar->codec_id));
|
||||
avcodec_parameters_to_context(codec, stream->codecpar);
|
||||
err = avcodec_open2(codec, avcodec_find_decoder(codec->codec_id),
|
||||
NULL);
|
||||
if (err) {
|
||||
LOG("Failed to open decoder for stream #%d in audio buffer\n", stream_index);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* prepare resampler */
|
||||
swr = swr_alloc();
|
||||
|
||||
#if LIBAVCODEC_VERSION_MAJOR > 60
|
||||
AVChannelLayout in_ch_layout = codec->ch_layout;
|
||||
AVChannelLayout out_ch_layout = AV_CHANNEL_LAYOUT_MONO;
|
||||
|
||||
/* Set the source audio layout as-is */
|
||||
av_opt_set_chlayout(swr, "in_chlayout", &in_ch_layout, 0);
|
||||
av_opt_set_int(swr, "in_sample_rate", codec->sample_rate, 0);
|
||||
av_opt_set_sample_fmt(swr, "in_sample_fmt", codec->sample_fmt, 0);
|
||||
|
||||
/* Convert it into 16khz Mono */
|
||||
av_opt_set_chlayout(swr, "out_chlayout", &out_ch_layout, 0);
|
||||
av_opt_set_int(swr, "out_sample_rate", WAVE_SAMPLE_RATE, 0);
|
||||
av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
|
||||
#else
|
||||
av_opt_set_int(swr, "in_channel_count", codec->channels, 0);
|
||||
av_opt_set_int(swr, "out_channel_count", 1, 0);
|
||||
av_opt_set_int(swr, "in_channel_layout", codec->channel_layout, 0);
|
||||
av_opt_set_int(swr, "out_channel_layout", AV_CH_LAYOUT_MONO, 0);
|
||||
av_opt_set_int(swr, "in_sample_rate", codec->sample_rate, 0);
|
||||
av_opt_set_int(swr, "out_sample_rate", WAVE_SAMPLE_RATE, 0);
|
||||
av_opt_set_sample_fmt(swr, "in_sample_fmt", codec->sample_fmt, 0);
|
||||
av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
|
||||
#endif
|
||||
|
||||
swr_init(swr);
|
||||
if (!swr_is_initialized(swr)) {
|
||||
LOG("Resampler has not been properly initialized\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
packet=av_packet_alloc();
|
||||
if (!packet) {
|
||||
LOG("Error allocating the packet\n");
|
||||
return -1;
|
||||
}
|
||||
frame = av_frame_alloc();
|
||||
if (!frame) {
|
||||
LOG("Error allocating the frame\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* iterate through frames */
|
||||
*data = NULL;
|
||||
*size = 0;
|
||||
while (av_read_frame(fmt_ctx, packet) >= 0) {
|
||||
avcodec_send_packet(codec, packet);
|
||||
|
||||
err = avcodec_receive_frame(codec, frame);
|
||||
if (err == AVERROR(EAGAIN))
|
||||
continue;
|
||||
|
||||
convert_frame(swr, codec, frame, data, size, false);
|
||||
}
|
||||
/* Flush any remaining conversion buffers... */
|
||||
convert_frame(swr, codec, frame, data, size, true);
|
||||
|
||||
av_packet_free(&packet);
|
||||
av_frame_free(&frame);
|
||||
swr_free(&swr);
|
||||
//avio_context_free(); // todo?
|
||||
avcodec_free_context(&codec);
|
||||
avformat_close_input(&fmt_ctx);
|
||||
avformat_free_context(fmt_ctx);
|
||||
|
||||
if (avio_ctx) {
|
||||
av_freep(&avio_ctx->buffer);
|
||||
av_freep(&avio_ctx);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// in mem decoding/conversion/resampling:
|
||||
// ifname: input file path
|
||||
// owav_data: in mem wav file. Can be forwarded as it to whisper/drwav
|
||||
// return 0 on success
|
||||
int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_data) {
|
||||
LOG("ffmpeg_decode_audio: %s\n", ifname.c_str());
|
||||
int ifd = open(ifname.c_str(), O_RDONLY);
|
||||
if (ifd == -1) {
|
||||
fprintf(stderr, "Couldn't open input file %s\n", ifname.c_str());
|
||||
return -1;
|
||||
bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data) {
|
||||
{
|
||||
const char * verbose = getenv("WHISPER_COMMON_FFMPEG_VERBOSE");
|
||||
if (verbose && strcmp(verbose, "2") == 0) {
|
||||
av_log_set_level(AV_LOG_DEBUG);
|
||||
} else if (verbose && strcmp(verbose, "1") == 0) {
|
||||
av_log_set_level(AV_LOG_VERBOSE);
|
||||
} else {
|
||||
av_log_set_level(AV_LOG_WARNING);
|
||||
}
|
||||
}
|
||||
u8 *ibuf = NULL;
|
||||
size_t ibuf_size;
|
||||
int err = map_file(ifd, &ibuf, &ibuf_size);
|
||||
if (err) {
|
||||
LOG("Couldn't map input file %s\n", ifname.c_str());
|
||||
return err;
|
||||
|
||||
AVFormatContext * fmt_ctx = nullptr;
|
||||
if (avformat_open_input(&fmt_ctx, ifname.c_str(), nullptr, nullptr) != 0) {
|
||||
fprintf(stderr, "error: failed to open input file '%s'\n", ifname.c_str());
|
||||
return true;
|
||||
}
|
||||
LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size);
|
||||
struct audio_buffer inaudio_buf;
|
||||
inaudio_buf.ptr = ibuf;
|
||||
inaudio_buf.size = ibuf_size;
|
||||
|
||||
s16 *odata=NULL;
|
||||
int osize=0;
|
||||
|
||||
err = decode_audio(&inaudio_buf, &odata, &osize);
|
||||
LOG("decode_audio returned %d \n", err);
|
||||
if (err != 0) {
|
||||
LOG("decode_audio failed\n");
|
||||
return err;
|
||||
if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) {
|
||||
fprintf(stderr, "error: failed to find stream information\n");
|
||||
avformat_close_input(&fmt_ctx);
|
||||
return true;
|
||||
}
|
||||
LOG("decode_audio output size: %d\n", osize);
|
||||
|
||||
wave_hdr wh;
|
||||
const size_t outdatasize = osize * sizeof(s16);
|
||||
set_wave_hdr(wh, outdatasize);
|
||||
owav_data.resize(sizeof(wave_hdr) + outdatasize);
|
||||
// header:
|
||||
memcpy(owav_data.data(), &wh, sizeof(wave_hdr));
|
||||
// the data:
|
||||
memcpy(owav_data.data() + sizeof(wave_hdr), odata, osize* sizeof(s16));
|
||||
// Find the first audio stream
|
||||
int audio_stream_idx = -1;
|
||||
for (unsigned int i = 0; i < fmt_ctx->nb_streams; i++) {
|
||||
if (fmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
|
||||
audio_stream_idx = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
if (audio_stream_idx == -1) {
|
||||
fprintf(stderr, "error: failed to find an audio stream in '%s'\n", ifname.c_str());
|
||||
avformat_close_input(&fmt_ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
AVStream * audio_stream = fmt_ctx->streams[audio_stream_idx];
|
||||
|
||||
// Open the decoder
|
||||
const AVCodec * codec = avcodec_find_decoder(audio_stream->codecpar->codec_id);
|
||||
if (!codec) {
|
||||
fprintf(stderr, "error: failed to find decoder for codec id %d\n", audio_stream->codecpar->codec_id);
|
||||
avformat_close_input(&fmt_ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
AVCodecContext * codec_ctx = avcodec_alloc_context3(codec);
|
||||
if (!codec_ctx) {
|
||||
fprintf(stderr, "error: failed to allocate codec context\n");
|
||||
avformat_close_input(&fmt_ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (avcodec_parameters_to_context(codec_ctx, audio_stream->codecpar) < 0) {
|
||||
fprintf(stderr, "error: failed to copy codec parameters to context\n");
|
||||
avcodec_free_context(&codec_ctx);
|
||||
avformat_close_input(&fmt_ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (avcodec_open2(codec_ctx, codec, nullptr) < 0) {
|
||||
fprintf(stderr, "error: failed to open codec\n");
|
||||
avcodec_free_context(&codec_ctx);
|
||||
avformat_close_input(&fmt_ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Setup resampler: convert to 16-bit signed PCM, mono, 16000 Hz
|
||||
const enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_S16;
|
||||
const int out_sample_rate = WHISPER_SAMPLE_RATE;
|
||||
|
||||
AVChannelLayout out_ch_layout = AV_CHANNEL_LAYOUT_MONO;
|
||||
|
||||
SwrContext * swr_ctx = nullptr;
|
||||
if (swr_alloc_set_opts2(&swr_ctx, &out_ch_layout, out_sample_fmt, out_sample_rate,
|
||||
&codec_ctx->ch_layout, codec_ctx->sample_fmt, codec_ctx->sample_rate,
|
||||
0, nullptr) < 0) {
|
||||
fprintf(stderr, "error: failed to allocate swr context\n");
|
||||
avcodec_free_context(&codec_ctx);
|
||||
avformat_close_input(&fmt_ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (swr_init(swr_ctx) < 0) {
|
||||
fprintf(stderr, "error: failed to initialize swr context\n");
|
||||
swr_free(&swr_ctx);
|
||||
avcodec_free_context(&codec_ctx);
|
||||
avformat_close_input(&fmt_ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Decode and resample
|
||||
AVPacket * packet = av_packet_alloc();
|
||||
AVFrame * frame = av_frame_alloc();
|
||||
|
||||
// Buffer to collect resampled output
|
||||
std::vector<int16_t> pcm_data;
|
||||
|
||||
// Max output samples per swr_convert call
|
||||
const int max_out_samples = 16 * 1024;
|
||||
std::vector<int16_t> out_buffer(max_out_samples);
|
||||
|
||||
while (av_read_frame(fmt_ctx, packet) >= 0) {
|
||||
if (packet->stream_index != audio_stream_idx) {
|
||||
av_packet_unref(packet);
|
||||
continue;
|
||||
}
|
||||
|
||||
int ret = avcodec_send_packet(codec_ctx, packet);
|
||||
av_packet_unref(packet);
|
||||
|
||||
if (ret < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
while (ret >= 0) {
|
||||
ret = avcodec_receive_frame(codec_ctx, frame);
|
||||
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
|
||||
break;
|
||||
}
|
||||
if (ret < 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Resample
|
||||
int out_samples = av_rescale_rnd(swr_get_delay(swr_ctx, out_sample_rate) + frame->nb_samples,
|
||||
out_sample_rate, out_sample_rate, AV_ROUND_UP);
|
||||
if (out_samples > (int)out_buffer.size()) {
|
||||
out_buffer.resize(out_samples);
|
||||
}
|
||||
|
||||
const uint8_t * in_data[16] = {0};
|
||||
for (int p = 0; p < (int)codec_ctx->ch_layout.nb_channels && p < 16; p++) {
|
||||
in_data[p] = frame->data[p];
|
||||
}
|
||||
uint8_t * out_data[16] = {0};
|
||||
out_data[0] = (uint8_t *)out_buffer.data();
|
||||
|
||||
int got_samples = swr_convert(swr_ctx, out_data, out_samples, in_data, frame->nb_samples);
|
||||
if (got_samples > 0) {
|
||||
pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + got_samples);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Flush the decoder
|
||||
avcodec_send_packet(codec_ctx, nullptr);
|
||||
while (avcodec_receive_frame(codec_ctx, frame) >= 0) {
|
||||
int out_samples = av_rescale_rnd(swr_get_delay(swr_ctx, out_sample_rate) + frame->nb_samples,
|
||||
out_sample_rate, out_sample_rate, AV_ROUND_UP);
|
||||
if (out_samples > (int)out_buffer.size()) {
|
||||
out_buffer.resize(out_samples);
|
||||
}
|
||||
const uint8_t * in_data[16] = {0};
|
||||
for (int p = 0; p < (int)codec_ctx->ch_layout.nb_channels && p < 16; p++) {
|
||||
in_data[p] = frame->data[p];
|
||||
}
|
||||
uint8_t * out_data[16] = {0};
|
||||
out_data[0] = (uint8_t *)out_buffer.data();
|
||||
|
||||
int got_samples = swr_convert(swr_ctx, out_data, out_samples, in_data, frame->nb_samples);
|
||||
if (got_samples > 0) {
|
||||
pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + got_samples);
|
||||
}
|
||||
}
|
||||
|
||||
// Flush the resampler
|
||||
uint8_t * out_data[16] = {0};
|
||||
out_data[0] = (uint8_t *)out_buffer.data();
|
||||
int flush_samples = swr_convert(swr_ctx, out_data, max_out_samples, nullptr, 0);
|
||||
if (flush_samples > 0) {
|
||||
pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + flush_samples);
|
||||
}
|
||||
|
||||
// Build WAV output
|
||||
uint32_t data_size = pcm_data.size() * sizeof(int16_t);
|
||||
wav_data.resize(44 + data_size);
|
||||
|
||||
wav_header_write(wav_data.data(), 1, out_sample_rate, 16, data_size);
|
||||
memcpy(wav_data.data() + 44, pcm_data.data(), data_size);
|
||||
|
||||
// Cleanup
|
||||
av_frame_free(&frame);
|
||||
av_packet_free(&packet);
|
||||
swr_free(&swr_ctx);
|
||||
avcodec_free_context(&codec_ctx);
|
||||
avformat_close_input(&fmt_ctx);
|
||||
|
||||
return false; // success
|
||||
}
|
||||
|
||||
#endif // WHISPER_COMMON_FFMPEG
|
||||
|
|
|
|||
|
|
@ -78,7 +78,7 @@ add_test(NAME ${TEST_TARGET}
|
|||
-f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
|
||||
set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large")
|
||||
|
||||
if (WHISPER_FFMPEG)
|
||||
if (WHISPER_COMMON_FFMPEG)
|
||||
set(TEST_TARGET test-whisper-cli-tiny-mp3)
|
||||
# Check with reviewers: any way to check the output transcription via ctest (diff, ...)?
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
|
|
|
|||
Loading…
Reference in New Issue