This commit is contained in:
Daniel Bevenius 2026-04-24 09:06:22 +00:00 committed by GitHub
commit 03443a93f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 6007 additions and 1 deletions

View File

@ -634,6 +634,8 @@ jobs:
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DBUILD_SHARED_LIBS=ON
-DWHISPER_SDL2=${{ matrix.sdl2 }}
-DGGML_NATIVE=OFF
-DGGML_BMI2=OFF
- name: Build
run: |

View File

@ -179,12 +179,20 @@ set(WHISPER_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location
get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
install(TARGETS whisper LIBRARY PUBLIC_HEADER)
target_compile_definitions(whisper PRIVATE
WHISPER_VERSION="${PROJECT_VERSION}"
)
set_target_properties(parakeet PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/parakeet.h)
install(TARGETS parakeet LIBRARY PUBLIC_HEADER)
target_compile_definitions(parakeet PRIVATE
PARAKEET_VERSION="${PROJECT_VERSION}"
)
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
@ -210,6 +218,35 @@ configure_file(cmake/whisper.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
DESTINATION lib/pkgconfig)
# ---------------------------------------------------------------------------
# parakeet install rules: CMake package config, version file and pkg-config
# metadata. Mirrors the whisper install rules above.
# ---------------------------------------------------------------------------
set(PARAKEET_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(PARAKEET_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(PARAKEET_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
configure_package_config_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/parakeet-config.cmake.in
    ${CMAKE_CURRENT_BINARY_DIR}/parakeet-config.cmake
    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parakeet
    PATH_VARS
    PARAKEET_INCLUDE_INSTALL_DIR
    PARAKEET_LIB_INSTALL_DIR
    PARAKEET_BIN_INSTALL_DIR)
# BUGFIX: the version file must be named <pkg>-config-version.cmake for
# find_package(parakeet <version>) to locate it; a file named
# parakeet-version.cmake is ignored by CMake's package search.
write_basic_package_version_file(
    ${CMAKE_CURRENT_BINARY_DIR}/parakeet-config-version.cmake
    VERSION ${WHISPER_INSTALL_VERSION}
    COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/parakeet-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/parakeet-config-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parakeet)
configure_file(cmake/parakeet.pc.in
        "${CMAKE_CURRENT_BINARY_DIR}/parakeet.pc"
        @ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/parakeet.pc"
        DESTINATION lib/pkgconfig)
#
# programs, examples and tests
#

View File

@ -18,6 +18,6 @@ create_makefile "whisper" do |conf|
#{libs}: cmake-targets
cmake-targets:
#{"\t"}#{cmake} -S sources -B build -D BUILD_SHARED_LIBS=OFF -D CMAKE_ARCHIVE_OUTPUT_DIRECTORY=#{__dir__} -D CMAKE_POSITION_INDEPENDENT_CODE=ON #{options}
#{"\t"}#{cmake} --build build --config Release --target common whisper
#{"\t"}#{cmake} --build build --config Release --target common whisper parakeet
EOF
end

View File

@ -0,0 +1,30 @@
# parakeet-config.cmake.in — template for the installed CMake package config
# file. Processed by configure_package_config_file(); every @VAR@ placeholder
# is substituted at configure time (see PATH_VARS in CMakeLists.txt).

# Version/build metadata exposed to consumers of find_package(parakeet).
# NOTE(review): values come from the WHISPER_* variables — presumably the
# parakeet library is versioned together with whisper; confirm.
set(PARAKEET_VERSION @WHISPER_INSTALL_VERSION@)
set(PARAKEET_BUILD_COMMIT @WHISPER_BUILD_COMMIT@)
set(PARAKEET_BUILD_NUMBER @WHISPER_BUILD_NUMBER@)
set(PARAKEET_SHARED_LIB @BUILD_SHARED_LIBS@)
@PACKAGE_INIT@
# Resolve install locations; set_and_check() aborts configuration if the
# directory does not exist on the consumer's system.
set_and_check(PARAKEET_INCLUDE_DIR "@PACKAGE_PARAKEET_INCLUDE_INSTALL_DIR@")
set_and_check(PARAKEET_LIB_DIR "@PACKAGE_PARAKEET_LIB_INSTALL_DIR@")
set_and_check(PARAKEET_BIN_DIR "@PACKAGE_PARAKEET_BIN_INSTALL_DIR@")
# parakeet links against ggml; locate its installed package config first.
find_package(ggml REQUIRED HINTS ${PARAKEET_LIB_DIR}/cmake)
find_library(parakeet_LIBRARY parakeet
REQUIRED
HINTS ${PARAKEET_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
# Wrap the found library in an imported target carrying its usage
# requirements: public headers, transitive ggml link, and C++11.
add_library(parakeet UNKNOWN IMPORTED)
set_target_properties(parakeet
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${PARAKEET_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${parakeet_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON)
check_required_components(parakeet)

10
cmake/parakeet.pc.in Normal file
View File

@ -0,0 +1,10 @@
# pkg-config metadata template for libparakeet; @VAR@ placeholders are
# filled in by CMake's configure_file(... @ONLY).
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
# NOTE(review): hardcodes "lib" — diverges from CMAKE_INSTALL_LIBDIR on
# distros that install to lib64; confirm this matches the install rules.
libdir=${exec_prefix}/lib
includedir=${prefix}/include
Name: parakeet
Description: Port of NVIDIA's Parakeet model in C/C++
Version: @PROJECT_VERSION@
# Consumers must also link ggml, which parakeet depends on.
Libs: -L${libdir} -lggml -lggml-base -lparakeet
Cflags: -I${includedir}

View File

@ -107,6 +107,7 @@ else()
add_subdirectory(server)
add_subdirectory(quantize)
add_subdirectory(vad-speech-segments)
add_subdirectory(parakeet-cli)
if (WHISPER_SDL2)
add_subdirectory(stream)
add_subdirectory(command)

View File

@ -0,0 +1,8 @@
# Build the parakeet-cli example: a command-line transcriber for the
# Parakeet model (see parakeet-cli.cpp).
set(TARGET parakeet-cli)
add_executable(${TARGET} parakeet-cli.cpp)
include(DefaultTargetOptions)
# NOTE(review): FFMPEG_LIBRARIES and CMAKE_THREAD_LIBS_INIT are expected to
# be provided by the top-level build — confirm against the parent lists file.
target_link_libraries(${TARGET} PRIVATE common parakeet ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS ${TARGET} RUNTIME)

View File

@ -0,0 +1,112 @@
# whisper.cpp/examples/parakeet-cli
This is an example of using the [Parakeet] model in whisper.cpp.
### Download converted model
```console
$ hf download danbev/parakeet parakeet-tdt-0.6b-v3.bin --local-dir models
```
### Building
```console
$ cmake -B build -S .
$ cmake --build build --target parakeet-cli -j 12
```
### Usage
```console
$ ./build/bin/parakeet-cli --help
usage: ./build/bin/parakeet-cli [options] file0 file1 ...
supported audio formats: flac, mp3, ogg, wav
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-cl N, --chunk-length N [10000 ] chunk length in milliseconds
-lc N, --left-context N [10000 ] left context in milliseconds
-rc N, --right-context N [4960 ] right context in milliseconds
-m, --model FILE [models/ggml-parakeet-tdt-0.6b-v3.bin] model path
-f, --file FILE [ ] input audio file
-ng, --no-gpu [false ] disable GPU
-dev N, --device N [0 ] GPU device to use
-fa, --flash-attn [false ] enable flash attention
-nfa, --no-flash-attn [false ] disable flash attention
-ps, --print-segments [false ] print segment information
```
### Example
```console
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3.bin -f samples/jfk.wav
Processing audio (176000 samples, 11.00 seconds)
Processing audio: total_frames=1101, chunk_size=1101
parakeet_decode: starting decode with n_frames=138
And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
```
To print segment information:
```console
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3.bin -f samples/jfk.wav --print-segments
Processing audio (176000 samples, 11.00 seconds)
Processing audio: total_frames=1101, chunk_size=1101
parakeet_decode: starting decode with n_frames=138
And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
Segments (1):
Segment 0: [0 -> 1101] "And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country."
Tokens [38]:
[ 0] id= 1976 frame= 3 dur_idx= 4 dur_val= 4 p=0.9996 plog=-15.6206 t0= 24 t1= 56 word_start=true "▁And"
[ 1] id= 547 frame= 7 dur_idx= 4 dur_val= 4 p=0.9999 plog=-18.7922 t0= 56 t1= 88 word_start=true "▁so"
[ 2] id= 7877 frame= 11 dur_idx= 2 dur_val= 2 p=0.8451 plog=-14.5929 t0= 88 t1= 88 word_start=false ","
[ 3] id= 1103 frame= 13 dur_idx= 3 dur_val= 3 p=0.9996 plog=-15.6127 t0= 104 t1= 128 word_start=true "▁my"
[ 4] id= 309 frame= 16 dur_idx= 1 dur_val= 1 p=0.9912 plog=-11.9635 t0= 128 t1= 136 word_start=true "▁f"
[ 5] id= 530 frame= 17 dur_idx= 2 dur_val= 2 p=1.0000 plog=-13.5239 t0= 136 t1= 152 word_start=false "ell"
[ 6] id= 596 frame= 19 dur_idx= 3 dur_val= 3 p=1.0000 plog=-16.3120 t0= 152 t1= 176 word_start=false "ow"
[ 7] id= 3213 frame= 22 dur_idx= 4 dur_val= 4 p=0.9999 plog=-10.1462 t0= 176 t1= 208 word_start=true "▁Amer"
[ 8] id= 404 frame= 26 dur_idx= 4 dur_val= 4 p=1.0000 plog=-25.0910 t0= 208 t1= 240 word_start=false "ic"
[ 9] id= 667 frame= 30 dur_idx= 4 dur_val= 4 p=1.0000 plog=-27.1707 t0= 240 t1= 272 word_start=false "ans"
[10] id= 7877 frame= 37 dur_idx= 4 dur_val= 4 p=0.9094 plog=-16.3405 t0= 272 t1= 272 word_start=false ","
[11] id= 279 frame= 41 dur_idx= 4 dur_val= 4 p=0.9980 plog=-19.7244 t0= 328 t1= 360 word_start=true "▁a"
[12] id= 583 frame= 45 dur_idx= 4 dur_val= 4 p=1.0000 plog=-24.5312 t0= 360 t1= 392 word_start=false "sk"
[13] id= 1491 frame= 53 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.2991 t0= 424 t1= 456 word_start=true "▁not"
[14] id= 3470 frame= 65 dur_idx= 4 dur_val= 4 p=0.9995 plog=-16.7306 t0= 520 t1= 552 word_start=true "▁what"
[15] id= 3629 frame= 69 dur_idx= 2 dur_val= 2 p=0.8139 plog=-11.6486 t0= 552 t1= 568 word_start=true "▁your"
[16] id= 867 frame= 75 dur_idx= 1 dur_val= 1 p=0.9980 plog=-12.5265 t0= 600 t1= 608 word_start=true "▁co"
[17] id= 331 frame= 76 dur_idx= 2 dur_val= 2 p=1.0000 plog=-11.6697 t0= 608 t1= 624 word_start=false "un"
[18] id= 958 frame= 78 dur_idx= 2 dur_val= 2 p=1.0000 plog=-11.3621 t0= 624 t1= 640 word_start=false "tr"
[19] id= 7893 frame= 80 dur_idx= 2 dur_val= 2 p=1.0000 plog=-14.3245 t0= 640 t1= 656 word_start=false "y"
[20] id= 2059 frame= 82 dur_idx= 3 dur_val= 3 p=1.0000 plog=-17.7694 t0= 656 t1= 680 word_start=true "▁can"
[21] id= 458 frame= 85 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.2510 t0= 680 t1= 712 word_start=true "▁do"
[22] id= 509 frame= 89 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.0688 t0= 712 t1= 744 word_start=true "▁for"
[23] id= 1180 frame= 93 dur_idx= 4 dur_val= 4 p=0.9999 plog=-25.0567 t0= 744 t1= 776 word_start=true "▁you"
[24] id= 7877 frame= 98 dur_idx= 4 dur_val= 4 p=0.8820 plog=-14.2549 t0= 776 t1= 776 word_start=false ","
[25] id= 279 frame=102 dur_idx= 3 dur_val= 3 p=0.9992 plog=-16.8176 t0= 816 t1= 840 word_start=true "▁a"
[26] id= 583 frame=105 dur_idx= 4 dur_val= 4 p=1.0000 plog=-21.0352 t0= 840 t1= 872 word_start=false "sk"
[27] id= 3470 frame=109 dur_idx= 3 dur_val= 3 p=0.9999 plog=-15.4659 t0= 872 t1= 896 word_start=true "▁what"
[28] id= 1180 frame=112 dur_idx= 4 dur_val= 4 p=0.9997 plog=-17.6392 t0= 896 t1= 928 word_start=true "▁you"
[29] id= 2059 frame=116 dur_idx= 3 dur_val= 3 p=0.9999 plog=-15.5484 t0= 928 t1= 952 word_start=true "▁can"
[30] id= 458 frame=119 dur_idx= 2 dur_val= 2 p=1.0000 plog=-15.9953 t0= 952 t1= 968 word_start=true "▁do"
[31] id= 509 frame=121 dur_idx= 3 dur_val= 3 p=1.0000 plog=-15.9605 t0= 968 t1= 992 word_start=true "▁for"
[32] id= 3629 frame=124 dur_idx= 2 dur_val= 2 p=0.9994 plog=-12.2083 t0= 992 t1=1008 word_start=true "▁your"
[33] id= 867 frame=126 dur_idx= 2 dur_val= 2 p=0.9969 plog=-9.1252 t0=1008 t1=1024 word_start=true "▁co"
[34] id= 331 frame=128 dur_idx= 1 dur_val= 1 p=0.9999 plog=-12.6911 t0=1024 t1=1032 word_start=false "un"
[35] id= 958 frame=129 dur_idx= 1 dur_val= 1 p=1.0000 plog=-8.8885 t0=1032 t1=1040 word_start=false "tr"
[36] id= 7893 frame=130 dur_idx= 2 dur_val= 2 p=1.0000 plog=-14.1441 t0=1040 t1=1056 word_start=false "y"
[37] id= 7883 frame=132 dur_idx= 4 dur_val= 4 p=0.9567 plog=-11.5227 t0=1056 t1=1056 word_start=false "."
```
### Model conversion
Clone the original model from Hugging Face:
```console
$ git clone https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3
```
Convert the model:
```console
(venv) $ python models/convert-parakeet-to-ggml.py \
--model <path to cloned model> \
--use-f32 \
--out-dir models \
--out-name ggml-parakeet-tdt-0.6b-v3.bin
```
[Parakeet]: https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3

View File

@ -0,0 +1,220 @@
#include "parakeet.h"
#include "common-whisper.h"
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <cstring>
// command-line parameters for the parakeet-cli example
struct parakeet_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); // compute threads (at most 4)
int32_t chunk_length_ms = 10000; // chunk length in milliseconds
int32_t left_context_ms = 10000; // left context window in milliseconds
int32_t right_context_ms = 4960; // right context window in milliseconds
bool use_gpu = true; // disabled with -ng/--no-gpu
bool flash_attn = true; // flash attention (see -fa/-nfa flags)
int32_t gpu_device = 0; // GPU device index; also settable via PARAKEET_ARG_DEVICE env var
bool print_segments = false; // print per-segment/per-token details after decoding
std::string model = "models/ggml-parakeet-tdt-0.6b-v3.bin"; // default model path
std::vector<std::string> fname_inp = {}; // input audio files (positional args, "-", or -f)
};
static void parakeet_print_usage(int argc, char ** argv, const parakeet_params & params);

// Report a flag that is missing its required value and terminate the process.
// Declared as returning char * only so it can appear as the alternative in
// the ARGV_NEXT ternary below; it never actually returns (calls exit(1)).
static char * requires_value_error(const std::string & arg) {
fprintf(stderr, "error: argument %s requires value\n", arg.c_str());
exit(1);
}
static bool parakeet_params_parse(int argc, char ** argv, parakeet_params & params) {
if (const char * env_device = std::getenv("PARAKEET_ARG_DEVICE")) {
params.gpu_device = std::stoi(env_device);
}
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-"){
params.fname_inp.push_back(arg);
continue;
}
if (arg[0] != '-') {
params.fname_inp.push_back(arg);
continue;
}
if (arg == "-h" || arg == "--help") {
parakeet_print_usage(argc, argv, params);
exit(0);
}
#define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); }
else if (arg == "-cl" || arg == "--chunk-length") { params.chunk_length_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-lc" || arg == "--left-context") { params.left_context_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-rc" || arg == "--right-context") { params.right_context_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-m" || arg == "--model") { params.model = ARGV_NEXT; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(ARGV_NEXT); }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-dev" || arg == "--device") { params.gpu_device = std::stoi(ARGV_NEXT); }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = false; }
else if (arg == "-nfa" || arg == "--no-flash-attn") { params.flash_attn = false; }
else if (arg == "-ps" || arg == "--print-segments") { params.print_segments = true; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
parakeet_print_usage(argc, argv, params);
exit(1);
}
}
return true;
}
// Print the CLI usage/help text to stderr.
// The defaults shown in brackets are taken from params, so --help reflects
// any values already applied (e.g. the PARAKEET_ARG_DEVICE environment
// override picked up by parakeet_params_parse()).
static void parakeet_print_usage(int /*argc*/, char ** argv, const parakeet_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] file0 file1 ...\n", argv[0]);
fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, "  -cl N,     --chunk-length N    [%-7d] chunk length in milliseconds\n", params.chunk_length_ms);
fprintf(stderr, "  -lc N,     --left-context N    [%-7d] left context in milliseconds\n", params.left_context_ms);
fprintf(stderr, "  -rc N,     --right-context N   [%-7d] right context in milliseconds\n", params.right_context_ms);
fprintf(stderr, "  -m,        --model FILE        [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "  -f,        --file FILE         [%-7s] input audio file\n", "");
fprintf(stderr, "  -ng,       --no-gpu            [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, "  -dev N,    --device N          [%-7d] GPU device to use\n", params.gpu_device);
fprintf(stderr, "  -fa,       --flash-attn        [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, "  -nfa,      --no-flash-attn     [%-7s] disable flash attention\n", !params.flash_attn ? "true" : "false");
fprintf(stderr, "  -ps,       --print-segments    [%-7s] print segment information\n", params.print_segments ? "true" : "false");
fprintf(stderr, "\n");
}
// Callback invoked for every newly predicted token: converts the token id to
// display text and streams it to stdout immediately.
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
    (void) state;     // unused
    (void) user_data; // unused — main() registers this callback with nullptr
    // NOTE(review): function-local static, so the "first token" state is
    // shared across all files processed in one run — the first token of the
    // second and later input files is not treated as first. Confirm whether
    // that is intended.
    static bool is_first = true;
    const char * token_str = parakeet_token_to_str(ctx, token_data->id);
    char text_buf[256];
    parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));
    printf("%s", text_buf);
    fflush(stdout);
    is_first = false;
}
// Entry point: parse command-line arguments, then transcribe each input
// audio file with the Parakeet model, streaming tokens to stdout via
// token_callback as they are decoded.
int main(int argc, char ** argv) {
// make all compiled-in ggml backends (CPU/GPU) available
ggml_backend_load_all();
parakeet_params params;
if (parakeet_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.fname_inp.empty()) {
fprintf(stderr, "error: no input files specified\n");
parakeet_print_usage(argc, argv, params);
return 1;
}
// Process each input file
// NOTE(review): the model is re-loaded from disk on every loop iteration;
// hoisting the load above the loop would avoid redundant work when several
// files are given.
for (const auto & fname : params.fname_inp) {
fprintf(stderr, "\nProcessing file: %s\n", fname.c_str());
// decode the audio file into mono (pcmf32) and per-channel (pcmf32s) PCM
std::vector<float> pcmf32;
std::vector<std::vector<float>> pcmf32s;
if (!read_audio_data(fname.c_str(), pcmf32, pcmf32s, false)) {
fprintf(stderr, "error: failed to read audio file '%s'\n", fname.c_str());
continue;
}
if (pcmf32.empty()) {
fprintf(stderr, "error: no audio data in file '%s'\n", fname.c_str());
continue;
}
fprintf(stderr, "Loading Parakeet model from: %s\n", params.model.c_str());
struct parakeet_context_params ctx_params = parakeet_context_default_params();
ctx_params.use_gpu = params.use_gpu;
ctx_params.flash_attn = params.flash_attn;
ctx_params.gpu_device = params.gpu_device;
struct parakeet_context * pctx = parakeet_init_from_file_with_params(params.model.c_str(), ctx_params);
if (pctx == nullptr) {
// a missing/corrupt model is fatal, unlike per-file audio errors above
fprintf(stderr, "error: failed to load Parakeet model from '%s'\n", params.model.c_str());
return 1;
}
fprintf(stderr, "Successfully loaded Parakeet model\n");
fprintf(stderr, "Processing audio (%zu samples, %.2f seconds)\n",
pcmf32.size(), (float)pcmf32.size() / PARAKEET_SAMPLE_RATE);
struct parakeet_full_params full_params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
full_params.n_threads = params.n_threads;
full_params.chunk_length_ms = params.chunk_length_ms;
full_params.left_context_ms = params.left_context_ms;
full_params.right_context_ms = params.right_context_ms;
// stream decoded tokens to stdout as soon as they are produced
full_params.new_token_callback = token_callback;
full_params.new_token_callback_user_data = nullptr;
// if the whole clip fits within the model's audio context, disable
// chunked processing (chunk_length_ms = 0)
const int mel_frames = (int)(pcmf32.size() / PARAKEET_HOP_LENGTH);
if (mel_frames <= parakeet_n_audio_ctx(pctx)) {
full_params.chunk_length_ms = 0;
}
int ret = parakeet_full(pctx, full_params, pcmf32.data(), pcmf32.size());
if (ret != 0) {
fprintf(stderr, "error: failed to process audio file '%s'\n", fname.c_str());
parakeet_free(pctx);
continue;
}
printf("\n");
if (params.print_segments) {
// dump segment boundaries and detailed per-token data to stderr
const int n_segments = parakeet_full_n_segments(pctx);
fprintf(stderr, "\nSegments (%d):\n", n_segments);
for (int i = 0; i < n_segments; i++) {
const char * text = parakeet_full_get_segment_text(pctx, i);
const int64_t t0 = parakeet_full_get_segment_t0(pctx, i);
const int64_t t1 = parakeet_full_get_segment_t1(pctx, i);
const int n_tokens = parakeet_full_n_tokens(pctx, i);
fprintf(stderr, "Segment %d: [%lld -> %lld] \"%s\"\n", i, (long long)t0, (long long)t1, text);
fprintf(stderr, "Tokens [%d]:\n", n_tokens);
for (int j = 0; j < n_tokens; j++) {
parakeet_token_data token_data = parakeet_full_get_token_data(pctx, i, j);
const char * token_str = parakeet_token_to_str(pctx, token_data.id);
fprintf(stderr, "  [%2d] id=%5d frame=%3d dur_idx=%2d dur_val=%2d p=%.4f plog=%.4f t0=%4lld t1=%4lld word_start=%s \"%s\"\n",
j,
token_data.id,
token_data.frame_index,
token_data.duration_idx,
token_data.duration_value,
token_data.p,
token_data.plog,
(long long)token_data.t0,
(long long)token_data.t1,
token_data.is_word_start ? "true": "false",
token_str);
}
}
}
parakeet_free(pctx);
}
return 0;
}

383
include/parakeet.h Normal file
View File

@ -0,0 +1,383 @@
#ifndef PARAKEET_H
#define PARAKEET_H
#include "ggml.h"
#include "ggml-cpu.h"
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef __GNUC__
# define PARAKEET_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
# define PARAKEET_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
# define PARAKEET_DEPRECATED(func, hint) func
#endif
#ifdef PARAKEET_SHARED
# ifdef _WIN32
# ifdef PARAKEET_BUILD
# define PARAKEET_API __declspec(dllexport)
# else
# define PARAKEET_API __declspec(dllimport)
# endif
# else
# define PARAKEET_API __attribute__ ((visibility ("default")))
# endif
#else
# define PARAKEET_API
#endif
#define PARAKEET_SAMPLE_RATE 16000
#define PARAKEET_HOP_LENGTH 160
#ifdef __cplusplus
extern "C" {
#endif
struct parakeet_context;
struct parakeet_state;
struct parakeet_full_params;
typedef int32_t parakeet_pos;
typedef int32_t parakeet_token;
typedef int32_t parakeet_seq_id;
struct parakeet_context_params {
bool use_gpu;
bool flash_attn;
int gpu_device; // CUDA device
};
// Per-token result data produced during decoding.
typedef struct parakeet_token_data {
parakeet_token id; // the BPE subword ID (0-8191)
int duration_idx; // index into the model's durations array
int duration_value; // actual duration value (presumably in encoder frames — TODO confirm)
int frame_index; // frame at which the token was emitted — assumed encoder frame; confirm
float p; // token probability
float plog; // log-probability (NOTE(review): magnitudes suggest an accumulated score, not log(p) — confirm)
int64_t t0; // token start time (units not established here — confirm)
int64_t t1; // token end time (units not established here — confirm)
bool is_word_start; // true if this token begins a new word
} parakeet_token_data;
typedef struct parakeet_model_loader {
void * context;
size_t (*read)(void * ctx, void * output, size_t read_size);
bool (*eof)(void * ctx);
void (*close)(void * ctx);
} parakeet_model_loader;
PARAKEET_API const char * parakeet_version(void);
// Various functions for loading a ggml parakeet model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
PARAKEET_API struct parakeet_context * parakeet_init_from_file_with_params (const char * path_model, struct parakeet_context_params params);
PARAKEET_API struct parakeet_context * parakeet_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct parakeet_context_params params);
PARAKEET_API struct parakeet_context * parakeet_init_with_params (struct parakeet_model_loader * loader, struct parakeet_context_params params);
// These are the same as the above, but the internal state of the context is not allocated automatically
// It is the responsibility of the caller to allocate the state using parakeet_init_state() (#523)
PARAKEET_API struct parakeet_context * parakeet_init_from_file_with_params_no_state (const char * path_model, struct parakeet_context_params params);
PARAKEET_API struct parakeet_context * parakeet_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct parakeet_context_params params);
PARAKEET_API struct parakeet_context * parakeet_init_with_params_no_state (struct parakeet_model_loader * loader, struct parakeet_context_params params);
PARAKEET_API struct parakeet_state * parakeet_init_state(struct parakeet_context * ctx);
// Frees all allocated memory
PARAKEET_API void parakeet_free (struct parakeet_context * ctx);
PARAKEET_API void parakeet_free_state(struct parakeet_state * state);
PARAKEET_API void parakeet_free_params(struct parakeet_full_params * params);
PARAKEET_API void parakeet_free_context_params(struct parakeet_context_params * params);
// Convert RAW PCM audio to log mel spectrogram.
// The resulting spectrogram is stored inside the default state of the provided parakeet context.
// Returns 0 on success
PARAKEET_API int parakeet_pcm_to_mel(
struct parakeet_context * ctx,
const float * samples,
int n_samples,
int n_threads);
PARAKEET_API int parakeet_pcm_to_mel_with_state(
struct parakeet_context * ctx,
struct parakeet_state * state,
const float * samples,
int n_samples,
int n_threads);
// This can be used to set a custom log mel spectrogram inside the default state of the provided parakeet context.
// Use this instead of parakeet_pcm_to_mel() if you want to provide your own log mel spectrogram.
// n_mel must be 80
// Returns 0 on success
PARAKEET_API int parakeet_set_mel(
struct parakeet_context * ctx,
const float * data,
int n_len,
int n_mel);
PARAKEET_API int parakeet_set_mel_with_state(
struct parakeet_context * ctx,
struct parakeet_state * state,
const float * data,
int n_len,
int n_mel);
// Run the Parakeet encoder on the log mel spectrogram stored inside the default state in the provided parakeet context.
// Make sure to call parakeet_pcm_to_mel() or parakeet_set_mel() first.
// offset can be used to specify the offset of the first frame in the spectrogram.
// Returns 0 on success
PARAKEET_API int parakeet_encode(
struct parakeet_context * ctx,
int offset,
int n_threads);
PARAKEET_API int parakeet_encode_with_state(
struct parakeet_context * ctx,
struct parakeet_state * state,
int offset,
int n_threads);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
PARAKEET_API int parakeet_tokenize(
struct parakeet_context * ctx,
const char * text,
parakeet_token * tokens,
int n_max_tokens);
// Return the number of tokens in the provided text
// Equivalent to: -parakeet_tokenize(ctx, text, NULL, 0)
int parakeet_token_count(struct parakeet_context * ctx, const char * text);
PARAKEET_API int parakeet_n_len (struct parakeet_context * ctx); // mel length
PARAKEET_API int parakeet_n_len_from_state(struct parakeet_state * state); // mel length
PARAKEET_API int parakeet_n_vocab (struct parakeet_context * ctx);
PARAKEET_API int parakeet_n_audio_ctx (struct parakeet_context * ctx);
PARAKEET_API int parakeet_model_n_vocab (struct parakeet_context * ctx);
PARAKEET_API int parakeet_model_n_audio_ctx (struct parakeet_context * ctx);
PARAKEET_API int parakeet_model_n_audio_state(struct parakeet_context * ctx);
PARAKEET_API int parakeet_model_n_audio_head (struct parakeet_context * ctx);
PARAKEET_API int parakeet_model_n_audio_layer(struct parakeet_context * ctx);
PARAKEET_API int parakeet_model_n_mels (struct parakeet_context * ctx);
PARAKEET_API int parakeet_model_ftype (struct parakeet_context * ctx);
// Token logits obtained from the last call to parakeet_full/parakeet_chunk
// The logits for the last token are stored in the last row
// Rows: n_tokens
// Cols: n_vocab
PARAKEET_API float * parakeet_get_logits (struct parakeet_context * ctx);
PARAKEET_API float * parakeet_get_logits_from_state(struct parakeet_state * state);
// Token Id -> String. Uses the vocabulary in the provided context
PARAKEET_API const char * parakeet_token_to_str(struct parakeet_context * ctx, parakeet_token token);
PARAKEET_API int parakeet_token_to_text(const char * token_str, bool is_first, char * output, int max_len);
// Special tokens
PARAKEET_API parakeet_token parakeet_token_blank (struct parakeet_context * ctx);
PARAKEET_API parakeet_token parakeet_token_unk (struct parakeet_context * ctx);
PARAKEET_API parakeet_token parakeet_token_bos(struct parakeet_context * ctx);
// Performance information from the default state.
struct parakeet_timings {
float sample_ms;
float encode_ms;
float decode_ms;
float batchd_ms;
float prompt_ms;
};
PARAKEET_API struct parakeet_timings * parakeet_get_timings(struct parakeet_context * ctx);
PARAKEET_API void parakeet_print_timings(struct parakeet_context * ctx);
PARAKEET_API void parakeet_reset_timings(struct parakeet_context * ctx);
// Print system information
PARAKEET_API const char * parakeet_print_system_info(void);
// Available sampling strategies
enum parakeet_sampling_strategy {
PARAKEET_SAMPLING_GREEDY,
};
// Token callback.
// Called for each new predicted token.
// Use the parakeet_full_...() functions to obtain the text segments
typedef void (*parakeet_new_token_callback)(
struct parakeet_context * ctx,
struct parakeet_state * state,
const parakeet_token_data * token_data,
void * user_data);
// Text segment callback
// Called on every newly generated text segment
// Use the parakeet_full_...() functions to obtain the text segments
typedef void (*parakeet_new_segment_callback)(struct parakeet_context * ctx, struct parakeet_state * state, int n_new, void * user_data);
// Progress callback
typedef void (*parakeet_progress_callback)(struct parakeet_context * ctx, struct parakeet_state * state, int progress, void * user_data);
// Encoder begin callback
// If not NULL, called before the encoder starts
// If it returns false, the computation is aborted
typedef bool (*parakeet_encoder_begin_callback)(struct parakeet_context * ctx, struct parakeet_state * state, void * user_data);
// Parameters for the parakeet_full() function
// If you change the order or add new parameters, make sure to update the default values in parakeet.cpp:
// parakeet_full_default_params()
struct parakeet_full_params {
enum parakeet_sampling_strategy strategy;
int n_threads;
int offset_ms; // start offset in ms
int duration_ms; // audio duration to process in ms
bool no_context; // do not use past transcription (if any) as context
// [EXPERIMENTAL] speed-up techniques
int audio_ctx; // overwrite the audio context size (0 = use default)
int chunk_length_ms; // length of each chunk in ms
int left_context_ms; // left context in ms
int right_context_ms; // right context in ms
// called for every newly generated text segment
parakeet_new_segment_callback new_segment_callback;
void * new_segment_callback_user_data;
// called for every newly generated token
parakeet_new_token_callback new_token_callback;
void * new_token_callback_user_data;
// called on each progress update
parakeet_progress_callback progress_callback;
void * progress_callback_user_data;
// called each time before the encoder starts
parakeet_encoder_begin_callback encoder_begin_callback;
void * encoder_begin_callback_user_data;
// called each time before ggml computation starts
ggml_abort_callback abort_callback;
void * abort_callback_user_data;
};
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see parakeet_free_context_params() & parakeet_free_params()
PARAKEET_API struct parakeet_context_params * parakeet_context_default_params_by_ref(void);
PARAKEET_API struct parakeet_context_params parakeet_context_default_params (void);
PARAKEET_API struct parakeet_full_params * parakeet_full_default_params_by_ref(enum parakeet_sampling_strategy strategy);
PARAKEET_API struct parakeet_full_params parakeet_full_default_params (enum parakeet_sampling_strategy strategy);
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Not thread safe for same context
PARAKEET_API int parakeet_full(
struct parakeet_context * ctx,
struct parakeet_full_params params,
const float * samples,
int n_samples);
PARAKEET_API int parakeet_full_with_state(
struct parakeet_context * ctx,
struct parakeet_state * state,
struct parakeet_full_params params,
const float * samples,
int n_samples);
// Split the input audio in chunks and process each chunk separately using parakeet_full_with_state()
// Result is stored in the default state of the context
// Not thread safe if executed in parallel on the same context.
PARAKEET_API int parakeet_full_parallel(
struct parakeet_context * ctx,
struct parakeet_full_params params,
const float * samples,
int n_samples,
int n_processors);
// Process a single chunk of audio data that fits within the model's audio context window.
// This is more efficient than parakeet_full() for short audio clips.
PARAKEET_API int parakeet_chunk(
struct parakeet_context * ctx,
struct parakeet_state * state,
struct parakeet_full_params params,
const float * samples,
int n_samples);
// Initialize streaming state for a new stream.
PARAKEET_API int parakeet_stream_init(
struct parakeet_context * ctx,
struct parakeet_state * state,
struct parakeet_full_params params);
// Push audio samples in streaming mode. Internally this function will arrange
// the samples in a buffer with a left context, a center chunk, and a
// right context. The encoder sees the complete buffer, which enables it
// to get boundary context for the target/center audio chunk. This avoids hard
// cutoffs at the chunk boundaries. The joint network then only sees the
// center chunk; this function internally handles the context windowing.
PARAKEET_API int parakeet_stream_push(
struct parakeet_context * ctx,
struct parakeet_state * state,
const float * samples,
int n_samples);
// Flush the final partial chunk at end-of-stream.
PARAKEET_API int parakeet_stream_flush(
struct parakeet_context * ctx,
struct parakeet_state * state);
// Number of generated text segments
PARAKEET_API int parakeet_full_n_segments (struct parakeet_context * ctx);
PARAKEET_API int parakeet_full_n_segments_from_state(struct parakeet_state * state);
// Get the start and end time of the specified segment
PARAKEET_API int64_t parakeet_full_get_segment_t0 (struct parakeet_context * ctx, int i_segment);
PARAKEET_API int64_t parakeet_full_get_segment_t0_from_state(struct parakeet_state * state, int i_segment);
PARAKEET_API int64_t parakeet_full_get_segment_t1 (struct parakeet_context * ctx, int i_segment);
PARAKEET_API int64_t parakeet_full_get_segment_t1_from_state(struct parakeet_state * state, int i_segment);
// Get the text of the specified segment
PARAKEET_API const char * parakeet_full_get_segment_text (struct parakeet_context * ctx, int i_segment);
PARAKEET_API const char * parakeet_full_get_segment_text_from_state(struct parakeet_state * state, int i_segment);
// Get number of tokens in the specified segment
PARAKEET_API int parakeet_full_n_tokens (struct parakeet_context * ctx, int i_segment);
PARAKEET_API int parakeet_full_n_tokens_from_state(struct parakeet_state * state, int i_segment);
// Get the token text of the specified token in the specified segment
PARAKEET_API const char * parakeet_full_get_token_text (struct parakeet_context * ctx, int i_segment, int i_token);
PARAKEET_API const char * parakeet_full_get_token_text_from_state(struct parakeet_context * ctx, struct parakeet_state * state, int i_segment, int i_token);
// Get the token id of the specified token in the specified segment
PARAKEET_API parakeet_token parakeet_full_get_token_id (struct parakeet_context * ctx, int i_segment, int i_token);
PARAKEET_API parakeet_token parakeet_full_get_token_id_from_state(struct parakeet_state * state, int i_segment, int i_token);
// Get token data for the specified token in the specified segment
PARAKEET_API parakeet_token_data parakeet_full_get_token_data (struct parakeet_context * ctx, int i_segment, int i_token);
PARAKEET_API parakeet_token_data parakeet_full_get_token_data_from_state(struct parakeet_state * state, int i_segment, int i_token);
// Get the probability of the specified token in the specified segment
PARAKEET_API float parakeet_full_get_token_p (struct parakeet_context * ctx, int i_segment, int i_token);
PARAKEET_API float parakeet_full_get_token_p_from_state(struct parakeet_state * state, int i_segment, int i_token);
// Control logging output; default behavior is to print to stderr
PARAKEET_API void parakeet_log_set(ggml_log_callback log_callback, void * user_data);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,331 @@
#!/usr/bin/env python3
# Convert Parakeet TDT model from NeMo format to ggml format
#
# Usage: python convert-parakeet-to-ggml.py --model parakeet-model.nemo --out-dir output-dir [--use-f32]
#
# The NeMo file is a tar archive containing:
# - model_weights.ckpt (PyTorch checkpoint)
# - model_config.yaml (model configuration)
# - tokenizer files
#
# This script extracts the NeMo archive, loads the model weights and configuration,
# and saves them in ggml format compatible with whisper.cpp.
#
import torch
import argparse
import io
import os
import sys
import struct
import tarfile
import tempfile
import shutil
import yaml
import numpy as np
from pathlib import Path
from typing import Optional
def hz_to_mel(freq):
    """Convert a frequency in Hz to the mel scale (2595 * log10 formula)."""
    ratio = 1.0 + freq / 700.0
    return 2595.0 * np.log10(ratio)
def mel_to_hz(mel):
    """Inverse of hz_to_mel: convert a mel-scale value back to Hz."""
    scaled = mel / 2595.0
    return 700.0 * (np.power(10.0, scaled) - 1.0)
def create_relative_positional_encoding(d_model: int, n_pos_max_len: int) -> np.ndarray:
    """Build the sinusoidal relative positional-encoding table.

    Positions run from +(max_len // 2) at row 0 down to -(max_len // 2) at the
    last row, where max_len = 2 * n_pos_max_len - 1. Even feature columns hold
    sin(position * div_term), odd columns hold cos(position * div_term).

    Args:
        d_model:       encoder feature dimension (number of columns).
        n_pos_max_len: maximum sequence length; the table covers both
                       positive and negative relative offsets.

    Returns:
        float32 array of shape (2 * n_pos_max_len - 1, d_model).

    Fixes: the original per-element loop raised IndexError for odd d_model
    (it unconditionally wrote pe[idx, i + 1]); this version handles odd
    d_model and is vectorized with numpy.
    """
    max_len = n_pos_max_len * 2 - 1
    log_10000 = np.log(10000.0)

    # Relative positions: +max_len//2 ... 0 ... -max_len//2.
    positions = (max_len // 2) - np.arange(max_len, dtype=np.float64)

    # One frequency per (sin, cos) column pair: 10000^(-i / d_model).
    even_idx = np.arange(0, d_model, 2, dtype=np.float64)
    div_term = np.exp(-even_idx * log_10000 / float(d_model))

    angles = positions[:, None] * div_term[None, :]

    pe = np.zeros((max_len, d_model), dtype=np.float32)
    pe[:, 0::2] = np.sin(angles)
    # For odd d_model there is one fewer cos column than sin column.
    pe[:, 1::2] = np.cos(angles)[:, : d_model // 2]
    return pe
def extract_nemo_archive(nemo_path, extract_dir):
    """Extract a .nemo tar archive into extract_dir.

    Fixes: the original called tar.extractall() unchecked, which allows a
    crafted archive member (e.g. "../evil" or an absolute path) to write
    outside extract_dir. All member paths are validated first; a ValueError
    is raised before anything is extracted if any member would escape.
    """
    print(f"Extracting {nemo_path} to {extract_dir}")
    with tarfile.open(nemo_path, 'r') as tar:
        base = os.path.realpath(extract_dir)
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(extract_dir, member.name))
            if target != base and not target.startswith(base + os.sep):
                raise ValueError(f"Unsafe member path in archive: {member.name}")
        tar.extractall(path=extract_dir)
    print("Extraction complete")
def load_model_config(config_path):
    """Read the NeMo model_config.yaml and return it as a Python dict."""
    with open(config_path, 'r') as config_file:
        return yaml.safe_load(config_file)
def load_tokenizer(extract_dir, config):
    """Locate the tokenizer files in the extracted archive and load the vocab.

    Scans extract_dir for '*_tokenizer.model' and '*tokenizer.vocab' files.
    The vocab is read line by line; the first tab-separated column of each
    line is the token text and the line number is its id.

    Returns:
        dict mapping UTF-8 encoded token bytes -> integer token id.

    Raises:
        FileNotFoundError if either tokenizer file is missing.
    """
    model_path = None
    vocab_path = None
    for entry in os.listdir(extract_dir):
        full = os.path.join(extract_dir, entry)
        if entry.endswith('_tokenizer.model'):
            model_path = full
        elif entry.endswith('tokenizer.vocab'):
            vocab_path = full

    if not model_path:
        raise FileNotFoundError("Tokenizer model file not found")
    if not vocab_path:
        raise FileNotFoundError("Tokenizer vocab file not found")

    tokens = {}
    with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
        for idx, line in enumerate(vocab_file):
            columns = line.strip().split('\t')
            if columns:
                tokens[columns[0].encode('utf-8')] = idx

    print(f"Loaded {len(tokens)} tokens from {os.path.basename(vocab_path)}")
    if len(tokens) != 8192:
        print(f"WARNING: Expected 8192 tokens, got {len(tokens)}")
    return tokens
def write_tensor(fout, name, data, use_f16=True, force_f32=False):
    """Serialize one tensor to the ggml file.

    Layout: (n_dims, name_length, ftype) as three int32, then the dims in
    reverse order (ggml ordering), the UTF-8 name bytes, and the raw data.
    ftype is 1 for float16 payloads and 0 for float32. 1-D tensors and any
    tensor whose name contains 'bias' or 'norm' are always stored as float32.
    """
    # NeMo stores pre-encoder conv biases as 1-D; expand to 4-D so ggml can
    # broadcast them over conv outputs.
    if 'pre_encode.conv' in name and 'bias' in name and len(data.shape) == 1:
        data = data.reshape(1, -1, 1, 1)
        print(f" Reshaped conv bias {name} to {data.shape}")

    n_dims = len(data.shape)

    # Pick the storage dtype for this tensor.
    keep_f32 = (force_f32 or not use_f16 or
                n_dims < 2 or 'bias' in name or 'norm' in name)
    if keep_f32:
        data = data.astype(np.float32)
        ftype = 0
    else:
        data = data.astype(np.float16)
        ftype = 1

    dims_reversed = [data.shape[n_dims - 1 - i] for i in range(n_dims)]
    print(f"Processing: {name} {list(data.shape)}, dtype: {data.dtype}, n_dims: {n_dims}, reversed: {dims_reversed}")

    name_bytes = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
    for dim in dims_reversed:
        fout.write(struct.pack("i", dim))
    fout.write(name_bytes)
    data.tofile(fout)
def convert_parakeet_to_ggml(nemo_path, output_dir, use_f16=True, out_name=None):
    # Convert a NeMo .nemo archive to a single ggml binary.
    #
    # nemo_path  : path to the .nemo tar archive
    # output_dir : directory for the output file (created if missing)
    # use_f16    : store eligible 2-D weights as float16 (see write_tensor)
    # out_name   : optional output file name; defaults to ggml-model.bin,
    #              or ggml-model-f32.bin when use_f16 is False
    #
    # Raises ValueError if the mel filterbank or window tensor is missing,
    # or if the TDT duration count does not match the config.
    nemo_path = Path(nemo_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Create temporary directory for extraction
    with tempfile.TemporaryDirectory() as temp_dir:
        extract_nemo_archive(nemo_path, temp_dir)
        config_path = os.path.join(temp_dir, 'model_config.yaml')
        config = load_model_config(config_path)
        print("Model configuration:")
        print(f" Sample rate: {config['sample_rate']}")
        print(f" Encoder layers: {config['encoder']['n_layers']}")
        print(f" Encoder d_model: {config['encoder']['d_model']}")
        print(f" Mel features: {config['preprocessor']['features']}")
        weights_path = os.path.join(temp_dir, 'model_weights.ckpt')
        print(f"\nLoading model weights from {weights_path}")
        checkpoint = torch.load(weights_path, map_location='cpu')
        # Extract state dict (some checkpoints nest it under 'state_dict')
        if 'state_dict' in checkpoint:
            state_dict = checkpoint['state_dict']
        else:
            state_dict = checkpoint
        print(f"Loaded {len(state_dict)} tensors")
        # Load tokenizer
        print("\nLoading tokenizer...")
        tokens = load_tokenizer(temp_dir, config)
        print(f"Loaded {len(tokens)} tokens")
        # Prepare hyperparameters for the Parakeet ggml format.
        # NOTE(review): n_audio_ctx is hard-coded to 5000 rather than read
        # from the config - confirm this matches the model.
        hparams = {
            'n_audio_ctx': 5000,
            'n_audio_state': config['encoder']['d_model'],
            'n_audio_head': config['encoder']['n_heads'],
            'n_audio_layer': config['encoder']['n_layers'],
            'n_mels': config['preprocessor']['features'],
            'n_fft': config['preprocessor']['n_fft'],
            'subsampling_factor': config['encoder']['subsampling_factor'],
            'n_subsampling_channels': config['encoder']['subsampling_conv_channels'],
            'n_pos_max_len': config['encoder']['pos_emb_max_len'],
            'n_pred_dim': config['decoder']['prednet']['pred_hidden'],
            'n_pred_layers': config['decoder']['prednet']['pred_rnn_layers'],
            'n_vocab': config['decoder']['vocab_size'],
            'n_tdt_durations': config['model_defaults']['num_tdt_durations'],
            'n_max_tokens': config['decoding']['greedy']['max_symbols'],
        }
        print("\nGGML hyperparameters:")
        for key, value in hparams.items():
            print(f" {key}: {value}")
        # The positional-encoding table is generated here, not loaded from
        # the checkpoint; it is written last (see write_tensor call below).
        pe = create_relative_positional_encoding(hparams['n_audio_state'], hparams['n_pos_max_len'])
        print(f"\nGenerated positional encoding tensor 'encoder.pe' with shape {pe.shape}")
        # Create output file
        if out_name:
            fname_out = output_dir / out_name
        else:
            fname_out = output_dir / ("ggml-model-f32.bin" if not use_f16 else "ggml-model.bin")
        print(f"\nWriting to {fname_out}")
        with open(fname_out, 'wb') as fout:
            # Write magic number
            fout.write(struct.pack("i", 0x67676d6c)) # 'ggml' in hex
            # Write hyperparameters (field order is the file-format contract;
            # the loader must read in exactly this order)
            fout.write(struct.pack("i", hparams['n_vocab']))
            fout.write(struct.pack("i", hparams['n_audio_ctx']))
            fout.write(struct.pack("i", hparams['n_audio_state']))
            fout.write(struct.pack("i", hparams['n_audio_head']))
            fout.write(struct.pack("i", hparams['n_audio_layer']))
            fout.write(struct.pack("i", hparams['n_mels']))
            fout.write(struct.pack("i", 1 if use_f16 else 0))
            fout.write(struct.pack("i", hparams['n_fft']))
            fout.write(struct.pack("i", hparams['subsampling_factor']))
            fout.write(struct.pack("i", hparams['n_subsampling_channels']))
            fout.write(struct.pack("i", hparams['n_pos_max_len']))
            fout.write(struct.pack("i", hparams['n_pred_dim']))
            fout.write(struct.pack("i", hparams['n_pred_layers']))
            fout.write(struct.pack("i", hparams['n_tdt_durations']))
            fout.write(struct.pack("i", hparams['n_max_tokens']))
            # Extract mel filterbank from model
            fb_key = None
            for key in state_dict.keys():
                if 'featurizer.fb' in key or 'filterbank' in key.lower():
                    fb_key = key
                    break
            if not fb_key:
                print("\nERROR: Mel filterbank not found in model!")
                print("Expected tensor with 'featurizer.fb' or 'filterbank' in name")
                print("\nAvailable preprocessor tensors:")
                for key in sorted(state_dict.keys()):
                    if 'preprocessor' in key or 'featurizer' in key:
                        print(f" {key}: {state_dict[key].shape}")
                raise ValueError("Mel filterbank tensor not found in model")
            print(f"\nUsing model's mel filterbank from: {fb_key}")
            mel_filters = state_dict[fb_key].squeeze().numpy().astype(np.float32)
            print(f" Filterbank shape: {mel_filters.shape}")
            print(f" Filterbank min/max values: {mel_filters.min():.6f} / {mel_filters.max():.6f}")
            print(f" Filterbank non-zero elements: {np.count_nonzero(mel_filters)} / {mel_filters.size}")
            print(f" First row sum: {mel_filters[0].sum():.6f}")
            if len(mel_filters.shape) != 2:
                raise ValueError(f"Expected 2D filterbank, got shape {mel_filters.shape}")
            n_mels, n_freqs = mel_filters.shape
            fout.write(struct.pack("i", n_mels)) # n_mel
            fout.write(struct.pack("i", n_freqs)) # n_fb (frequency bins)
            # Write mel filterbank
            for i in range(n_mels):
                for j in range(n_freqs):
                    fout.write(struct.pack("f", mel_filters[i, j]))
            # Extract window function from model
            window_key = None
            for key in state_dict.keys():
                # NOTE(review): `or` binds looser than `and`, so this matches
                # 'featurizer.window' OR ('preproc' AND 'window') - appears
                # intended, but worth confirming.
                if 'featurizer.window' in key or 'preproc' in key and 'window' in key:
                    window_key = key
                    break
            if not window_key:
                print("\nERROR: Window function not found in model!")
                print("Expected tensor with 'featurizer.window' in name")
                raise ValueError("Window function tensor not found in model")
            print(f"\nUsing model's window function from: {window_key}")
            window = state_dict[window_key].squeeze().numpy().astype(np.float32)
            print(f" Window shape: {window.shape}")
            print(f" Window min/max values: {window.min():.6f} / {window.max():.6f}")
            print(f" Window non-zero elements: {np.count_nonzero(window)} / {window.size}")
            print(f" Window sum: {window.sum():.6f}")
            if len(window.shape) != 1:
                raise ValueError(f"Expected 1D window, got shape {window.shape}")
            n_window = window.shape[0]
            fout.write(struct.pack("i", n_window))
            # Write window function
            for i in range(n_window):
                fout.write(struct.pack("f", window[i]))
            # Write TDT durations
            tdt_durations = config['model_defaults']['tdt_durations']
            if len(tdt_durations) != hparams['n_tdt_durations']:
                raise ValueError(f"TDT durations count mismatch: {len(tdt_durations)} vs {hparams['n_tdt_durations']}")
            for duration in tdt_durations:
                fout.write(struct.pack("I", duration))
            # Write vocabulary: count, then (length, bytes) per token sorted
            # by token id so ids match line positions in the vocab file
            fout.write(struct.pack("i", len(tokens)))
            for token_bytes, idx in sorted(tokens.items(), key=lambda x: x[1]):
                fout.write(struct.pack("i", len(token_bytes)))
                fout.write(token_bytes)
            print("\nConverting model weights...")
            for name, tensor in state_dict.items():
                # Skip the filterbank and window - already written in preprocessing section
                if name == fb_key:
                    continue
                if name == window_key:
                    continue
                # Don't squeeze Conv2d weights - they need to preserve all 4 dimensions
                if 'conv' in name and 'weight' in name and len(tensor.shape) == 4:
                    data = tensor.numpy()
                else:
                    data = tensor.squeeze().numpy()
                write_tensor(fout, name, data, use_f16=use_f16)
            # The generated positional encoding is always stored as f32
            write_tensor(fout, "encoder.pe", pe, use_f16=use_f16, force_f32=True)
        print(f"\nConversion complete!")
        print(f"Output file: {fname_out}")
        print(f"File size: {fname_out.stat().st_size / (1024**2):.2f} MB")
if __name__ == '__main__':
    # Command-line entry point: parse arguments, validate the input path,
    # then run the conversion.
    parser = argparse.ArgumentParser(
        description='Convert Parakeet TDT model from NeMo format to ggml format'
    )
    parser.add_argument('--model', type=str, required=True,
                        help='Path to Parakeet .nemo model file')
    parser.add_argument('--out-dir', type=str, required=True,
                        help='Directory to write ggml model file')
    parser.add_argument('--use-f32', action='store_true', default=False,
                        help='Use f32 instead of f16 (default: f16)')
    parser.add_argument('--out-name', type=str, default=None,
                        help='Output file name (default: ggml-model.bin or ggml-model-f32.bin)')
    args = parser.parse_args()
    if not os.path.exists(args.model):
        print(f"Error: {args.model} not found")
        sys.exit(1)
    # f16 output is the default; --use-f32 opts out.
    use_f16 = not args.use_f32
    convert_parakeet_to_ggml(args.model, args.out_dir, use_f16, args.out_name)

View File

@ -0,0 +1 @@
pyyaml

View File

@ -0,0 +1,75 @@
import os
from huggingface_hub import HfApi, create_repo
# TODO: change to ggml-org once merged.
USER_NAME = "danbev"
REPO_ID = f"{USER_NAME}/parakeet"
LOCAL_GGUF_PATH = "models/ggml-parakeet-tdt-0.6b-v3.bin"
REMOTE_GGUF_NAME = "parakeet-tdt-0.6b-v3.bin"
MODEL_CARD_CONTENT = f"""---
license: apache-2.0
base_model: {USER_NAME}/parakeet
tags:
- gguf
---
# Parakeet Model Card
## Description
This is an iterative release of the Parakeet model in whisper.cpp format.
## Usage
You can use this file with [parakeet-cli](https://github.com/danbev/whisper.cpp/tree/parakeet-support/examples/parakeet-cli).
Build parakeet-cli:
```console
$ git clone -b parakeet-support https://github.com/danbev/whisper.cpp.git
$ cd whisper.cpp
$ cmake -B build -S .
$ cmake --build build --target parakeet-cli -j 12
```
Download the model:
```console
$ hf download danbev/parakeet parakeet-tdt-0.6b-v3.bin --local-dir models
```
Run:
```console
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3.bin -f samples/jfk.wav
```
"""
api = HfApi()

def deploy_iteration():
    # Upload one iteration of the converted model to the Hugging Face Hub:
    # create the repo if it does not exist, refresh the model card, then
    # upload the model binary. Uses the module-level REPO_ID,
    # MODEL_CARD_CONTENT, LOCAL_GGUF_PATH and REMOTE_GGUF_NAME constants.
    create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
    print("Updating Model Card...")
    api.upload_file(
        path_or_fileobj=MODEL_CARD_CONTENT.encode(),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="model",
        commit_message="Update README.md"
    )
    print(f"Uploading {REMOTE_GGUF_NAME}...")
    api.upload_file(
        path_or_fileobj=LOCAL_GGUF_PATH,
        path_in_repo=REMOTE_GGUF_NAME,
        repo_id=REPO_ID,
        repo_type="model",
        commit_message="Upload new parakeet iteration"
    )
    print(f"\nDeployment successful!")
    print(f"URL: https://huggingface.co/{REPO_ID}")

if __name__ == "__main__":
    # Only attempt the upload when the local model file actually exists.
    if os.path.exists(LOCAL_GGUF_PATH):
        deploy_iteration()
    else:
        print(f"Error: {LOCAL_GGUF_PATH} not found.")

View File

@ -109,23 +109,43 @@ add_library(whisper
whisper.cpp
)
# Parakeet library target.
# Fix: Threads::Threads was linked before find_package(Threads) ran (it only
# appears further down, just before the whisper target links it). Locate the
# package first so the imported target exists when it is used here.
find_package(Threads REQUIRED)

add_library(parakeet
    ../include/parakeet.h
    parakeet-arch.h
    parakeet.cpp
)

target_include_directories(parakeet PUBLIC . ../include)
target_compile_features   (parakeet PUBLIC cxx_std_11)
target_link_libraries     (parakeet PUBLIC ggml Threads::Threads)
# Set the version numbers
set_target_properties(whisper PROPERTIES
    VERSION ${PROJECT_VERSION}
    SOVERSION ${SOVERSION}
)

set_target_properties(parakeet PROPERTIES
    VERSION ${PROJECT_VERSION}
    SOVERSION ${SOVERSION}
)

target_include_directories(whisper PUBLIC . ../include)
target_compile_features   (whisper PUBLIC cxx_std_11) # don't bump

# On big-endian hosts, define the *_BIG_ENDIAN macros for both libraries.
if (CMAKE_CXX_BYTE_ORDER STREQUAL "BIG_ENDIAN")
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_BIG_ENDIAN)
    set(PARAKEET_EXTRA_FLAGS ${PARAKEET_EXTRA_FLAGS} -DPARAKEET_BIG_ENDIAN)
endif()

if (WHISPER_EXTRA_FLAGS)
    target_compile_options(whisper PRIVATE ${WHISPER_EXTRA_FLAGS})
endif()

if (PARAKEET_EXTRA_FLAGS)
    target_compile_options(parakeet PRIVATE ${PARAKEET_EXTRA_FLAGS})
endif()

# Locate Threads so the Threads::Threads imported target can be linked.
find_package(Threads REQUIRED)

target_link_libraries(whisper PUBLIC ggml Threads::Threads)
@ -144,4 +164,7 @@ endif()
if (BUILD_SHARED_LIBS)
    # Shared builds: enable PIC and define the *_SHARED/*_BUILD macros
    # consumed by the public headers for both libraries.
    set_target_properties(whisper PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(whisper PRIVATE WHISPER_SHARED WHISPER_BUILD)
    set_target_properties(parakeet PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(parakeet PRIVATE PARAKEET_SHARED PARAKEET_BUILD)
endif()

194
src/parakeet-arch.h Normal file
View File

@ -0,0 +1,194 @@
#pragma once

#include "ggml.h"

#include <map>

// Identifier for every tensor in a Parakeet ggml model file.
// Per-layer encoder and LSTM tensors share one id across layers; the layer
// index is substituted into the "%d" placeholder of the matching entry in
// PARAKEET_TENSOR_NAMES.
enum parakeet_tensor {
    // Encoder pre_encode
    PARAKEET_TENSOR_ENC_PRE_OUT_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_OUT_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_0_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_0_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_2_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_2_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_3_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_3_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_5_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_5_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_6_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_6_BIAS,
    PARAKEET_TENSOR_ENC_PE,
    // Encoder layers (per-layer)
    PARAKEET_TENSOR_ENC_NORM_FF1_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_FF1_BIAS,
    PARAKEET_TENSOR_ENC_FF1_LINEAR1_WEIGHT,
    PARAKEET_TENSOR_ENC_FF1_LINEAR2_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_CONV_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_CONV_BIAS,
    PARAKEET_TENSOR_ENC_CONV_PW1_WEIGHT,
    PARAKEET_TENSOR_ENC_CONV_DW_WEIGHT,
    PARAKEET_TENSOR_ENC_CONV_BN_WEIGHT,
    PARAKEET_TENSOR_ENC_CONV_BN_BIAS,
    PARAKEET_TENSOR_ENC_CONV_BN_MEAN,
    PARAKEET_TENSOR_ENC_CONV_BN_VAR,
    PARAKEET_TENSOR_ENC_CONV_BN_NUM_BATCHES,
    PARAKEET_TENSOR_ENC_CONV_PW2_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_ATTN_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_ATTN_BIAS,
    PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_U,
    PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_V,
    PARAKEET_TENSOR_ENC_ATTN_Q_WEIGHT,
    PARAKEET_TENSOR_ENC_ATTN_K_WEIGHT,
    PARAKEET_TENSOR_ENC_ATTN_V_WEIGHT,
    PARAKEET_TENSOR_ENC_ATTN_OUT_WEIGHT,
    PARAKEET_TENSOR_ENC_ATTN_POS_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_FF2_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_FF2_BIAS,
    PARAKEET_TENSOR_ENC_FF2_LINEAR1_WEIGHT,
    PARAKEET_TENSOR_ENC_FF2_LINEAR2_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_OUT_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_OUT_BIAS,
    // Prediction network
    PARAKEET_TENSOR_PRED_EMBED_WEIGHT,
    PARAKEET_TENSOR_PRED_LSTM_WEIGHT_IH,
    PARAKEET_TENSOR_PRED_LSTM_WEIGHT_HH,
    PARAKEET_TENSOR_PRED_LSTM_BIAS_IH,
    PARAKEET_TENSOR_PRED_LSTM_BIAS_HH,
    // Joint network
    PARAKEET_TENSOR_JOINT_PRED_WEIGHT,
    PARAKEET_TENSOR_JOINT_PRED_BIAS,
    PARAKEET_TENSOR_JOINT_ENC_WEIGHT,
    PARAKEET_TENSOR_JOINT_ENC_BIAS,
    PARAKEET_TENSOR_JOINT_NET_WEIGHT,
    PARAKEET_TENSOR_JOINT_NET_BIAS,
};
// Checkpoint tensor name for each parakeet_tensor id. Per-layer entries
// contain a "%d" placeholder for the layer index.
static const std::map<parakeet_tensor, const char *> PARAKEET_TENSOR_NAMES = {
    // Encoder pre_encode
    {PARAKEET_TENSOR_ENC_PRE_OUT_WEIGHT, "encoder.pre_encode.out.weight"},
    {PARAKEET_TENSOR_ENC_PRE_OUT_BIAS, "encoder.pre_encode.out.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_0_WEIGHT, "encoder.pre_encode.conv.0.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_0_BIAS, "encoder.pre_encode.conv.0.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_2_WEIGHT, "encoder.pre_encode.conv.2.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_2_BIAS, "encoder.pre_encode.conv.2.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_3_WEIGHT, "encoder.pre_encode.conv.3.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_3_BIAS, "encoder.pre_encode.conv.3.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_5_WEIGHT, "encoder.pre_encode.conv.5.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_5_BIAS, "encoder.pre_encode.conv.5.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_6_WEIGHT, "encoder.pre_encode.conv.6.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_6_BIAS, "encoder.pre_encode.conv.6.bias"},
    {PARAKEET_TENSOR_ENC_PE, "encoder.pe"},
    // Encoder layers (use %d for layer number)
    {PARAKEET_TENSOR_ENC_NORM_FF1_WEIGHT, "encoder.layers.%d.norm_feed_forward1.weight"},
    {PARAKEET_TENSOR_ENC_NORM_FF1_BIAS, "encoder.layers.%d.norm_feed_forward1.bias"},
    {PARAKEET_TENSOR_ENC_FF1_LINEAR1_WEIGHT, "encoder.layers.%d.feed_forward1.linear1.weight"},
    {PARAKEET_TENSOR_ENC_FF1_LINEAR2_WEIGHT, "encoder.layers.%d.feed_forward1.linear2.weight"},
    {PARAKEET_TENSOR_ENC_NORM_CONV_WEIGHT, "encoder.layers.%d.norm_conv.weight"},
    {PARAKEET_TENSOR_ENC_NORM_CONV_BIAS, "encoder.layers.%d.norm_conv.bias"},
    {PARAKEET_TENSOR_ENC_CONV_PW1_WEIGHT, "encoder.layers.%d.conv.pointwise_conv1.weight"},
    {PARAKEET_TENSOR_ENC_CONV_DW_WEIGHT, "encoder.layers.%d.conv.depthwise_conv.weight"},
    {PARAKEET_TENSOR_ENC_CONV_BN_WEIGHT, "encoder.layers.%d.conv.batch_norm.weight"},
    {PARAKEET_TENSOR_ENC_CONV_BN_BIAS, "encoder.layers.%d.conv.batch_norm.bias"},
    {PARAKEET_TENSOR_ENC_CONV_BN_MEAN, "encoder.layers.%d.conv.batch_norm.running_mean"},
    {PARAKEET_TENSOR_ENC_CONV_BN_VAR, "encoder.layers.%d.conv.batch_norm.running_var"},
    {PARAKEET_TENSOR_ENC_CONV_BN_NUM_BATCHES, "encoder.layers.%d.conv.batch_norm.num_batches_tracked"},
    {PARAKEET_TENSOR_ENC_CONV_PW2_WEIGHT, "encoder.layers.%d.conv.pointwise_conv2.weight"},
    {PARAKEET_TENSOR_ENC_NORM_ATTN_WEIGHT, "encoder.layers.%d.norm_self_att.weight"},
    {PARAKEET_TENSOR_ENC_NORM_ATTN_BIAS, "encoder.layers.%d.norm_self_att.bias"},
    {PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_U, "encoder.layers.%d.self_attn.pos_bias_u"},
    {PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_V, "encoder.layers.%d.self_attn.pos_bias_v"},
    {PARAKEET_TENSOR_ENC_ATTN_Q_WEIGHT, "encoder.layers.%d.self_attn.linear_q.weight"},
    {PARAKEET_TENSOR_ENC_ATTN_K_WEIGHT, "encoder.layers.%d.self_attn.linear_k.weight"},
    {PARAKEET_TENSOR_ENC_ATTN_V_WEIGHT, "encoder.layers.%d.self_attn.linear_v.weight"},
    {PARAKEET_TENSOR_ENC_ATTN_OUT_WEIGHT, "encoder.layers.%d.self_attn.linear_out.weight"},
    {PARAKEET_TENSOR_ENC_ATTN_POS_WEIGHT, "encoder.layers.%d.self_attn.linear_pos.weight"},
    {PARAKEET_TENSOR_ENC_NORM_FF2_WEIGHT, "encoder.layers.%d.norm_feed_forward2.weight"},
    {PARAKEET_TENSOR_ENC_NORM_FF2_BIAS, "encoder.layers.%d.norm_feed_forward2.bias"},
    {PARAKEET_TENSOR_ENC_FF2_LINEAR1_WEIGHT, "encoder.layers.%d.feed_forward2.linear1.weight"},
    {PARAKEET_TENSOR_ENC_FF2_LINEAR2_WEIGHT, "encoder.layers.%d.feed_forward2.linear2.weight"},
    {PARAKEET_TENSOR_ENC_NORM_OUT_WEIGHT, "encoder.layers.%d.norm_out.weight"},
    {PARAKEET_TENSOR_ENC_NORM_OUT_BIAS, "encoder.layers.%d.norm_out.bias"},
    // Prediction network
    {PARAKEET_TENSOR_PRED_EMBED_WEIGHT, "decoder.prediction.embed.weight"},
    {PARAKEET_TENSOR_PRED_LSTM_WEIGHT_IH, "decoder.prediction.dec_rnn.lstm.weight_ih_l%d"},
    {PARAKEET_TENSOR_PRED_LSTM_WEIGHT_HH, "decoder.prediction.dec_rnn.lstm.weight_hh_l%d"},
    {PARAKEET_TENSOR_PRED_LSTM_BIAS_IH, "decoder.prediction.dec_rnn.lstm.bias_ih_l%d"},
    {PARAKEET_TENSOR_PRED_LSTM_BIAS_HH, "decoder.prediction.dec_rnn.lstm.bias_hh_l%d"},
    // Joint network
    {PARAKEET_TENSOR_JOINT_PRED_WEIGHT, "joint.pred.weight"},
    {PARAKEET_TENSOR_JOINT_PRED_BIAS, "joint.pred.bias"},
    {PARAKEET_TENSOR_JOINT_ENC_WEIGHT, "joint.enc.weight"},
    {PARAKEET_TENSOR_JOINT_ENC_BIAS, "joint.enc.bias"},
    {PARAKEET_TENSOR_JOINT_NET_WEIGHT, "joint.joint_net.2.weight"},
    {PARAKEET_TENSOR_JOINT_NET_BIAS, "joint.joint_net.2.bias"},
};
// ggml operation associated with each tensor (weights -> MUL_MAT/IM2COL,
// scales -> MUL, biases -> ADD, etc.).
// NOTE(review): presumably used by parakeet.cpp for backend/op support
// checks when loading - confirm against the implementation.
static const std::map<parakeet_tensor, ggml_op> PARAKEET_TENSOR_INFO = {
    // Encoder pre_encode
    {PARAKEET_TENSOR_ENC_PRE_OUT_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_PRE_OUT_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_PRE_CONV_0_WEIGHT, GGML_OP_IM2COL},
    {PARAKEET_TENSOR_ENC_PRE_CONV_0_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_PRE_CONV_2_WEIGHT, GGML_OP_IM2COL},
    {PARAKEET_TENSOR_ENC_PRE_CONV_2_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_PRE_CONV_3_WEIGHT, GGML_OP_IM2COL},
    {PARAKEET_TENSOR_ENC_PRE_CONV_3_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_PRE_CONV_5_WEIGHT, GGML_OP_IM2COL},
    {PARAKEET_TENSOR_ENC_PRE_CONV_5_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_PRE_CONV_6_WEIGHT, GGML_OP_IM2COL},
    {PARAKEET_TENSOR_ENC_PRE_CONV_6_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_PE, GGML_OP_ADD},
    // Encoder layers
    {PARAKEET_TENSOR_ENC_NORM_FF1_WEIGHT, GGML_OP_MUL},
    {PARAKEET_TENSOR_ENC_NORM_FF1_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_FF1_LINEAR1_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_FF1_LINEAR2_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_NORM_CONV_WEIGHT, GGML_OP_MUL},
    {PARAKEET_TENSOR_ENC_NORM_CONV_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_CONV_PW1_WEIGHT, GGML_OP_IM2COL},
    {PARAKEET_TENSOR_ENC_CONV_DW_WEIGHT, GGML_OP_IM2COL},
    {PARAKEET_TENSOR_ENC_CONV_BN_WEIGHT, GGML_OP_MUL},
    {PARAKEET_TENSOR_ENC_CONV_BN_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_CONV_BN_MEAN, GGML_OP_SUB},
    {PARAKEET_TENSOR_ENC_CONV_BN_VAR, GGML_OP_DIV},
    // num_batches_tracked is bookkeeping only - no op consumes it
    {PARAKEET_TENSOR_ENC_CONV_BN_NUM_BATCHES, GGML_OP_NONE},
    {PARAKEET_TENSOR_ENC_CONV_PW2_WEIGHT, GGML_OP_IM2COL},
    {PARAKEET_TENSOR_ENC_NORM_ATTN_WEIGHT, GGML_OP_MUL},
    {PARAKEET_TENSOR_ENC_NORM_ATTN_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_U, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_V, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_ATTN_Q_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_ATTN_K_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_ATTN_V_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_ATTN_OUT_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_ATTN_POS_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_NORM_FF2_WEIGHT, GGML_OP_MUL},
    {PARAKEET_TENSOR_ENC_NORM_FF2_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_ENC_FF2_LINEAR1_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_FF2_LINEAR2_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_ENC_NORM_OUT_WEIGHT, GGML_OP_MUL},
    {PARAKEET_TENSOR_ENC_NORM_OUT_BIAS, GGML_OP_ADD},
    // Prediction network
    {PARAKEET_TENSOR_PRED_EMBED_WEIGHT, GGML_OP_GET_ROWS},
    {PARAKEET_TENSOR_PRED_LSTM_WEIGHT_IH, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_PRED_LSTM_WEIGHT_HH, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_PRED_LSTM_BIAS_IH, GGML_OP_ADD},
    {PARAKEET_TENSOR_PRED_LSTM_BIAS_HH, GGML_OP_ADD},
    // Joint network
    {PARAKEET_TENSOR_JOINT_PRED_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_JOINT_PRED_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_JOINT_ENC_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_JOINT_ENC_BIAS, GGML_OP_ADD},
    {PARAKEET_TENSOR_JOINT_NET_WEIGHT, GGML_OP_MUL_MAT},
    {PARAKEET_TENSOR_JOINT_NET_BIAS, GGML_OP_ADD},
};

4282
src/parakeet.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -110,3 +110,32 @@ target_compile_definitions(${VAD_TEST} PRIVATE
SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/jfk.wav")
add_test(NAME ${VAD_TEST} COMMAND ${VAD_TEST})
set_tests_properties(${VAD_TEST} PROPERTIES LABELS "base;en")
# Parakeet model loading test
# Fix: PARAKEET_TEST is reassigned for each test below, so the single
# trailing set_tests_properties() call only labeled test-parakeet-full.
# Apply the labels after each add_test instead, matching the VAD test above.
set(PARAKEET_TEST test-parakeet)
add_executable(${PARAKEET_TEST} ${PARAKEET_TEST}.cpp)
target_include_directories(${PARAKEET_TEST} PRIVATE ../include ../ggml/include ../examples)
target_link_libraries(${PARAKEET_TEST} PRIVATE parakeet common)
target_compile_definitions(${PARAKEET_TEST} PRIVATE
    PARAKEET_MODEL_PATH="${PROJECT_SOURCE_DIR}/models/ggml-parakeet-tdt-0.6b-v3.bin"
    SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/jfk.wav")
add_test(NAME ${PARAKEET_TEST} COMMAND ${PARAKEET_TEST})
set_tests_properties(${PARAKEET_TEST} PROPERTIES LABELS "parakeet;unit")

# Parakeet streaming test
set(PARAKEET_TEST test-parakeet-stream)
add_executable(${PARAKEET_TEST} ${PARAKEET_TEST}.cpp)
target_include_directories(${PARAKEET_TEST} PRIVATE ../include ../ggml/include ../examples)
target_link_libraries(${PARAKEET_TEST} PRIVATE parakeet common)
target_compile_definitions(${PARAKEET_TEST} PRIVATE
    PARAKEET_MODEL_PATH="${PROJECT_SOURCE_DIR}/models/ggml-parakeet-tdt-0.6b-v3.bin"
    SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/gb1.wav")
add_test(NAME ${PARAKEET_TEST} COMMAND ${PARAKEET_TEST})
set_tests_properties(${PARAKEET_TEST} PROPERTIES LABELS "parakeet;unit")

# Parakeet full-pipeline test
set(PARAKEET_TEST test-parakeet-full)
add_executable(${PARAKEET_TEST} ${PARAKEET_TEST}.cpp)
target_include_directories(${PARAKEET_TEST} PRIVATE ../include ../ggml/include ../examples)
target_link_libraries(${PARAKEET_TEST} PRIVATE parakeet common)
target_compile_definitions(${PARAKEET_TEST} PRIVATE
    PARAKEET_MODEL_PATH="${PROJECT_SOURCE_DIR}/models/ggml-parakeet-tdt-0.6b-v3.bin"
    SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/gb1.wav")
add_test(NAME ${PARAKEET_TEST} COMMAND ${PARAKEET_TEST})
set_tests_properties(${PARAKEET_TEST} PROPERTIES LABELS "parakeet;unit")

View File

@ -0,0 +1,62 @@
#include "parakeet.h"
#include "common-whisper.h"
#include <cstdio>
#include <string>
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
// Print each newly decoded token to stdout as soon as it is emitted.
// `is_first` is function-local static state passed to parakeet_token_to_text
// so the very first token can be formatted differently.
// Fix: removed the dead local `time_ms` (computed but never used; triggers
// -Wunused-variable) and silenced the unused callback parameters.
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
    (void) state;     // unused - required by the callback signature
    (void) user_data; // unused - required by the callback signature

    static bool is_first = true;

    const char * token_str = parakeet_token_to_str(ctx, token_data->id);

    char text_buf[256];
    parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));

    printf("%s", text_buf);
    fflush(stdout);

    is_first = false;
}
// End-to-end test for parakeet_full(): load the model, decode a sample file,
// and assert success. PARAKEET_MODEL_PATH / SAMPLE_PATH are injected by CMake.
// Fix: the success message claimed parakeet_full_parallel() was tested, but
// the test calls parakeet_full().
int main() {
    std::string model_path  = PARAKEET_MODEL_PATH;
    std::string sample_path = SAMPLE_PATH;

    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
    assert(pcmf32.size() > 0);
    assert(pcmf32s.size() == 0); // no stereo vector

    printf("Loading Parakeet model from: %s\n", model_path.c_str());
    struct parakeet_context_params ctx_params = parakeet_context_default_params();
    struct parakeet_context * pctx = parakeet_init_from_file_with_params(model_path.c_str(), ctx_params);
    if (pctx == nullptr) {
        fprintf(stderr, "Failed to load Parakeet model\n");
        return 1;
    }
    printf("Successfully loaded Parakeet model\n");

    struct parakeet_full_params params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
    params.new_token_callback           = token_callback;
    params.new_token_callback_user_data = nullptr;
    params.chunk_length_ms  = 10000;
    params.left_context_ms  = 10000;
    params.right_context_ms = 4960;

    int ret = parakeet_full(pctx, params, pcmf32.data(), pcmf32.size());
    assert(ret == 0);

    parakeet_free(pctx);

    printf("\nTest passed: parakeet_full succeeded!\n");
    return 0;
}

View File

@ -0,0 +1,107 @@
#include "parakeet.h"
#include "common-whisper.h"
#include <cstdio>
#include <string>
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
// Print each newly decoded token to stdout as soon as it is emitted.
// `is_first` is function-local static state passed to parakeet_token_to_text
// so the very first token can be formatted differently.
// Fix: removed the dead local `time_ms` (computed but never used; triggers
// -Wunused-variable) and silenced the unused callback parameters.
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
    (void) state;     // unused - required by the callback signature
    (void) user_data; // unused - required by the callback signature

    static bool is_first = true;

    const char * token_str = parakeet_token_to_str(ctx, token_data->id);

    char text_buf[256];
    parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));

    printf("%s", text_buf);
    fflush(stdout);

    is_first = false;
}
// Called when segments are finalized; prints the time range, text, and full
// per-token details of each newly added segment.
// `user_data` is unused here.
void segment_callback(parakeet_context * ctx, parakeet_state * state, int n_new, void * user_data) {
    const int n_segments = parakeet_full_n_segments_from_state(state);
    // Index of the first segment added by this callback invocation.
    const int s0 = n_segments - n_new;
    printf("\nSegment Callback: %d new segment(s)\n", n_new);
    for (int i = s0; i < n_segments; i++) {
        const char * text = parakeet_full_get_segment_text_from_state(state, i);
        const int64_t t0 = parakeet_full_get_segment_t0_from_state(state, i);
        const int64_t t1 = parakeet_full_get_segment_t1_from_state(state, i);
        printf("Segment %d: [%lld -> %lld] \"%s\"\n", i, (long long)t0, (long long)t1, text);
        printf("Tokens:\n");
        const int n_tokens = parakeet_full_n_tokens_from_state(state, i);
        for (int j = 0; j < n_tokens; j++) {
            parakeet_token_data token_data = parakeet_full_get_token_data_from_state(state, i, j);
            const char * token_str = parakeet_token_to_str(ctx, token_data.id);
            printf(" [%2d] id=%5d frame=%3d dur_idx=%2d dur_val=%2d p=%.4f plog=%.4f t0=%4lld t1=%4lld word_start=%d \"%s\"\n",
                j,
                token_data.id,
                token_data.frame_index,
                token_data.duration_idx,
                token_data.duration_value,
                token_data.p,
                token_data.plog,
                (long long)token_data.t0,
                (long long)token_data.t1,
                token_data.is_word_start,
                token_str);
        }
    }
    printf("\n");
}
// Streaming test: feed the sample audio to the streaming API in small
// batches and flush at end-of-stream. PARAKEET_MODEL_PATH / SAMPLE_PATH are
// injected by the test's CMake target.
int main() {
    std::string model_path = PARAKEET_MODEL_PATH;
    std::string sample_path = SAMPLE_PATH;

    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
    assert(pcmf32.size() > 0);

    // Context is created without a default state; an explicit state is used.
    struct parakeet_context_params ctx_params = parakeet_context_default_params();
    struct parakeet_context * pctx = parakeet_init_from_file_with_params_no_state(model_path.c_str(), ctx_params);
    if (pctx == nullptr) { return 1; }

    struct parakeet_full_params params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
    params.new_token_callback = token_callback;
    params.left_context_ms = 10000;
    params.chunk_length_ms = 10000;
    params.right_context_ms = 4960;

    parakeet_state * state = parakeet_init_state(pctx);

    // initialize streaming state
    assert(parakeet_stream_init(pctx, state, params) == 0);

    // Push the audio in fixed-size batches of 1600 samples
    // (100 ms at a 16 kHz sample rate - TODO confirm the sample rate).
    const int samples_batch_size = 1600;
    int position = 0;
    while (position < (int)pcmf32.size()) {
        int samples_to_push = std::min(samples_batch_size, (int)pcmf32.size() - position);
        int ret = parakeet_stream_push(pctx, state, pcmf32.data() + position, samples_to_push);
        assert(ret == 0);
        position += samples_to_push;
    }

    // flush remaining samples.
    assert(parakeet_stream_flush(pctx, state) == 0);

    parakeet_free_state(state);
    parakeet_free(pctx);

    printf("\n\nTest passed: Streaming logic.\n");
    return 0;
}

99
tests/test-parakeet.cpp Normal file
View File

@ -0,0 +1,99 @@
#include "parakeet.h"
#include "common-whisper.h"
#include <cstdio>
#include <string>
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
// Streams each newly decoded token to stdout as soon as it is produced.
// The static flag tracks whether this is the very first token of the run so
// parakeet_token_to_text can format it accordingly; this makes the callback
// non-reentrant and single-run-per-process (acceptable for this test binary).
// Fix: dropped the unused local `time_ms` (dead computation; -Wunused-variable).
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
    static bool is_first = true;
    const char * token_str = parakeet_token_to_str(ctx, token_data->id);
    char text_buf[256];
    parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));
    printf("%s", text_buf);
    fflush(stdout); // flush so streamed output appears immediately
    is_first = false;
}
// Prints every newly finalized segment along with per-token diagnostics.
// Only the trailing n_new segments are new; earlier ones were reported by a
// previous invocation of this callback.
void segment_callback(parakeet_context * ctx, parakeet_state * state, int n_new, void * user_data) {
    const int total = parakeet_full_n_segments_from_state(state);
    const int first_new = total - n_new;
    printf("\nSegment Callback: %d new segment(s)\n", n_new);
    for (int seg = first_new; seg < total; seg++) {
        const char *  segment_text = parakeet_full_get_segment_text_from_state(state, seg);
        const int64_t seg_t0       = parakeet_full_get_segment_t0_from_state(state, seg);
        const int64_t seg_t1       = parakeet_full_get_segment_t1_from_state(state, seg);
        printf("Segment %d: [%lld -> %lld] \"%s\"\n", seg, (long long)seg_t0, (long long)seg_t1, segment_text);
        printf("Tokens:\n");
        const int token_count = parakeet_full_n_tokens_from_state(state, seg);
        for (int tok = 0; tok < token_count; tok++) {
            const parakeet_token_data td = parakeet_full_get_token_data_from_state(state, seg, tok);
            printf(" [%2d] id=%5d frame=%3d dur_idx=%2d dur_val=%2d p=%.4f plog=%.4f t0=%4lld t1=%4lld word_start=%d \"%s\"\n",
                   tok,
                   td.id,
                   td.frame_index,
                   td.duration_idx,
                   td.duration_value,
                   td.p,
                   td.plog,
                   (long long)td.t0,
                   (long long)td.t1,
                   td.is_word_start,
                   parakeet_token_to_str(ctx, td.id));
        }
    }
    printf("\n");
}
// Smoke test for the one-shot chunk API: load the model, run a single
// parakeet_chunk() call over the whole sample with both callbacks wired up,
// then tear everything down. NDEBUG is undefined above so the asserts (which
// wrap side-effecting calls) always execute.
int main() {
    std::string model_path  = PARAKEET_MODEL_PATH;
    std::string sample_path = SAMPLE_PATH;

    // Load the sample audio file
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
    assert(pcmf32.size() > 0);
    assert(pcmf32s.size() == 0);

    printf("Loading Parakeet model from: %s\n", model_path.c_str());
    struct parakeet_context_params ctx_params = parakeet_context_default_params();
    struct parakeet_context * pctx = parakeet_init_from_file_with_params_no_state(model_path.c_str(), ctx_params);
    if (pctx == nullptr) {
        fprintf(stderr, "Failed to load Parakeet model\n");
        return 1;
    }
    printf("Successfully loaded Parakeet model\n");

    // Greedy sampling with both token- and segment-level callbacks attached.
    struct parakeet_full_params params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
    params.new_token_callback             = token_callback;
    params.new_token_callback_user_data   = nullptr;
    params.new_segment_callback           = segment_callback;
    params.new_segment_callback_user_data = nullptr;

    parakeet_state * state = parakeet_init_state(pctx);
    int rc = parakeet_chunk(pctx, state, params, pcmf32.data(), pcmf32.size());
    assert(rc == 0);

    parakeet_free_state(state);
    parakeet_free(pctx);

    printf("\nTest passed: Parakeet model loaded and freed successfully\n");
    return 0;
}