Merge b899ce0b79 into fc674574ca
This commit is contained in:
commit
03443a93f3
|
|
@ -634,6 +634,8 @@ jobs:
|
|||
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
-DBUILD_SHARED_LIBS=ON
|
||||
-DWHISPER_SDL2=${{ matrix.sdl2 }}
|
||||
-DGGML_NATIVE=OFF
|
||||
-DGGML_BMI2=OFF
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
|
|
|
|||
|
|
@ -179,12 +179,20 @@ set(WHISPER_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location
|
|||
get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
|
||||
|
||||
set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
|
||||
|
||||
install(TARGETS whisper LIBRARY PUBLIC_HEADER)
|
||||
|
||||
target_compile_definitions(whisper PRIVATE
|
||||
WHISPER_VERSION="${PROJECT_VERSION}"
|
||||
)
|
||||
|
||||
set_target_properties(parakeet PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/parakeet.h)
|
||||
install(TARGETS parakeet LIBRARY PUBLIC_HEADER)
|
||||
|
||||
target_compile_definitions(parakeet PRIVATE
|
||||
PARAKEET_VERSION="${PROJECT_VERSION}"
|
||||
)
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
|
||||
|
|
@ -210,6 +218,35 @@ configure_file(cmake/whisper.pc.in
|
|||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
|
||||
DESTINATION lib/pkgconfig)
|
||||
|
||||
set(PARAKEET_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(PARAKEET_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(PARAKEET_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/parakeet-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/parakeet-config.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parakeet
|
||||
PATH_VARS
|
||||
PARAKEET_INCLUDE_INSTALL_DIR
|
||||
PARAKEET_LIB_INSTALL_DIR
|
||||
PARAKEET_BIN_INSTALL_DIR)
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/parakeet-version.cmake
|
||||
VERSION ${WHISPER_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/parakeet-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/parakeet-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parakeet)
|
||||
|
||||
configure_file(cmake/parakeet.pc.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/parakeet.pc"
|
||||
@ONLY)
|
||||
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/parakeet.pc"
|
||||
DESTINATION lib/pkgconfig)
|
||||
|
||||
#
|
||||
# programs, examples and tests
|
||||
#
|
||||
|
|
|
|||
|
|
@ -18,6 +18,6 @@ create_makefile "whisper" do |conf|
|
|||
#{libs}: cmake-targets
|
||||
cmake-targets:
|
||||
#{"\t"}#{cmake} -S sources -B build -D BUILD_SHARED_LIBS=OFF -D CMAKE_ARCHIVE_OUTPUT_DIRECTORY=#{__dir__} -D CMAKE_POSITION_INDEPENDENT_CODE=ON #{options}
|
||||
#{"\t"}#{cmake} --build build --config Release --target common whisper
|
||||
#{"\t"}#{cmake} --build build --config Release --target common whisper parakeet
|
||||
EOF
|
||||
end
|
||||
|
|
|
|||
|
|
@ -0,0 +1,30 @@
|
|||
set(PARAKEET_VERSION @WHISPER_INSTALL_VERSION@)
|
||||
set(PARAKEET_BUILD_COMMIT @WHISPER_BUILD_COMMIT@)
|
||||
set(PARAKEET_BUILD_NUMBER @WHISPER_BUILD_NUMBER@)
|
||||
set(PARAKEET_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set_and_check(PARAKEET_INCLUDE_DIR "@PACKAGE_PARAKEET_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(PARAKEET_LIB_DIR "@PACKAGE_PARAKEET_LIB_INSTALL_DIR@")
|
||||
set_and_check(PARAKEET_BIN_DIR "@PACKAGE_PARAKEET_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(ggml REQUIRED HINTS ${PARAKEET_LIB_DIR}/cmake)
|
||||
|
||||
find_library(parakeet_LIBRARY parakeet
|
||||
REQUIRED
|
||||
HINTS ${PARAKEET_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH
|
||||
)
|
||||
|
||||
add_library(parakeet UNKNOWN IMPORTED)
|
||||
set_target_properties(parakeet
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${PARAKEET_INCLUDE_DIR}"
|
||||
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${parakeet_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES cxx_std_11
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
check_required_components(parakeet)
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
prefix=@CMAKE_INSTALL_PREFIX@
|
||||
exec_prefix=${prefix}
|
||||
libdir=${exec_prefix}/lib
|
||||
includedir=${prefix}/include
|
||||
|
||||
Name: parakeet
|
||||
Description: Port of NVIDIA's Parakeet model in C/C++
|
||||
Version: @PROJECT_VERSION@
|
||||
Libs: -L${libdir} -lggml -lggml-base -lparakeet
|
||||
Cflags: -I${includedir}
|
||||
|
|
@ -107,6 +107,7 @@ else()
|
|||
add_subdirectory(server)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(vad-speech-segments)
|
||||
add_subdirectory(parakeet-cli)
|
||||
if (WHISPER_SDL2)
|
||||
add_subdirectory(stream)
|
||||
add_subdirectory(command)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,8 @@
|
|||
set(TARGET parakeet-cli)
|
||||
add_executable(${TARGET} parakeet-cli.cpp)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE common parakeet ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
# whisper.cpp/examples/parakeet-cli
|
||||
|
||||
This is an example of using the [Parakeet] model in whisper.cpp.
|
||||
|
||||
### Download converted model
|
||||
```console
|
||||
$ hf download danbev/parakeet parakeet-tdt-0.6b-v3.bin --local-dir models
|
||||
```
|
||||
|
||||
### Building
|
||||
```console
|
||||
$ cmake -B build -S .
|
||||
$ cmake --build build --target parakeet-cli -j 12
|
||||
```
|
||||
|
||||
### Usage
|
||||
```console
|
||||
$ ./build/bin/parakeet-cli --help
|
||||
|
||||
usage: ./build/bin/parakeet-cli [options] file0 file1 ...
|
||||
supported audio formats: flac, mp3, ogg, wav
|
||||
|
||||
options:
|
||||
-h, --help [default] show this help message and exit
|
||||
-t N, --threads N [4 ] number of threads to use during computation
|
||||
-cl N, --chunk-length N [10000 ] chunk length in milliseconds
|
||||
-lc N, --left-context N [10000 ] left context in milliseconds
|
||||
-rc N, --right-context N [4960 ] right context in milliseconds
|
||||
-m, --model FILE [models/ggml-parakeet-tdt-0.6b-v3.bin] model path
|
||||
-f, --file FILE [ ] input audio file
|
||||
-ng, --no-gpu [false ] disable GPU
|
||||
-dev N, --device N [0 ] GPU device to use
|
||||
-fa, --flash-attn [false ] enable flash attention
|
||||
-nfa, --no-flash-attn [false ] disable flash attention
|
||||
-ps, --print-segments [false ] print segment information
|
||||
```
|
||||
|
||||
### Example
|
||||
```console
|
||||
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3.bin -f samples/jfk.wav
|
||||
Processing audio (176000 samples, 11.00 seconds)
|
||||
Processing audio: total_frames=1101, chunk_size=1101
|
||||
parakeet_decode: starting decode with n_frames=138
|
||||
And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
|
||||
```
|
||||
|
||||
To print segment information:
|
||||
```console
|
||||
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3.bin -f samples/jfk.wav --print-segments
|
||||
Processing audio (176000 samples, 11.00 seconds)
|
||||
Processing audio: total_frames=1101, chunk_size=1101
|
||||
parakeet_decode: starting decode with n_frames=138
|
||||
And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
|
||||
|
||||
Segments (1):
|
||||
Segment 0: [0 -> 1101] "And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country."
|
||||
Tokens [38]:
|
||||
[ 0] id= 1976 frame= 3 dur_idx= 4 dur_val= 4 p=0.9996 plog=-15.6206 t0= 24 t1= 56 word_start=true "▁And"
|
||||
[ 1] id= 547 frame= 7 dur_idx= 4 dur_val= 4 p=0.9999 plog=-18.7922 t0= 56 t1= 88 word_start=true "▁so"
|
||||
[ 2] id= 7877 frame= 11 dur_idx= 2 dur_val= 2 p=0.8451 plog=-14.5929 t0= 88 t1= 88 word_start=false ","
|
||||
[ 3] id= 1103 frame= 13 dur_idx= 3 dur_val= 3 p=0.9996 plog=-15.6127 t0= 104 t1= 128 word_start=true "▁my"
|
||||
[ 4] id= 309 frame= 16 dur_idx= 1 dur_val= 1 p=0.9912 plog=-11.9635 t0= 128 t1= 136 word_start=true "▁f"
|
||||
[ 5] id= 530 frame= 17 dur_idx= 2 dur_val= 2 p=1.0000 plog=-13.5239 t0= 136 t1= 152 word_start=false "ell"
|
||||
[ 6] id= 596 frame= 19 dur_idx= 3 dur_val= 3 p=1.0000 plog=-16.3120 t0= 152 t1= 176 word_start=false "ow"
|
||||
[ 7] id= 3213 frame= 22 dur_idx= 4 dur_val= 4 p=0.9999 plog=-10.1462 t0= 176 t1= 208 word_start=true "▁Amer"
|
||||
[ 8] id= 404 frame= 26 dur_idx= 4 dur_val= 4 p=1.0000 plog=-25.0910 t0= 208 t1= 240 word_start=false "ic"
|
||||
[ 9] id= 667 frame= 30 dur_idx= 4 dur_val= 4 p=1.0000 plog=-27.1707 t0= 240 t1= 272 word_start=false "ans"
|
||||
[10] id= 7877 frame= 37 dur_idx= 4 dur_val= 4 p=0.9094 plog=-16.3405 t0= 272 t1= 272 word_start=false ","
|
||||
[11] id= 279 frame= 41 dur_idx= 4 dur_val= 4 p=0.9980 plog=-19.7244 t0= 328 t1= 360 word_start=true "▁a"
|
||||
[12] id= 583 frame= 45 dur_idx= 4 dur_val= 4 p=1.0000 plog=-24.5312 t0= 360 t1= 392 word_start=false "sk"
|
||||
[13] id= 1491 frame= 53 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.2991 t0= 424 t1= 456 word_start=true "▁not"
|
||||
[14] id= 3470 frame= 65 dur_idx= 4 dur_val= 4 p=0.9995 plog=-16.7306 t0= 520 t1= 552 word_start=true "▁what"
|
||||
[15] id= 3629 frame= 69 dur_idx= 2 dur_val= 2 p=0.8139 plog=-11.6486 t0= 552 t1= 568 word_start=true "▁your"
|
||||
[16] id= 867 frame= 75 dur_idx= 1 dur_val= 1 p=0.9980 plog=-12.5265 t0= 600 t1= 608 word_start=true "▁co"
|
||||
[17] id= 331 frame= 76 dur_idx= 2 dur_val= 2 p=1.0000 plog=-11.6697 t0= 608 t1= 624 word_start=false "un"
|
||||
[18] id= 958 frame= 78 dur_idx= 2 dur_val= 2 p=1.0000 plog=-11.3621 t0= 624 t1= 640 word_start=false "tr"
|
||||
[19] id= 7893 frame= 80 dur_idx= 2 dur_val= 2 p=1.0000 plog=-14.3245 t0= 640 t1= 656 word_start=false "y"
|
||||
[20] id= 2059 frame= 82 dur_idx= 3 dur_val= 3 p=1.0000 plog=-17.7694 t0= 656 t1= 680 word_start=true "▁can"
|
||||
[21] id= 458 frame= 85 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.2510 t0= 680 t1= 712 word_start=true "▁do"
|
||||
[22] id= 509 frame= 89 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.0688 t0= 712 t1= 744 word_start=true "▁for"
|
||||
[23] id= 1180 frame= 93 dur_idx= 4 dur_val= 4 p=0.9999 plog=-25.0567 t0= 744 t1= 776 word_start=true "▁you"
|
||||
[24] id= 7877 frame= 98 dur_idx= 4 dur_val= 4 p=0.8820 plog=-14.2549 t0= 776 t1= 776 word_start=false ","
|
||||
[25] id= 279 frame=102 dur_idx= 3 dur_val= 3 p=0.9992 plog=-16.8176 t0= 816 t1= 840 word_start=true "▁a"
|
||||
[26] id= 583 frame=105 dur_idx= 4 dur_val= 4 p=1.0000 plog=-21.0352 t0= 840 t1= 872 word_start=false "sk"
|
||||
[27] id= 3470 frame=109 dur_idx= 3 dur_val= 3 p=0.9999 plog=-15.4659 t0= 872 t1= 896 word_start=true "▁what"
|
||||
[28] id= 1180 frame=112 dur_idx= 4 dur_val= 4 p=0.9997 plog=-17.6392 t0= 896 t1= 928 word_start=true "▁you"
|
||||
[29] id= 2059 frame=116 dur_idx= 3 dur_val= 3 p=0.9999 plog=-15.5484 t0= 928 t1= 952 word_start=true "▁can"
|
||||
[30] id= 458 frame=119 dur_idx= 2 dur_val= 2 p=1.0000 plog=-15.9953 t0= 952 t1= 968 word_start=true "▁do"
|
||||
[31] id= 509 frame=121 dur_idx= 3 dur_val= 3 p=1.0000 plog=-15.9605 t0= 968 t1= 992 word_start=true "▁for"
|
||||
[32] id= 3629 frame=124 dur_idx= 2 dur_val= 2 p=0.9994 plog=-12.2083 t0= 992 t1=1008 word_start=true "▁your"
|
||||
[33] id= 867 frame=126 dur_idx= 2 dur_val= 2 p=0.9969 plog=-9.1252 t0=1008 t1=1024 word_start=true "▁co"
|
||||
[34] id= 331 frame=128 dur_idx= 1 dur_val= 1 p=0.9999 plog=-12.6911 t0=1024 t1=1032 word_start=false "un"
|
||||
[35] id= 958 frame=129 dur_idx= 1 dur_val= 1 p=1.0000 plog=-8.8885 t0=1032 t1=1040 word_start=false "tr"
|
||||
[36] id= 7893 frame=130 dur_idx= 2 dur_val= 2 p=1.0000 plog=-14.1441 t0=1040 t1=1056 word_start=false "y"
|
||||
[37] id= 7883 frame=132 dur_idx= 4 dur_val= 4 p=0.9567 plog=-11.5227 t0=1056 t1=1056 word_start=false "."
|
||||
```
|
||||
|
||||
### Model conversion
|
||||
Clone the original model from Hugging Face:
|
||||
```console
|
||||
$ git clone https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3
|
||||
```
|
||||
Convert the model:
|
||||
```console
|
||||
(venv) $ python models/convert-parakeet-to-ggml.py \
|
||||
--model <path to cloned model> \
|
||||
--use-f32 \
|
||||
--out-dir models \
|
||||
--out-name ggml-parakeet-tdt-0.6b-v3.bin
|
||||
```
|
||||
|
||||
[Parakeet]: https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3
|
||||
|
|
@ -0,0 +1,220 @@
|
|||
#include "parakeet.h"
|
||||
#include "common-whisper.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
// command-line parameters
|
||||
struct parakeet_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t chunk_length_ms = 10000;
|
||||
int32_t left_context_ms = 10000;
|
||||
int32_t right_context_ms = 4960;
|
||||
|
||||
bool use_gpu = true;
|
||||
bool flash_attn = true;
|
||||
int32_t gpu_device = 0;
|
||||
|
||||
bool print_segments = false;
|
||||
|
||||
std::string model = "models/ggml-parakeet-tdt-0.6b-v3.bin";
|
||||
std::vector<std::string> fname_inp = {};
|
||||
};
|
||||
|
||||
static void parakeet_print_usage(int argc, char ** argv, const parakeet_params & params);
|
||||
|
||||
static char * requires_value_error(const std::string & arg) {
|
||||
fprintf(stderr, "error: argument %s requires value\n", arg.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static bool parakeet_params_parse(int argc, char ** argv, parakeet_params & params) {
|
||||
if (const char * env_device = std::getenv("PARAKEET_ARG_DEVICE")) {
|
||||
params.gpu_device = std::stoi(env_device);
|
||||
}
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-"){
|
||||
params.fname_inp.push_back(arg);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg[0] != '-') {
|
||||
params.fname_inp.push_back(arg);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
parakeet_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
#define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-cl" || arg == "--chunk-length") { params.chunk_length_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-lc" || arg == "--left-context") { params.left_context_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-rc" || arg == "--right-context") { params.right_context_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = ARGV_NEXT; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(ARGV_NEXT); }
|
||||
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
|
||||
else if (arg == "-dev" || arg == "--device") { params.gpu_device = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = false; }
|
||||
else if (arg == "-nfa" || arg == "--no-flash-attn") { params.flash_attn = false; }
|
||||
else if (arg == "-ps" || arg == "--print-segments") { params.print_segments = true; }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
parakeet_print_usage(argc, argv, params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void parakeet_print_usage(int /*argc*/, char ** argv, const parakeet_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options] file0 file1 ...\n", argv[0]);
|
||||
fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -cl N, --chunk-length N [%-7d] chunk length in milliseconds\n", params.chunk_length_ms);
|
||||
fprintf(stderr, " -lc N, --left-context N [%-7d] left context in milliseconds\n", params.left_context_ms);
|
||||
fprintf(stderr, " -rc N, --right-context N [%-7d] right context in milliseconds\n", params.right_context_ms);
|
||||
fprintf(stderr, " -m, --model FILE [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f, --file FILE [%-7s] input audio file\n", "");
|
||||
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
|
||||
fprintf(stderr, " -dev N, --device N [%-7d] GPU device to use\n", params.gpu_device);
|
||||
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
|
||||
fprintf(stderr, " -nfa, --no-flash-attn [%-7s] disable flash attention\n", !params.flash_attn ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-segments [%-7s] print segment information\n", params.print_segments ? "true" : "false");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
|
||||
static bool is_first = true;
|
||||
|
||||
const char * token_str = parakeet_token_to_str(ctx, token_data->id);
|
||||
char text_buf[256];
|
||||
parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));
|
||||
printf("%s", text_buf);
|
||||
fflush(stdout);
|
||||
|
||||
is_first = false;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_backend_load_all();
|
||||
|
||||
parakeet_params params;
|
||||
|
||||
if (parakeet_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.fname_inp.empty()) {
|
||||
fprintf(stderr, "error: no input files specified\n");
|
||||
parakeet_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Process each input file
|
||||
for (const auto & fname : params.fname_inp) {
|
||||
fprintf(stderr, "\nProcessing file: %s\n", fname.c_str());
|
||||
|
||||
std::vector<float> pcmf32;
|
||||
std::vector<std::vector<float>> pcmf32s;
|
||||
if (!read_audio_data(fname.c_str(), pcmf32, pcmf32s, false)) {
|
||||
fprintf(stderr, "error: failed to read audio file '%s'\n", fname.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pcmf32.empty()) {
|
||||
fprintf(stderr, "error: no audio data in file '%s'\n", fname.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
fprintf(stderr, "Loading Parakeet model from: %s\n", params.model.c_str());
|
||||
|
||||
struct parakeet_context_params ctx_params = parakeet_context_default_params();
|
||||
ctx_params.use_gpu = params.use_gpu;
|
||||
ctx_params.flash_attn = params.flash_attn;
|
||||
ctx_params.gpu_device = params.gpu_device;
|
||||
|
||||
struct parakeet_context * pctx = parakeet_init_from_file_with_params(params.model.c_str(), ctx_params);
|
||||
if (pctx == nullptr) {
|
||||
fprintf(stderr, "error: failed to load Parakeet model from '%s'\n", params.model.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "Successfully loaded Parakeet model\n");
|
||||
fprintf(stderr, "Processing audio (%zu samples, %.2f seconds)\n",
|
||||
pcmf32.size(), (float)pcmf32.size() / PARAKEET_SAMPLE_RATE);
|
||||
|
||||
struct parakeet_full_params full_params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
|
||||
full_params.n_threads = params.n_threads;
|
||||
full_params.chunk_length_ms = params.chunk_length_ms;
|
||||
full_params.left_context_ms = params.left_context_ms;
|
||||
full_params.right_context_ms = params.right_context_ms;
|
||||
full_params.new_token_callback = token_callback;
|
||||
full_params.new_token_callback_user_data = nullptr;
|
||||
|
||||
const int mel_frames = (int)(pcmf32.size() / PARAKEET_HOP_LENGTH);
|
||||
if (mel_frames <= parakeet_n_audio_ctx(pctx)) {
|
||||
full_params.chunk_length_ms = 0;
|
||||
}
|
||||
|
||||
int ret = parakeet_full(pctx, full_params, pcmf32.data(), pcmf32.size());
|
||||
|
||||
if (ret != 0) {
|
||||
fprintf(stderr, "error: failed to process audio file '%s'\n", fname.c_str());
|
||||
parakeet_free(pctx);
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
if (params.print_segments) {
|
||||
const int n_segments = parakeet_full_n_segments(pctx);
|
||||
fprintf(stderr, "\nSegments (%d):\n", n_segments);
|
||||
|
||||
for (int i = 0; i < n_segments; i++) {
|
||||
const char * text = parakeet_full_get_segment_text(pctx, i);
|
||||
const int64_t t0 = parakeet_full_get_segment_t0(pctx, i);
|
||||
const int64_t t1 = parakeet_full_get_segment_t1(pctx, i);
|
||||
const int n_tokens = parakeet_full_n_tokens(pctx, i);
|
||||
|
||||
fprintf(stderr, "Segment %d: [%lld -> %lld] \"%s\"\n", i, (long long)t0, (long long)t1, text);
|
||||
fprintf(stderr, "Tokens [%d]:\n", n_tokens);
|
||||
|
||||
for (int j = 0; j < n_tokens; j++) {
|
||||
parakeet_token_data token_data = parakeet_full_get_token_data(pctx, i, j);
|
||||
const char * token_str = parakeet_token_to_str(pctx, token_data.id);
|
||||
|
||||
fprintf(stderr, " [%2d] id=%5d frame=%3d dur_idx=%2d dur_val=%2d p=%.4f plog=%.4f t0=%4lld t1=%4lld word_start=%s \"%s\"\n",
|
||||
j,
|
||||
token_data.id,
|
||||
token_data.frame_index,
|
||||
token_data.duration_idx,
|
||||
token_data.duration_value,
|
||||
token_data.p,
|
||||
token_data.plog,
|
||||
(long long)token_data.t0,
|
||||
(long long)token_data.t1,
|
||||
token_data.is_word_start ? "true": "false",
|
||||
token_str);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
parakeet_free(pctx);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,383 @@
|
|||
#ifndef PARAKEET_H
|
||||
#define PARAKEET_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __GNUC__
|
||||
# define PARAKEET_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
||||
#elif defined(_MSC_VER)
|
||||
# define PARAKEET_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
||||
#else
|
||||
# define PARAKEET_DEPRECATED(func, hint) func
|
||||
#endif
|
||||
|
||||
#ifdef PARAKEET_SHARED
|
||||
# ifdef _WIN32
|
||||
# ifdef PARAKEET_BUILD
|
||||
# define PARAKEET_API __declspec(dllexport)
|
||||
# else
|
||||
# define PARAKEET_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define PARAKEET_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define PARAKEET_API
|
||||
#endif
|
||||
|
||||
#define PARAKEET_SAMPLE_RATE 16000
|
||||
#define PARAKEET_HOP_LENGTH 160
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct parakeet_context;
|
||||
struct parakeet_state;
|
||||
struct parakeet_full_params;
|
||||
|
||||
typedef int32_t parakeet_pos;
|
||||
typedef int32_t parakeet_token;
|
||||
typedef int32_t parakeet_seq_id;
|
||||
|
||||
struct parakeet_context_params {
|
||||
bool use_gpu;
|
||||
bool flash_attn;
|
||||
int gpu_device; // CUDA device
|
||||
};
|
||||
|
||||
typedef struct parakeet_token_data {
|
||||
parakeet_token id; // the BPE subword ID (0-8191)
|
||||
|
||||
int duration_idx; // index into the models durations array
|
||||
int duration_value; // actual duration value
|
||||
int frame_index;
|
||||
|
||||
float p;
|
||||
float plog;
|
||||
|
||||
int64_t t0;
|
||||
int64_t t1;
|
||||
|
||||
bool is_word_start;
|
||||
} parakeet_token_data;
|
||||
|
||||
typedef struct parakeet_model_loader {
|
||||
void * context;
|
||||
|
||||
size_t (*read)(void * ctx, void * output, size_t read_size);
|
||||
bool (*eof)(void * ctx);
|
||||
void (*close)(void * ctx);
|
||||
} parakeet_model_loader;
|
||||
|
||||
PARAKEET_API const char * parakeet_version(void);
|
||||
|
||||
// Various functions for loading a ggml parakeet model.
|
||||
// Allocate (almost) all memory needed for the model.
|
||||
// Return NULL on failure
|
||||
PARAKEET_API struct parakeet_context * parakeet_init_from_file_with_params (const char * path_model, struct parakeet_context_params params);
|
||||
PARAKEET_API struct parakeet_context * parakeet_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct parakeet_context_params params);
|
||||
PARAKEET_API struct parakeet_context * parakeet_init_with_params (struct parakeet_model_loader * loader, struct parakeet_context_params params);
|
||||
|
||||
// These are the same as the above, but the internal state of the context is not allocated automatically
|
||||
// It is the responsibility of the caller to allocate the state using parakeet_init_state() (#523)
|
||||
PARAKEET_API struct parakeet_context * parakeet_init_from_file_with_params_no_state (const char * path_model, struct parakeet_context_params params);
|
||||
PARAKEET_API struct parakeet_context * parakeet_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct parakeet_context_params params);
|
||||
PARAKEET_API struct parakeet_context * parakeet_init_with_params_no_state (struct parakeet_model_loader * loader, struct parakeet_context_params params);
|
||||
|
||||
PARAKEET_API struct parakeet_state * parakeet_init_state(struct parakeet_context * ctx);
|
||||
|
||||
// Frees all allocated memory
|
||||
PARAKEET_API void parakeet_free (struct parakeet_context * ctx);
|
||||
PARAKEET_API void parakeet_free_state(struct parakeet_state * state);
|
||||
PARAKEET_API void parakeet_free_params(struct parakeet_full_params * params);
|
||||
PARAKEET_API void parakeet_free_context_params(struct parakeet_context_params * params);
|
||||
|
||||
// Convert RAW PCM audio to log mel spectrogram.
|
||||
// The resulting spectrogram is stored inside the default state of the provided parakeet context.
|
||||
// Returns 0 on success
|
||||
PARAKEET_API int parakeet_pcm_to_mel(
|
||||
struct parakeet_context * ctx,
|
||||
const float * samples,
|
||||
int n_samples,
|
||||
int n_threads);
|
||||
|
||||
PARAKEET_API int parakeet_pcm_to_mel_with_state(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state,
|
||||
const float * samples,
|
||||
int n_samples,
|
||||
int n_threads);
|
||||
|
||||
// This can be used to set a custom log mel spectrogram inside the default state of the provided parakeet context.
|
||||
// Use this instead of parakeet_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
||||
// n_mel must be 80
|
||||
// Returns 0 on success
|
||||
PARAKEET_API int parakeet_set_mel(
|
||||
struct parakeet_context * ctx,
|
||||
const float * data,
|
||||
int n_len,
|
||||
int n_mel);
|
||||
|
||||
PARAKEET_API int parakeet_set_mel_with_state(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state,
|
||||
const float * data,
|
||||
int n_len,
|
||||
int n_mel);
|
||||
|
||||
// Run the Parakeet encoder on the log mel spectrogram stored inside the default state in the provided parakeet context.
|
||||
// Make sure to call parakeet_pcm_to_mel() or parakeet_set_mel() first.
|
||||
// offset can be used to specify the offset of the first frame in the spectrogram.
|
||||
// Returns 0 on success
|
||||
PARAKEET_API int parakeet_encode(
|
||||
struct parakeet_context * ctx,
|
||||
int offset,
|
||||
int n_threads);
|
||||
|
||||
PARAKEET_API int parakeet_encode_with_state(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state,
|
||||
int offset,
|
||||
int n_threads);
|
||||
|
||||
// Convert the provided text into tokens.
|
||||
// The tokens pointer must be large enough to hold the resulting tokens.
|
||||
// Returns the number of tokens on success, no more than n_max_tokens
|
||||
// Returns a negative number on failure - the number of tokens that would have been returned
|
||||
// TODO: not sure if correct
|
||||
PARAKEET_API int parakeet_tokenize(
|
||||
struct parakeet_context * ctx,
|
||||
const char * text,
|
||||
parakeet_token * tokens,
|
||||
int n_max_tokens);
|
||||
|
||||
// Return the number of tokens in the provided text
|
||||
// Equivalent to: -parakeet_tokenize(ctx, text, NULL, 0)
|
||||
int parakeet_token_count(struct parakeet_context * ctx, const char * text);
|
||||
|
||||
PARAKEET_API int parakeet_n_len (struct parakeet_context * ctx); // mel length
|
||||
PARAKEET_API int parakeet_n_len_from_state(struct parakeet_state * state); // mel length
|
||||
PARAKEET_API int parakeet_n_vocab (struct parakeet_context * ctx);
|
||||
PARAKEET_API int parakeet_n_audio_ctx (struct parakeet_context * ctx);
|
||||
|
||||
PARAKEET_API int parakeet_model_n_vocab (struct parakeet_context * ctx);
|
||||
PARAKEET_API int parakeet_model_n_audio_ctx (struct parakeet_context * ctx);
|
||||
PARAKEET_API int parakeet_model_n_audio_state(struct parakeet_context * ctx);
|
||||
PARAKEET_API int parakeet_model_n_audio_head (struct parakeet_context * ctx);
|
||||
PARAKEET_API int parakeet_model_n_audio_layer(struct parakeet_context * ctx);
|
||||
PARAKEET_API int parakeet_model_n_mels (struct parakeet_context * ctx);
|
||||
PARAKEET_API int parakeet_model_ftype (struct parakeet_context * ctx);
|
||||
|
||||
// Token logits obtained from the last call to parakeet_full/parakeet_chunk
|
||||
// The logits for the last token are stored in the last row
|
||||
// Rows: n_tokens
|
||||
// Cols: n_vocab
|
||||
PARAKEET_API float * parakeet_get_logits (struct parakeet_context * ctx);
|
||||
PARAKEET_API float * parakeet_get_logits_from_state(struct parakeet_state * state);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
PARAKEET_API const char * parakeet_token_to_str(struct parakeet_context * ctx, parakeet_token token);
|
||||
|
||||
PARAKEET_API int parakeet_token_to_text(const char * token_str, bool is_first, char * output, int max_len);
|
||||
|
||||
// Special tokens
|
||||
PARAKEET_API parakeet_token parakeet_token_blank (struct parakeet_context * ctx);
|
||||
PARAKEET_API parakeet_token parakeet_token_unk (struct parakeet_context * ctx);
|
||||
PARAKEET_API parakeet_token parakeet_token_bos(struct parakeet_context * ctx);
|
||||
|
||||
// Performance information from the default state.
|
||||
struct parakeet_timings {
|
||||
float sample_ms;
|
||||
float encode_ms;
|
||||
float decode_ms;
|
||||
float batchd_ms;
|
||||
float prompt_ms;
|
||||
};
|
||||
PARAKEET_API struct parakeet_timings * parakeet_get_timings(struct parakeet_context * ctx);
|
||||
PARAKEET_API void parakeet_print_timings(struct parakeet_context * ctx);
|
||||
PARAKEET_API void parakeet_reset_timings(struct parakeet_context * ctx);
|
||||
|
||||
// Print system information
|
||||
PARAKEET_API const char * parakeet_print_system_info(void);
|
||||
|
||||
// Available sampling strategies
|
||||
enum parakeet_sampling_strategy {
|
||||
PARAKEET_SAMPLING_GREEDY,
|
||||
};
|
||||
|
||||
// Token callback.
|
||||
// Called for each new predicted token.
|
||||
// Use the parakeet_full_...() functions to obtain the text segments
|
||||
typedef void (*parakeet_new_token_callback)(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state,
|
||||
const parakeet_token_data * token_data,
|
||||
void * user_data);
|
||||
|
||||
// Text segment callback
|
||||
// Called on every newly generated text segment
|
||||
// Use the parakeet_full_...() functions to obtain the text segments
|
||||
typedef void (*parakeet_new_segment_callback)(struct parakeet_context * ctx, struct parakeet_state * state, int n_new, void * user_data);
|
||||
|
||||
// Progress callback
|
||||
typedef void (*parakeet_progress_callback)(struct parakeet_context * ctx, struct parakeet_state * state, int progress, void * user_data);
|
||||
|
||||
// Encoder begin callback
|
||||
// If not NULL, called before the encoder starts
|
||||
// If it returns false, the computation is aborted
|
||||
typedef bool (*parakeet_encoder_begin_callback)(struct parakeet_context * ctx, struct parakeet_state * state, void * user_data);
|
||||
|
||||
// Parameters for the parakeet_full() function
|
||||
// If you change the order or add new parameters, make sure to update the default values in parakeet.cpp:
|
||||
// parakeet_full_default_params()
|
||||
struct parakeet_full_params {
|
||||
enum parakeet_sampling_strategy strategy;
|
||||
|
||||
int n_threads;
|
||||
int offset_ms; // start offset in ms
|
||||
int duration_ms; // audio duration to process in ms
|
||||
|
||||
bool no_context; // do not use past transcription (if any) as context
|
||||
|
||||
// [EXPERIMENTAL] speed-up techniques
|
||||
int audio_ctx; // overwrite the audio context size (0 = use default)
|
||||
|
||||
int chunk_length_ms; // length of each chunk in ms
|
||||
int left_context_ms; // left context in ms
|
||||
int right_context_ms; // right context in ms
|
||||
|
||||
// called for every newly generated text segment
|
||||
parakeet_new_segment_callback new_segment_callback;
|
||||
void * new_segment_callback_user_data;
|
||||
|
||||
// called for every newly generated token
|
||||
parakeet_new_token_callback new_token_callback;
|
||||
void * new_token_callback_user_data;
|
||||
|
||||
// called on each progress update
|
||||
parakeet_progress_callback progress_callback;
|
||||
void * progress_callback_user_data;
|
||||
|
||||
// called each time before the encoder starts
|
||||
parakeet_encoder_begin_callback encoder_begin_callback;
|
||||
void * encoder_begin_callback_user_data;
|
||||
|
||||
// called each time before ggml computation starts
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_user_data;
|
||||
};
|
||||
|
||||
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see parakeet_free_context_params() & parakeet_free_params()
|
||||
PARAKEET_API struct parakeet_context_params * parakeet_context_default_params_by_ref(void);
|
||||
PARAKEET_API struct parakeet_context_params parakeet_context_default_params (void);
|
||||
|
||||
PARAKEET_API struct parakeet_full_params * parakeet_full_default_params_by_ref(enum parakeet_sampling_strategy strategy);
|
||||
PARAKEET_API struct parakeet_full_params parakeet_full_default_params (enum parakeet_sampling_strategy strategy);
|
||||
|
||||
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
// Not thread safe for same context
|
||||
PARAKEET_API int parakeet_full(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_full_params params,
|
||||
const float * samples,
|
||||
int n_samples);
|
||||
|
||||
PARAKEET_API int parakeet_full_with_state(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state,
|
||||
struct parakeet_full_params params,
|
||||
const float * samples,
|
||||
int n_samples);
|
||||
|
||||
// Split the input audio in chunks and process each chunk separately using parakeet_full_with_state()
|
||||
// Result is stored in the default state of the context
|
||||
// Not thread safe if executed in parallel on the same context.
|
||||
PARAKEET_API int parakeet_full_parallel(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_full_params params,
|
||||
const float * samples,
|
||||
int n_samples,
|
||||
int n_processors);
|
||||
|
||||
// Process a single chunk of audio data that fits within the model's audio context window.
|
||||
// This is more efficient than parakeet_full() for short audio clips.
|
||||
PARAKEET_API int parakeet_chunk(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state,
|
||||
struct parakeet_full_params params,
|
||||
const float * samples,
|
||||
int n_samples);
|
||||
|
||||
// Initialize streaming state for a new stream.
|
||||
PARAKEET_API int parakeet_stream_init(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state,
|
||||
struct parakeet_full_params params);
|
||||
|
||||
// Push audio samples in streaming mode. Internally this function will structure
|
||||
// the samples in a buffer where with a left context, a center chunk, and a
|
||||
// right context. The encoder will see the complete buffer which enables it
|
||||
// to get boundry context for the target/center audio chunk. This avoids hard
|
||||
// cut offs at the chunk boundaries. The joint network then only sees the
|
||||
// center chunk and this function internally handles the context windowing.
|
||||
PARAKEET_API int parakeet_stream_push(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state,
|
||||
const float * samples,
|
||||
int n_samples);
|
||||
|
||||
// Flush the final partial chunk at end-of-stream.
|
||||
PARAKEET_API int parakeet_stream_flush(
|
||||
struct parakeet_context * ctx,
|
||||
struct parakeet_state * state);
|
||||
|
||||
// Number of generated text segments
|
||||
PARAKEET_API int parakeet_full_n_segments (struct parakeet_context * ctx);
|
||||
PARAKEET_API int parakeet_full_n_segments_from_state(struct parakeet_state * state);
|
||||
|
||||
// Get the start and end time of the specified segment
|
||||
PARAKEET_API int64_t parakeet_full_get_segment_t0 (struct parakeet_context * ctx, int i_segment);
|
||||
PARAKEET_API int64_t parakeet_full_get_segment_t0_from_state(struct parakeet_state * state, int i_segment);
|
||||
|
||||
PARAKEET_API int64_t parakeet_full_get_segment_t1 (struct parakeet_context * ctx, int i_segment);
|
||||
PARAKEET_API int64_t parakeet_full_get_segment_t1_from_state(struct parakeet_state * state, int i_segment);
|
||||
|
||||
// Get the text of the specified segment
|
||||
PARAKEET_API const char * parakeet_full_get_segment_text (struct parakeet_context * ctx, int i_segment);
|
||||
PARAKEET_API const char * parakeet_full_get_segment_text_from_state(struct parakeet_state * state, int i_segment);
|
||||
|
||||
// Get number of tokens in the specified segment
|
||||
PARAKEET_API int parakeet_full_n_tokens (struct parakeet_context * ctx, int i_segment);
|
||||
PARAKEET_API int parakeet_full_n_tokens_from_state(struct parakeet_state * state, int i_segment);
|
||||
|
||||
// Get the token text of the specified token in the specified segment
|
||||
PARAKEET_API const char * parakeet_full_get_token_text (struct parakeet_context * ctx, int i_segment, int i_token);
|
||||
PARAKEET_API const char * parakeet_full_get_token_text_from_state(struct parakeet_context * ctx, struct parakeet_state * state, int i_segment, int i_token);
|
||||
|
||||
// Get the token id of the specified token in the specified segment
|
||||
PARAKEET_API parakeet_token parakeet_full_get_token_id (struct parakeet_context * ctx, int i_segment, int i_token);
|
||||
PARAKEET_API parakeet_token parakeet_full_get_token_id_from_state(struct parakeet_state * state, int i_segment, int i_token);
|
||||
|
||||
// Get token data for the specified token in the specified segment
|
||||
PARAKEET_API parakeet_token_data parakeet_full_get_token_data (struct parakeet_context * ctx, int i_segment, int i_token);
|
||||
PARAKEET_API parakeet_token_data parakeet_full_get_token_data_from_state(struct parakeet_state * state, int i_segment, int i_token);
|
||||
|
||||
// Get the probability of the specified token in the specified segment
|
||||
PARAKEET_API float parakeet_full_get_token_p (struct parakeet_context * ctx, int i_segment, int i_token);
|
||||
PARAKEET_API float parakeet_full_get_token_p_from_state(struct parakeet_state * state, int i_segment, int i_token);
|
||||
|
||||
// Control logging output; default behavior is to print to stderr
|
||||
|
||||
PARAKEET_API void parakeet_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,331 @@
|
|||
#!/usr/bin/env python3
|
||||
# Convert Parakeet TDT model from NeMo format to ggml format
|
||||
#
|
||||
# Usage: python convert-parakeet-to-ggml.py --model parakeet-model.nemo --out-dir output-dir [--use-f32] [--out-name NAME]
|
||||
#
|
||||
# The NeMo file is a tar archive containing:
|
||||
# - model_weights.ckpt (PyTorch checkpoint)
|
||||
# - model_config.yaml (model configuration)
|
||||
# - tokenizer files
|
||||
#
|
||||
# This script extracts the NeMo archive, loads the model weights and configuration,
|
||||
# and saves them in ggml format compatible with whisper.cpp.
|
||||
#
|
||||
|
||||
import torch
|
||||
import argparse
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
import tarfile
|
||||
import tempfile
|
||||
import shutil
|
||||
import yaml
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
def hz_to_mel(freq):
    """Map a frequency in Hz onto the mel scale (2595 * log10(1 + f/700))."""
    return 2595.0 * np.log10(freq / 700.0 + 1.0)
||||
|
||||
def mel_to_hz(mel):
    """Inverse of hz_to_mel: map a mel value back to a frequency in Hz."""
    return 700.0 * (np.power(10.0, mel / 2595.0) - 1.0)
||||
|
||||
def create_relative_positional_encoding(d_model: int, n_pos_max_len: int) -> np.ndarray:
    """Build the relative positional encoding table written as 'encoder.pe'.

    Positions run from +(max_len // 2) at row 0 down to -(max_len // 2) at the
    last row, where max_len = 2 * n_pos_max_len - 1, with the usual interleaved
    sin/cos layout over the feature dimension.

    Args:
        d_model:       feature dimension (one sin/cos pair per two features).
        n_pos_max_len: maximum sequence length the table must cover.

    Returns:
        float32 array of shape (2 * n_pos_max_len - 1, d_model).
    """
    max_len = n_pos_max_len * 2 - 1
    pe = np.zeros((max_len, d_model), dtype=np.float32)

    # Relative positions: the center row gets 0, earlier rows are positive.
    positions = (max_len // 2) - np.arange(max_len, dtype=np.float64)

    # Inverse frequencies for the even feature indices: 10000^(-i / d_model).
    inv_freq = np.exp(-np.arange(0, d_model, 2, dtype=np.float64) * np.log(10000.0) / d_model)

    angles = positions[:, None] * inv_freq[None, :]
    pe[:, 0::2] = np.sin(angles)
    # Slice to d_model // 2 columns: for an odd d_model there is one fewer cos
    # column than sin column (the original per-element loop indexed pe[idx, i + 1]
    # out of bounds in that case).
    pe[:, 1::2] = np.cos(angles[:, : d_model // 2])

    return pe
||||
|
||||
def extract_nemo_archive(nemo_path, extract_dir):
    """Extract a .nemo tar archive into extract_dir.

    The archive is external input, so members are sanitized before extraction:
    on Python versions with tarfile extraction filters the 'data' filter rejects
    absolute paths, '..' traversal, links escaping the target, etc.; older
    versions fall back to an explicit path check.
    """
    print(f"Extracting {nemo_path} to {extract_dir}")
    with tarfile.open(nemo_path, 'r') as tar:
        try:
            tar.extractall(path=extract_dir, filter='data')
        except TypeError:
            # Python without the 'filter' parameter: verify every member stays
            # inside extract_dir before extracting.
            base = os.path.realpath(extract_dir)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(extract_dir, member.name))
                if target != base and not target.startswith(base + os.sep):
                    raise ValueError(f"Unsafe path in archive: {member.name}")
            tar.extractall(path=extract_dir)
    print("Extraction complete")
||||
|
||||
def load_model_config(config_path):
    """Read the NeMo model_config.yaml and return its contents as a dict."""
    with open(config_path, 'r') as config_file:
        return yaml.safe_load(config_file)
||||
|
||||
def load_tokenizer(extract_dir, config):
    """Locate the tokenizer files in the extracted archive and build the vocab.

    Scans extract_dir for a '*_tokenizer.model' file and a '*tokenizer.vocab'
    file, then reads the vocab (one tab-separated entry per line) into a
    mapping of UTF-8 token bytes -> line index. Warns if the count differs
    from the expected 8192.
    """
    model_path = None
    vocab_path = None

    for entry in os.listdir(extract_dir):
        if entry.endswith('_tokenizer.model'):
            model_path = os.path.join(extract_dir, entry)
        elif entry.endswith('tokenizer.vocab'):
            vocab_path = os.path.join(extract_dir, entry)

    if not model_path:
        raise FileNotFoundError("Tokenizer model file not found")

    if not vocab_path:
        raise FileNotFoundError("Tokenizer vocab file not found")

    tokens = {}
    with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
        for idx, line in enumerate(vocab_file):
            columns = line.strip().split('\t')
            if columns:
                tokens[columns[0].encode('utf-8')] = idx

    print(f"Loaded {len(tokens)} tokens from {os.path.basename(vocab_path)}")

    if len(tokens) != 8192:
        print(f"WARNING: Expected 8192 tokens, got {len(tokens)}")

    return tokens
|
||||
|
||||
def write_tensor(fout, name, data, use_f16=True, force_f32=False):
    """Serialize one tensor to the ggml file.

    Layout: struct header (n_dims, name length, ftype), the dims in reverse
    order, the UTF-8 name, then the raw tensor bytes. ftype is 1 for f16
    payloads and 0 for f32; 1-D tensors, biases, and norms always stay f32,
    and force_f32 overrides everything.
    """
    # pre_encode conv biases arrive as 1-D; expand to 4-D to match conv weights.
    if 'pre_encode.conv' in name and 'bias' in name and len(data.shape) == 1:
        data = data.reshape(1, -1, 1, 1)
        print(f"  Reshaped conv bias {name} to {data.shape}")

    n_dims = len(data.shape)

    # Choose the on-disk precision.
    if force_f32 or not use_f16:
        ftype = 0
        data = data.astype(np.float32)
    elif n_dims < 2 or 'bias' in name or 'norm' in name:
        # Keep small/sensitive tensors in f32 even in f16 mode.
        ftype = 0
        data = data.astype(np.float32)
    else:
        ftype = 1
        data = data.astype(np.float16)

    dims_reversed = [int(d) for d in reversed(data.shape)]
    print(f"Processing: {name} {list(data.shape)}, dtype: {data.dtype}, n_dims: {n_dims}, reversed: {dims_reversed}")

    encoded_name = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(encoded_name), ftype))
    for dim in dims_reversed:
        fout.write(struct.pack("i", dim))
    fout.write(encoded_name)

    data.tofile(fout)
|
||||
|
||||
def convert_parakeet_to_ggml(nemo_path, output_dir, use_f16=True, out_name=None):
    """Convert a NeMo Parakeet TDT archive into a single ggml binary.

    Steps: extract the .nemo tar into a temp dir, read model_config.yaml,
    load the PyTorch checkpoint and tokenizer, then write the ggml file in
    order: magic, hyperparameters, mel filterbank, STFT window, TDT
    durations, vocabulary, all weight tensors, and finally the generated
    positional-encoding table.

    Args:
        nemo_path:  path to the .nemo archive.
        output_dir: directory the ggml file is written into (created if needed).
        use_f16:    store eligible 2-D+ weights as f16 (see write_tensor).
        out_name:   optional output file name; defaults to ggml-model.bin,
                    or ggml-model-f32.bin when use_f16 is False.
    """
    nemo_path = Path(nemo_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create temporary directory for extraction
    with tempfile.TemporaryDirectory() as temp_dir:
        extract_nemo_archive(nemo_path, temp_dir)

        config_path = os.path.join(temp_dir, 'model_config.yaml')
        config = load_model_config(config_path)

        print("Model configuration:")
        print(f"  Sample rate: {config['sample_rate']}")
        print(f"  Encoder layers: {config['encoder']['n_layers']}")
        print(f"  Encoder d_model: {config['encoder']['d_model']}")
        print(f"  Mel features: {config['preprocessor']['features']}")

        weights_path = os.path.join(temp_dir, 'model_weights.ckpt')
        print(f"\nLoading model weights from {weights_path}")
        checkpoint = torch.load(weights_path, map_location='cpu')

        # Extract state dict (some checkpoints nest it under 'state_dict')
        if 'state_dict' in checkpoint:
            state_dict = checkpoint['state_dict']
        else:
            state_dict = checkpoint

        print(f"Loaded {len(state_dict)} tensors")

        # Load tokenizer
        print("\nLoading tokenizer...")
        tokens = load_tokenizer(temp_dir, config)
        print(f"Loaded {len(tokens)} tokens")

        # Prepare hyperparameters for the Parakeet ggml format.
        # NOTE(review): n_audio_ctx is hard-coded to 5000 rather than read
        # from the config — confirm it matches the model being converted.
        hparams = {
            'n_audio_ctx': 5000,
            'n_audio_state': config['encoder']['d_model'],
            'n_audio_head': config['encoder']['n_heads'],
            'n_audio_layer': config['encoder']['n_layers'],
            'n_mels': config['preprocessor']['features'],
            'n_fft': config['preprocessor']['n_fft'],
            'subsampling_factor': config['encoder']['subsampling_factor'],
            'n_subsampling_channels': config['encoder']['subsampling_conv_channels'],
            'n_pos_max_len': config['encoder']['pos_emb_max_len'],

            'n_pred_dim': config['decoder']['prednet']['pred_hidden'],
            'n_pred_layers': config['decoder']['prednet']['pred_rnn_layers'],
            'n_vocab': config['decoder']['vocab_size'],
            'n_tdt_durations': config['model_defaults']['num_tdt_durations'],
            'n_max_tokens': config['decoding']['greedy']['max_symbols'],
        }

        print("\nGGML hyperparameters:")
        for key, value in hparams.items():
            print(f"  {key}: {value}")

        pe = create_relative_positional_encoding(hparams['n_audio_state'], hparams['n_pos_max_len'])
        print(f"\nGenerated positional encoding tensor 'encoder.pe' with shape {pe.shape}")

        # Create output file
        if out_name:
            fname_out = output_dir / out_name
        else:
            fname_out = output_dir / ("ggml-model-f32.bin" if not use_f16 else "ggml-model.bin")
        print(f"\nWriting to {fname_out}")

        with open(fname_out, 'wb') as fout:
            # Write magic number
            fout.write(struct.pack("i", 0x67676d6c)) # 'ggml' in hex

            # Write hyperparameters — the order here is the file format; the
            # loader must read them back in exactly this sequence.
            fout.write(struct.pack("i", hparams['n_vocab']))
            fout.write(struct.pack("i", hparams['n_audio_ctx']))
            fout.write(struct.pack("i", hparams['n_audio_state']))
            fout.write(struct.pack("i", hparams['n_audio_head']))
            fout.write(struct.pack("i", hparams['n_audio_layer']))
            fout.write(struct.pack("i", hparams['n_mels']))
            fout.write(struct.pack("i", 1 if use_f16 else 0))
            fout.write(struct.pack("i", hparams['n_fft']))
            fout.write(struct.pack("i", hparams['subsampling_factor']))
            fout.write(struct.pack("i", hparams['n_subsampling_channels']))
            fout.write(struct.pack("i", hparams['n_pos_max_len']))
            fout.write(struct.pack("i", hparams['n_pred_dim']))
            fout.write(struct.pack("i", hparams['n_pred_layers']))
            fout.write(struct.pack("i", hparams['n_tdt_durations']))
            fout.write(struct.pack("i", hparams['n_max_tokens']))

            # Extract mel filterbank from model
            fb_key = None
            for key in state_dict.keys():
                if 'featurizer.fb' in key or 'filterbank' in key.lower():
                    fb_key = key
                    break

            if not fb_key:
                print("\nERROR: Mel filterbank not found in model!")
                print("Expected tensor with 'featurizer.fb' or 'filterbank' in name")
                print("\nAvailable preprocessor tensors:")
                for key in sorted(state_dict.keys()):
                    if 'preprocessor' in key or 'featurizer' in key:
                        print(f"  {key}: {state_dict[key].shape}")
                raise ValueError("Mel filterbank tensor not found in model")

            print(f"\nUsing model's mel filterbank from: {fb_key}")
            mel_filters = state_dict[fb_key].squeeze().numpy().astype(np.float32)
            print(f"  Filterbank shape: {mel_filters.shape}")
            print(f"  Filterbank min/max values: {mel_filters.min():.6f} / {mel_filters.max():.6f}")
            print(f"  Filterbank non-zero elements: {np.count_nonzero(mel_filters)} / {mel_filters.size}")
            print(f"  First row sum: {mel_filters[0].sum():.6f}")

            if len(mel_filters.shape) != 2:
                raise ValueError(f"Expected 2D filterbank, got shape {mel_filters.shape}")

            n_mels, n_freqs = mel_filters.shape
            fout.write(struct.pack("i", n_mels)) # n_mel
            fout.write(struct.pack("i", n_freqs)) # n_fb (frequency bins)

            # Write mel filterbank (row-major, one f32 per coefficient)
            for i in range(n_mels):
                for j in range(n_freqs):
                    fout.write(struct.pack("f", mel_filters[i, j]))

            # Extract window function from model
            window_key = None
            for key in state_dict.keys():
                # Matches 'featurizer.window' OR ('preproc' AND 'window') —
                # 'and' binds tighter than 'or' here.
                if 'featurizer.window' in key or 'preproc' in key and 'window' in key:
                    window_key = key
                    break

            if not window_key:
                print("\nERROR: Window function not found in model!")
                print("Expected tensor with 'featurizer.window' in name")
                raise ValueError("Window function tensor not found in model")

            print(f"\nUsing model's window function from: {window_key}")
            window = state_dict[window_key].squeeze().numpy().astype(np.float32)
            print(f"  Window shape: {window.shape}")
            print(f"  Window min/max values: {window.min():.6f} / {window.max():.6f}")
            print(f"  Window non-zero elements: {np.count_nonzero(window)} / {window.size}")
            print(f"  Window sum: {window.sum():.6f}")

            if len(window.shape) != 1:
                raise ValueError(f"Expected 1D window, got shape {window.shape}")

            n_window = window.shape[0]
            fout.write(struct.pack("i", n_window))

            # Write window function
            for i in range(n_window):
                fout.write(struct.pack("f", window[i]))

            # Write TDT durations (as unsigned 32-bit ints)
            tdt_durations = config['model_defaults']['tdt_durations']
            if len(tdt_durations) != hparams['n_tdt_durations']:
                raise ValueError(f"TDT durations count mismatch: {len(tdt_durations)} vs {hparams['n_tdt_durations']}")

            for duration in tdt_durations:
                fout.write(struct.pack("I", duration))

            # Vocabulary: token count, then (byte length, utf-8 bytes) per
            # token in vocabulary-index order.
            fout.write(struct.pack("i", len(tokens)))
            for token_bytes, idx in sorted(tokens.items(), key=lambda x: x[1]):
                fout.write(struct.pack("i", len(token_bytes)))
                fout.write(token_bytes)

            print("\nConverting model weights...")
            for name, tensor in state_dict.items():
                # Skip the filterbank and window - already written in preprocessing section
                if name == fb_key:
                    continue
                if name == window_key:
                    continue

                # Don't squeeze Conv2d weights - they need to preserve all 4 dimensions
                if 'conv' in name and 'weight' in name and len(tensor.shape) == 4:
                    data = tensor.numpy()
                else:
                    data = tensor.squeeze().numpy()

                write_tensor(fout, name, data, use_f16=use_f16)

            # The positional encoding is always stored in f32.
            write_tensor(fout, "encoder.pe", pe, use_f16=use_f16, force_f32=True)

        print(f"\nConversion complete!")
        print(f"Output file: {fname_out}")
        print(f"File size: {fname_out.stat().st_size / (1024**2):.2f} MB")
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point: parse arguments and run the conversion.
    parser = argparse.ArgumentParser(
        description='Convert Parakeet TDT model from NeMo format to ggml format'
    )
    parser.add_argument('--model', type=str, required=True,
                        help='Path to Parakeet .nemo model file')
    parser.add_argument('--out-dir', type=str, required=True,
                        help='Directory to write ggml model file')
    parser.add_argument('--use-f32', action='store_true', default=False,
                        help='Use f32 instead of f16 (default: f16)')
    parser.add_argument('--out-name', type=str, default=None,
                        help='Output file name (default: ggml-model.bin or ggml-model-f32.bin)')

    args = parser.parse_args()

    if not os.path.exists(args.model):
        print(f"Error: {args.model} not found")
        sys.exit(1)

    convert_parakeet_to_ggml(args.model, args.out_dir, not args.use_f32, args.out_name)
|
||||
|
|
@ -0,0 +1 @@
|
|||
pyyaml
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
import os
|
||||
from huggingface_hub import HfApi, create_repo
|
||||
|
||||
# TODO: change to ggml-org once merged.
# Hugging Face account and target repository for the upload.
USER_NAME = "danbev"
REPO_ID = f"{USER_NAME}/parakeet"
# Local ggml binary to publish, and the file name it gets inside the repo.
LOCAL_GGUF_PATH = "models/ggml-parakeet-tdt-0.6b-v3.bin"
REMOTE_GGUF_NAME = "parakeet-tdt-0.6b-v3.bin"

# README.md (model card) content pushed to the repo on every deployment.
MODEL_CARD_CONTENT = f"""---
license: apache-2.0
base_model: {USER_NAME}/parakeet
tags:
- gguf
---

# Parakeet Model Card

## Description
This is an iterative release of the Parakeet model in whisper.cpp format.

## Usage
You can use this file with [parakeet-cli](https://github.com/danbev/whisper.cpp/tree/parakeet-support/examples/parakeet-cli).

Build parakeet-cli:
```console
$ git clone -b parakeet-support https://github.com/danbev/whisper.cpp.git
$ cd whisper.cpp
$ cmake -B build -S .
$ cmake --build build --target parakeet-cli -j 12
```

Download the model:
```console
$ hf download danbev/parakeet parakeet-tdt-0.6b-v3.bin --local-dir models
```

Run:
```console
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3.bin -f samples/jfk.wav
```

"""

# Shared client used for all Hub API calls below.
api = HfApi()
|
||||
|
||||
def deploy_iteration():
    """Create (or reuse) the HF repo, refresh the model card, then upload the model binary."""
    create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)

    uploads = [
        ("Updating Model Card...", MODEL_CARD_CONTENT.encode(), "README.md", "Update README.md"),
        (f"Uploading {REMOTE_GGUF_NAME}...", LOCAL_GGUF_PATH, REMOTE_GGUF_NAME, "Upload new parakeet iteration"),
    ]
    for status, payload, destination, commit_message in uploads:
        print(status)
        api.upload_file(
            path_or_fileobj=payload,
            path_in_repo=destination,
            repo_id=REPO_ID,
            repo_type="model",
            commit_message=commit_message
        )

    print("\nDeployment successful!")
    print(f"URL: https://huggingface.co/{REPO_ID}")
|
||||
|
||||
if __name__ == "__main__":
    # Only deploy when the local artifact actually exists.
    if not os.path.exists(LOCAL_GGUF_PATH):
        print(f"Error: {LOCAL_GGUF_PATH} not found.")
    else:
        deploy_iteration()
|
||||
|
|
@ -109,23 +109,43 @@ add_library(whisper
|
|||
whisper.cpp
|
||||
)
|
||||
|
||||
# Threads is needed by both the whisper and parakeet targets; locate it before
# the first reference to Threads::Threads below.
find_package(Threads REQUIRED)

add_library(parakeet
    ../include/parakeet.h
    parakeet-arch.h
    parakeet.cpp
)

target_include_directories(parakeet PUBLIC . ../include)
target_compile_features   (parakeet PUBLIC cxx_std_11) # keep in sync with whisper; don't bump
target_link_libraries     (parakeet PUBLIC ggml Threads::Threads)

# Set the version numbers
set_target_properties(whisper PROPERTIES
    VERSION ${PROJECT_VERSION}
    SOVERSION ${SOVERSION}
)

set_target_properties(parakeet PROPERTIES
    VERSION ${PROJECT_VERSION}
    SOVERSION ${SOVERSION}
)

target_include_directories(whisper PUBLIC . ../include)
target_compile_features   (whisper PUBLIC cxx_std_11) # don't bump

# Define the *_BIG_ENDIAN macros on big-endian hosts.
if (CMAKE_CXX_BYTE_ORDER STREQUAL "BIG_ENDIAN")
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_BIG_ENDIAN)
    set(PARAKEET_EXTRA_FLAGS ${PARAKEET_EXTRA_FLAGS} -DPARAKEET_BIG_ENDIAN)
endif()

if (WHISPER_EXTRA_FLAGS)
    target_compile_options(whisper PRIVATE ${WHISPER_EXTRA_FLAGS})
endif()

if (PARAKEET_EXTRA_FLAGS)
    target_compile_options(parakeet PRIVATE ${PARAKEET_EXTRA_FLAGS})
endif()

target_link_libraries(whisper PUBLIC ggml Threads::Threads)
|
||||
|
||||
|
|
@ -144,4 +164,7 @@ endif()
|
|||
if (BUILD_SHARED_LIBS)
    # PIC is required for shared objects. The *_SHARED/*_BUILD definitions are
    # set only while building each library itself — NOTE(review): presumably
    # consumed by export macros in the public headers; confirm there.
    set_target_properties(whisper PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(whisper PRIVATE WHISPER_SHARED WHISPER_BUILD)

    set_target_properties(parakeet PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(parakeet PRIVATE PARAKEET_SHARED PARAKEET_BUILD)
endif()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,194 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
// Identifier for every weight tensor in the Parakeet model. Used as the key
// into PARAKEET_TENSOR_NAMES (checkpoint name patterns, some with a %d layer
// placeholder) and PARAKEET_TENSOR_INFO (the ggml op each tensor feeds).
// Do not reorder: relative order may be relied on elsewhere.
enum parakeet_tensor {
    // Encoder pre_encode
    PARAKEET_TENSOR_ENC_PRE_OUT_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_OUT_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_0_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_0_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_2_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_2_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_3_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_3_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_5_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_5_BIAS,
    PARAKEET_TENSOR_ENC_PRE_CONV_6_WEIGHT,
    PARAKEET_TENSOR_ENC_PRE_CONV_6_BIAS,
    PARAKEET_TENSOR_ENC_PE,                  // generated positional encoding table

    // Encoder layers (per-layer)
    PARAKEET_TENSOR_ENC_NORM_FF1_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_FF1_BIAS,
    PARAKEET_TENSOR_ENC_FF1_LINEAR1_WEIGHT,
    PARAKEET_TENSOR_ENC_FF1_LINEAR2_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_CONV_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_CONV_BIAS,
    PARAKEET_TENSOR_ENC_CONV_PW1_WEIGHT,
    PARAKEET_TENSOR_ENC_CONV_DW_WEIGHT,
    PARAKEET_TENSOR_ENC_CONV_BN_WEIGHT,
    PARAKEET_TENSOR_ENC_CONV_BN_BIAS,
    PARAKEET_TENSOR_ENC_CONV_BN_MEAN,
    PARAKEET_TENSOR_ENC_CONV_BN_VAR,
    PARAKEET_TENSOR_ENC_CONV_BN_NUM_BATCHES,
    PARAKEET_TENSOR_ENC_CONV_PW2_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_ATTN_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_ATTN_BIAS,
    PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_U,
    PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_V,
    PARAKEET_TENSOR_ENC_ATTN_Q_WEIGHT,
    PARAKEET_TENSOR_ENC_ATTN_K_WEIGHT,
    PARAKEET_TENSOR_ENC_ATTN_V_WEIGHT,
    PARAKEET_TENSOR_ENC_ATTN_OUT_WEIGHT,
    PARAKEET_TENSOR_ENC_ATTN_POS_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_FF2_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_FF2_BIAS,
    PARAKEET_TENSOR_ENC_FF2_LINEAR1_WEIGHT,
    PARAKEET_TENSOR_ENC_FF2_LINEAR2_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_OUT_WEIGHT,
    PARAKEET_TENSOR_ENC_NORM_OUT_BIAS,

    // Prediction network
    PARAKEET_TENSOR_PRED_EMBED_WEIGHT,
    PARAKEET_TENSOR_PRED_LSTM_WEIGHT_IH,
    PARAKEET_TENSOR_PRED_LSTM_WEIGHT_HH,
    PARAKEET_TENSOR_PRED_LSTM_BIAS_IH,
    PARAKEET_TENSOR_PRED_LSTM_BIAS_HH,

    // Joint network
    PARAKEET_TENSOR_JOINT_PRED_WEIGHT,
    PARAKEET_TENSOR_JOINT_PRED_BIAS,
    PARAKEET_TENSOR_JOINT_ENC_WEIGHT,
    PARAKEET_TENSOR_JOINT_ENC_BIAS,
    PARAKEET_TENSOR_JOINT_NET_WEIGHT,
    PARAKEET_TENSOR_JOINT_NET_BIAS,
};
|
||||
|
||||
// Maps each tensor id to the name pattern it carries in the converted ggml
// file (which mirrors the NeMo checkpoint names). Patterns containing %d are
// formatted with the encoder-layer or LSTM-layer index before lookup.
static const std::map<parakeet_tensor, const char *> PARAKEET_TENSOR_NAMES = {
    // Encoder pre_encode
    {PARAKEET_TENSOR_ENC_PRE_OUT_WEIGHT, "encoder.pre_encode.out.weight"},
    {PARAKEET_TENSOR_ENC_PRE_OUT_BIAS, "encoder.pre_encode.out.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_0_WEIGHT, "encoder.pre_encode.conv.0.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_0_BIAS, "encoder.pre_encode.conv.0.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_2_WEIGHT, "encoder.pre_encode.conv.2.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_2_BIAS, "encoder.pre_encode.conv.2.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_3_WEIGHT, "encoder.pre_encode.conv.3.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_3_BIAS, "encoder.pre_encode.conv.3.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_5_WEIGHT, "encoder.pre_encode.conv.5.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_5_BIAS, "encoder.pre_encode.conv.5.bias"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_6_WEIGHT, "encoder.pre_encode.conv.6.weight"},
    {PARAKEET_TENSOR_ENC_PRE_CONV_6_BIAS, "encoder.pre_encode.conv.6.bias"},
    {PARAKEET_TENSOR_ENC_PE, "encoder.pe"},

    // Encoder layers (use %d for layer number)
    {PARAKEET_TENSOR_ENC_NORM_FF1_WEIGHT, "encoder.layers.%d.norm_feed_forward1.weight"},
    {PARAKEET_TENSOR_ENC_NORM_FF1_BIAS, "encoder.layers.%d.norm_feed_forward1.bias"},
    {PARAKEET_TENSOR_ENC_FF1_LINEAR1_WEIGHT, "encoder.layers.%d.feed_forward1.linear1.weight"},
    {PARAKEET_TENSOR_ENC_FF1_LINEAR2_WEIGHT, "encoder.layers.%d.feed_forward1.linear2.weight"},
    {PARAKEET_TENSOR_ENC_NORM_CONV_WEIGHT, "encoder.layers.%d.norm_conv.weight"},
    {PARAKEET_TENSOR_ENC_NORM_CONV_BIAS, "encoder.layers.%d.norm_conv.bias"},
    {PARAKEET_TENSOR_ENC_CONV_PW1_WEIGHT, "encoder.layers.%d.conv.pointwise_conv1.weight"},
    {PARAKEET_TENSOR_ENC_CONV_DW_WEIGHT, "encoder.layers.%d.conv.depthwise_conv.weight"},
    {PARAKEET_TENSOR_ENC_CONV_BN_WEIGHT, "encoder.layers.%d.conv.batch_norm.weight"},
    {PARAKEET_TENSOR_ENC_CONV_BN_BIAS, "encoder.layers.%d.conv.batch_norm.bias"},
    {PARAKEET_TENSOR_ENC_CONV_BN_MEAN, "encoder.layers.%d.conv.batch_norm.running_mean"},
    {PARAKEET_TENSOR_ENC_CONV_BN_VAR, "encoder.layers.%d.conv.batch_norm.running_var"},
    {PARAKEET_TENSOR_ENC_CONV_BN_NUM_BATCHES, "encoder.layers.%d.conv.batch_norm.num_batches_tracked"},
    {PARAKEET_TENSOR_ENC_CONV_PW2_WEIGHT, "encoder.layers.%d.conv.pointwise_conv2.weight"},
    {PARAKEET_TENSOR_ENC_NORM_ATTN_WEIGHT, "encoder.layers.%d.norm_self_att.weight"},
    {PARAKEET_TENSOR_ENC_NORM_ATTN_BIAS, "encoder.layers.%d.norm_self_att.bias"},
    {PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_U, "encoder.layers.%d.self_attn.pos_bias_u"},
    {PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_V, "encoder.layers.%d.self_attn.pos_bias_v"},
    {PARAKEET_TENSOR_ENC_ATTN_Q_WEIGHT, "encoder.layers.%d.self_attn.linear_q.weight"},
    {PARAKEET_TENSOR_ENC_ATTN_K_WEIGHT, "encoder.layers.%d.self_attn.linear_k.weight"},
    {PARAKEET_TENSOR_ENC_ATTN_V_WEIGHT, "encoder.layers.%d.self_attn.linear_v.weight"},
    {PARAKEET_TENSOR_ENC_ATTN_OUT_WEIGHT, "encoder.layers.%d.self_attn.linear_out.weight"},
    {PARAKEET_TENSOR_ENC_ATTN_POS_WEIGHT, "encoder.layers.%d.self_attn.linear_pos.weight"},
    {PARAKEET_TENSOR_ENC_NORM_FF2_WEIGHT, "encoder.layers.%d.norm_feed_forward2.weight"},
    {PARAKEET_TENSOR_ENC_NORM_FF2_BIAS, "encoder.layers.%d.norm_feed_forward2.bias"},
    {PARAKEET_TENSOR_ENC_FF2_LINEAR1_WEIGHT, "encoder.layers.%d.feed_forward2.linear1.weight"},
    {PARAKEET_TENSOR_ENC_FF2_LINEAR2_WEIGHT, "encoder.layers.%d.feed_forward2.linear2.weight"},
    {PARAKEET_TENSOR_ENC_NORM_OUT_WEIGHT, "encoder.layers.%d.norm_out.weight"},
    {PARAKEET_TENSOR_ENC_NORM_OUT_BIAS, "encoder.layers.%d.norm_out.bias"},

    // Prediction network (use %d for LSTM layer number)
    {PARAKEET_TENSOR_PRED_EMBED_WEIGHT, "decoder.prediction.embed.weight"},
    {PARAKEET_TENSOR_PRED_LSTM_WEIGHT_IH, "decoder.prediction.dec_rnn.lstm.weight_ih_l%d"},
    {PARAKEET_TENSOR_PRED_LSTM_WEIGHT_HH, "decoder.prediction.dec_rnn.lstm.weight_hh_l%d"},
    {PARAKEET_TENSOR_PRED_LSTM_BIAS_IH, "decoder.prediction.dec_rnn.lstm.bias_ih_l%d"},
    {PARAKEET_TENSOR_PRED_LSTM_BIAS_HH, "decoder.prediction.dec_rnn.lstm.bias_hh_l%d"},

    // Joint network
    {PARAKEET_TENSOR_JOINT_PRED_WEIGHT, "joint.pred.weight"},
    {PARAKEET_TENSOR_JOINT_PRED_BIAS, "joint.pred.bias"},
    {PARAKEET_TENSOR_JOINT_ENC_WEIGHT, "joint.enc.weight"},
    {PARAKEET_TENSOR_JOINT_ENC_BIAS, "joint.enc.bias"},
    {PARAKEET_TENSOR_JOINT_NET_WEIGHT, "joint.joint_net.2.weight"},
    {PARAKEET_TENSOR_JOINT_NET_BIAS, "joint.joint_net.2.bias"},
};
|
||||
|
||||
static const std::map<parakeet_tensor, ggml_op> PARAKEET_TENSOR_INFO = {
|
||||
// Encoder pre_encode
|
||||
{PARAKEET_TENSOR_ENC_PRE_OUT_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_PRE_OUT_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_0_WEIGHT, GGML_OP_IM2COL},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_0_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_2_WEIGHT, GGML_OP_IM2COL},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_2_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_3_WEIGHT, GGML_OP_IM2COL},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_3_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_5_WEIGHT, GGML_OP_IM2COL},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_5_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_6_WEIGHT, GGML_OP_IM2COL},
|
||||
{PARAKEET_TENSOR_ENC_PRE_CONV_6_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_PE, GGML_OP_ADD},
|
||||
|
||||
// Encoder layers
|
||||
{PARAKEET_TENSOR_ENC_NORM_FF1_WEIGHT, GGML_OP_MUL},
|
||||
{PARAKEET_TENSOR_ENC_NORM_FF1_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_FF1_LINEAR1_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_FF1_LINEAR2_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_NORM_CONV_WEIGHT, GGML_OP_MUL},
|
||||
{PARAKEET_TENSOR_ENC_NORM_CONV_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_CONV_PW1_WEIGHT, GGML_OP_IM2COL},
|
||||
{PARAKEET_TENSOR_ENC_CONV_DW_WEIGHT, GGML_OP_IM2COL},
|
||||
{PARAKEET_TENSOR_ENC_CONV_BN_WEIGHT, GGML_OP_MUL},
|
||||
{PARAKEET_TENSOR_ENC_CONV_BN_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_CONV_BN_MEAN, GGML_OP_SUB},
|
||||
{PARAKEET_TENSOR_ENC_CONV_BN_VAR, GGML_OP_DIV},
|
||||
{PARAKEET_TENSOR_ENC_CONV_BN_NUM_BATCHES, GGML_OP_NONE},
|
||||
{PARAKEET_TENSOR_ENC_CONV_PW2_WEIGHT, GGML_OP_IM2COL},
|
||||
{PARAKEET_TENSOR_ENC_NORM_ATTN_WEIGHT, GGML_OP_MUL},
|
||||
{PARAKEET_TENSOR_ENC_NORM_ATTN_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_U, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_ATTN_POS_BIAS_V, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_ATTN_Q_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_ATTN_K_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_ATTN_V_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_ATTN_OUT_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_ATTN_POS_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_NORM_FF2_WEIGHT, GGML_OP_MUL},
|
||||
{PARAKEET_TENSOR_ENC_NORM_FF2_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_ENC_FF2_LINEAR1_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_FF2_LINEAR2_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_ENC_NORM_OUT_WEIGHT, GGML_OP_MUL},
|
||||
{PARAKEET_TENSOR_ENC_NORM_OUT_BIAS, GGML_OP_ADD},
|
||||
|
||||
// Prediction network
|
||||
{PARAKEET_TENSOR_PRED_EMBED_WEIGHT, GGML_OP_GET_ROWS},
|
||||
{PARAKEET_TENSOR_PRED_LSTM_WEIGHT_IH, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_PRED_LSTM_WEIGHT_HH, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_PRED_LSTM_BIAS_IH, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_PRED_LSTM_BIAS_HH, GGML_OP_ADD},
|
||||
|
||||
// Joint network
|
||||
{PARAKEET_TENSOR_JOINT_PRED_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_JOINT_PRED_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_JOINT_ENC_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_JOINT_ENC_BIAS, GGML_OP_ADD},
|
||||
{PARAKEET_TENSOR_JOINT_NET_WEIGHT, GGML_OP_MUL_MAT},
|
||||
{PARAKEET_TENSOR_JOINT_NET_BIAS, GGML_OP_ADD},
|
||||
};
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -110,3 +110,32 @@ target_compile_definitions(${VAD_TEST} PRIVATE
|
|||
SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/jfk.wav")
|
||||
add_test(NAME ${VAD_TEST} COMMAND ${VAD_TEST})
|
||||
set_tests_properties(${VAD_TEST} PROPERTIES LABELS "base;en")
|
||||
|
||||
# Parakeet tests
#
# Each test builds a standalone executable that links the parakeet library and
# the shared example helpers, and receives the model/sample locations as
# compile definitions.
set(PARAKEET_TEST test-parakeet)
add_executable(${PARAKEET_TEST} ${PARAKEET_TEST}.cpp)
target_include_directories(${PARAKEET_TEST} PRIVATE ../include ../ggml/include ../examples)
target_link_libraries(${PARAKEET_TEST} PRIVATE parakeet common)
target_compile_definitions(${PARAKEET_TEST} PRIVATE
    PARAKEET_MODEL_PATH="${PROJECT_SOURCE_DIR}/models/ggml-parakeet-tdt-0.6b-v3.bin"
    SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/jfk.wav")
add_test(NAME ${PARAKEET_TEST} COMMAND ${PARAKEET_TEST})
# Label every parakeet test so `ctest -L parakeet` runs the whole group
# (previously only test-parakeet-full carried the label).
set_tests_properties(${PARAKEET_TEST} PROPERTIES LABELS "parakeet;unit")

set(PARAKEET_TEST test-parakeet-stream)
add_executable(${PARAKEET_TEST} ${PARAKEET_TEST}.cpp)
target_include_directories(${PARAKEET_TEST} PRIVATE ../include ../ggml/include ../examples)
target_link_libraries(${PARAKEET_TEST} PRIVATE parakeet common)
target_compile_definitions(${PARAKEET_TEST} PRIVATE
    PARAKEET_MODEL_PATH="${PROJECT_SOURCE_DIR}/models/ggml-parakeet-tdt-0.6b-v3.bin"
    SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/gb1.wav")
add_test(NAME ${PARAKEET_TEST} COMMAND ${PARAKEET_TEST})
set_tests_properties(${PARAKEET_TEST} PROPERTIES LABELS "parakeet;unit")

set(PARAKEET_TEST test-parakeet-full)
add_executable(${PARAKEET_TEST} ${PARAKEET_TEST}.cpp)
target_include_directories(${PARAKEET_TEST} PRIVATE ../include ../ggml/include ../examples)
target_link_libraries(${PARAKEET_TEST} PRIVATE parakeet common)
target_compile_definitions(${PARAKEET_TEST} PRIVATE
    PARAKEET_MODEL_PATH="${PROJECT_SOURCE_DIR}/models/ggml-parakeet-tdt-0.6b-v3.bin"
    SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/gb1.wav")
add_test(NAME ${PARAKEET_TEST} COMMAND ${PARAKEET_TEST})
set_tests_properties(${PARAKEET_TEST} PROPERTIES LABELS "parakeet;unit")
|
||||
|
|
|
|||
|
|
@ -0,0 +1,62 @@
|
|||
#include "parakeet.h"
#include "common-whisper.h"

#include <cstdio>
#include <string>
#include <vector> // was missing: std::vector is used below

// Keep assert() active even in release builds of the test.
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>

// Streams each newly decoded token to stdout as plain text.
// `is_first` suppresses the leading word separator of the very first token.
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
    static bool is_first = true;

    (void) state;
    (void) user_data;

    const char * token_str = parakeet_token_to_str(ctx, token_data->id);

    char text_buf[256];
    parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));

    printf("%s", text_buf);
    fflush(stdout);

    is_first = false;
}

// End-to-end test: load the model, run parakeet_full over the sample audio,
// and assert that decoding reports success.
int main() {
    std::string model_path  = PARAKEET_MODEL_PATH;
    std::string sample_path = SAMPLE_PATH;

    // Mono decode is requested (last arg false), so the stereo vector stays empty.
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
    assert(pcmf32.size() > 0);
    assert(pcmf32s.size() == 0); // no stereo vector

    printf("Loading Parakeet model from: %s\n", model_path.c_str());

    struct parakeet_context_params ctx_params = parakeet_context_default_params();

    struct parakeet_context * pctx = parakeet_init_from_file_with_params(model_path.c_str(), ctx_params);
    if (pctx == nullptr) {
        fprintf(stderr, "Failed to load Parakeet model\n");
        return 1;
    }
    printf("Successfully loaded Parakeet model\n");

    struct parakeet_full_params params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
    params.new_token_callback           = token_callback;
    params.new_token_callback_user_data = nullptr;

    // Chunked decoding configuration (milliseconds).
    params.chunk_length_ms  = 10000;
    params.left_context_ms  = 10000;
    params.right_context_ms = 4960;

    int ret = parakeet_full(pctx, params, pcmf32.data(), pcmf32.size());
    assert(ret == 0);

    parakeet_free(pctx);

    // Message fixed: this test exercises parakeet_full, not parakeet_full_parallel.
    printf("\nTest passed: parakeet_full succeeded!\n");
    return 0;
}
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
#include "parakeet.h"
#include "common-whisper.h"

#include <algorithm> // was missing: std::min is used below
#include <cstdio>
#include <string>
#include <vector>    // was missing: std::vector is used below

// Keep assert() active even in release builds of the test.
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>

// Streams each newly decoded token to stdout as plain text.
// `is_first` suppresses the leading word separator of the very first token.
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
    static bool is_first = true;

    (void) state;
    (void) user_data;

    const char * token_str = parakeet_token_to_str(ctx, token_data->id);

    char text_buf[256];
    parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));

    printf("%s", text_buf);
    fflush(stdout);

    is_first = false;
}

// Dumps each newly finalized segment with its per-token metadata.
// NOTE(review): defined but never registered with the params below — kept for
// manual debugging; wire it up via params.new_segment_callback if needed.
void segment_callback(parakeet_context * ctx, parakeet_state * state, int n_new, void * user_data) {
    (void) user_data;

    const int n_segments = parakeet_full_n_segments_from_state(state);
    const int s0 = n_segments - n_new; // index of the first newly added segment

    printf("\nSegment Callback: %d new segment(s)\n", n_new);

    for (int i = s0; i < n_segments; i++) {
        const char *  text = parakeet_full_get_segment_text_from_state(state, i);
        const int64_t t0   = parakeet_full_get_segment_t0_from_state(state, i);
        const int64_t t1   = parakeet_full_get_segment_t1_from_state(state, i);

        printf("Segment %d: [%lld -> %lld] \"%s\"\n", i, (long long)t0, (long long)t1, text);
        printf("Tokens:\n");

        const int n_tokens = parakeet_full_n_tokens_from_state(state, i);
        for (int j = 0; j < n_tokens; j++) {
            parakeet_token_data token_data = parakeet_full_get_token_data_from_state(state, i, j);
            const char * token_str = parakeet_token_to_str(ctx, token_data.id);

            printf("  [%2d] id=%5d frame=%3d dur_idx=%2d dur_val=%2d p=%.4f plog=%.4f t0=%4lld t1=%4lld word_start=%d \"%s\"\n",
                   j,
                   token_data.id,
                   token_data.frame_index,
                   token_data.duration_idx,
                   token_data.duration_value,
                   token_data.p,
                   token_data.plog,
                   (long long)token_data.t0,
                   (long long)token_data.t1,
                   token_data.is_word_start,
                   token_str);
        }
    }
    printf("\n");
}

// Streaming test: push audio in small batches through parakeet_stream_push,
// then flush, asserting every call succeeds.
int main() {
    std::string model_path  = PARAKEET_MODEL_PATH;
    std::string sample_path = SAMPLE_PATH;

    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
    assert(pcmf32.size() > 0);

    struct parakeet_context_params ctx_params = parakeet_context_default_params();
    struct parakeet_context * pctx = parakeet_init_from_file_with_params_no_state(model_path.c_str(), ctx_params);
    if (pctx == nullptr) { return 1; }

    struct parakeet_full_params params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
    params.new_token_callback = token_callback;

    // Chunked decoding configuration (milliseconds).
    params.left_context_ms  = 10000;
    params.chunk_length_ms  = 10000;
    params.right_context_ms = 4960;

    parakeet_state * state = parakeet_init_state(pctx);

    // Initialize streaming state before pushing any samples.
    assert(parakeet_stream_init(pctx, state, params) == 0);

    // Feed the audio in small fixed-size batches to exercise the push path.
    // 1600 samples per push — assumes 16 kHz input (100 ms); TODO confirm.
    const int samples_batch_size = 1600;
    int position = 0;

    while (position < (int)pcmf32.size()) {
        int samples_to_push = std::min(samples_batch_size, (int)pcmf32.size() - position);

        int ret = parakeet_stream_push(pctx, state, pcmf32.data() + position, samples_to_push);
        assert(ret == 0);

        position += samples_to_push;
    }

    // Flush remaining samples.
    assert(parakeet_stream_flush(pctx, state) == 0);

    parakeet_free_state(state);
    parakeet_free(pctx);

    printf("\n\nTest passed: Streaming logic.\n");
    return 0;
}
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
#include "parakeet.h"
#include "common-whisper.h"

#include <cstdio>
#include <string>
#include <vector> // was missing: std::vector is used below

// Keep assert() active even in release builds of the test.
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>

// Streams each newly decoded token to stdout as plain text.
// `is_first` suppresses the leading word separator of the very first token.
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
    static bool is_first = true;

    (void) state;
    (void) user_data;

    const char * token_str = parakeet_token_to_str(ctx, token_data->id);

    char text_buf[256];
    parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));

    printf("%s", text_buf);
    fflush(stdout);

    is_first = false;
}

// Dumps each newly finalized segment with its per-token metadata.
// Registered below via params.new_segment_callback.
void segment_callback(parakeet_context * ctx, parakeet_state * state, int n_new, void * user_data) {
    (void) user_data;

    const int n_segments = parakeet_full_n_segments_from_state(state);
    const int s0 = n_segments - n_new; // index of the first newly added segment

    printf("\nSegment Callback: %d new segment(s)\n", n_new);

    for (int i = s0; i < n_segments; i++) {
        const char *  text = parakeet_full_get_segment_text_from_state(state, i);
        const int64_t t0   = parakeet_full_get_segment_t0_from_state(state, i);
        const int64_t t1   = parakeet_full_get_segment_t1_from_state(state, i);

        printf("Segment %d: [%lld -> %lld] \"%s\"\n", i, (long long)t0, (long long)t1, text);
        printf("Tokens:\n");

        const int n_tokens = parakeet_full_n_tokens_from_state(state, i);
        for (int j = 0; j < n_tokens; j++) {
            parakeet_token_data token_data = parakeet_full_get_token_data_from_state(state, i, j);
            const char * token_str = parakeet_token_to_str(ctx, token_data.id);

            printf("  [%2d] id=%5d frame=%3d dur_idx=%2d dur_val=%2d p=%.4f plog=%.4f t0=%4lld t1=%4lld word_start=%d \"%s\"\n",
                   j,
                   token_data.id,
                   token_data.frame_index,
                   token_data.duration_idx,
                   token_data.duration_value,
                   token_data.p,
                   token_data.plog,
                   (long long)token_data.t0,
                   (long long)token_data.t1,
                   token_data.is_word_start,
                   token_str);
        }
    }
    printf("\n");
}

// Basic test: load the model, decode one chunk of audio with both token and
// segment callbacks registered, then free all state.
int main() {
    std::string model_path  = PARAKEET_MODEL_PATH;
    std::string sample_path = SAMPLE_PATH;

    // Load the sample audio file (mono; the stereo vector stays empty).
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
    assert(pcmf32.size() > 0);
    assert(pcmf32s.size() == 0);

    printf("Loading Parakeet model from: %s\n", model_path.c_str());

    struct parakeet_context_params ctx_params = parakeet_context_default_params();

    struct parakeet_context * pctx = parakeet_init_from_file_with_params_no_state(model_path.c_str(), ctx_params);
    if (pctx == nullptr) {
        fprintf(stderr, "Failed to load Parakeet model\n");
        return 1;
    }
    printf("Successfully loaded Parakeet model\n");

    struct parakeet_full_params params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
    params.new_token_callback             = token_callback;
    params.new_token_callback_user_data   = nullptr;
    params.new_segment_callback           = segment_callback;
    params.new_segment_callback_user_data = nullptr;

    parakeet_state * state = parakeet_init_state(pctx);

    int ret = parakeet_chunk(pctx, state, params, pcmf32.data(), pcmf32.size());
    assert(ret == 0);

    parakeet_free_state(state);
    parakeet_free(pctx);

    printf("\nTest passed: Parakeet model loaded and freed successfully\n");
    return 0;
}
|
||||
Loading…
Reference in New Issue