diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0f74041..d4dc3180 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,6 +91,7 @@ endif()
 option(WHISPER_COREML                "whisper: enable Core ML framework"  OFF)
 option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 option(WHISPER_OPENVINO              "whisper: support for OpenVINO"      OFF)
+option(WHISPER_VITISAI               "whisper: support for AMD Vitis AI"  OFF)
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
diff --git a/README.md b/README.md
index 474a1301..5a69a161 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper)
 - [Vulkan support](#vulkan-gpu-support)
 - Support for CPU-only inference
 - [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
+- [AMD Ryzen AI NPU Support](#amd-ryzen-ai-support-for-npu)
 - [OpenVINO Support](#openvino-support)
 - [Ascend NPU Support](#ascend-npu-support)
 - [Moore Threads GPU Support](#moore-threads-gpu-support)
@@ -312,6 +313,48 @@ This can result in significant speedup in encoder performance. Here are the instructions
 
 For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
 
+## AMD Ryzen™ AI support for NPU
+
+On AMD Ryzen™ AI 300 Series processors with a dedicated NPU, you can now run Whisper models with the encoder fully offloaded to the NPU. This brings a significant speedup compared to CPU-only inference.
+
+> **Note:**
+> **Ryzen™ AI NPU acceleration is currently supported on Windows only.** Linux support is planned for upcoming releases.
+> For the latest updates on Ryzen AI, check out [the official documentation](https://ryzenai.docs.amd.com/en/latest/).
+
+### Setup environment (Windows only)
+
+- **Driver:** Make sure NPU driver version **.280 or newer** is installed. [Download the latest drivers here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip)
+- **Runtime libraries:** Download and install the required [runtime dependencies](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=flexmlrt1.7.0-win.zip).
+- **Environment:** Extract the runtime package and set up the environment:
+  ```powershell
+  tar xvf flexmlrt1.7.0-win.zip
+  flexmlrt\setup.bat
+  ```
+
+Your environment is now ready.
+
+### Build whisper.cpp with Ryzen™ AI support
+
+```bash
+cmake -B build -DWHISPER_VITISAI=1
+cmake --build build -j --config Release
+```
+
+### Download NPU-optimized models
+
+- All NPU-supported Whisper models and their compiled `.rai` cache files are available in this collection:
+  https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
+- Download the pre-compiled `.rai` cache file matching your desired model and place it in your `models/` directory alongside its corresponding `ggml-<...>.bin` file.
+  The cache file must be named with the `-encoder-vitisai.rai` suffix: for example, if your model file is `ggml-small.bin`, the cache file must be named `ggml-small-encoder-vitisai.rai`.
+
+> **Note:** The `.rai` models from Hugging Face are pre-optimized for Ryzen™ AI NPUs, delivering acceleration benefits from the very first run (aside from any initial CPU-side caching overhead).
+
+Run the examples as usual:
+
+```bash
+./build/bin/whisper-cli -m models/ggml-small.bin -f samples/jfk.wav
+```
+
 ## NVIDIA GPU support
 
 With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
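The `-encoder-vitisai.rai` suffix rule described in the README section above is exactly what the loader computes at runtime. A minimal standalone sketch of that derivation, mirroring the `whisper_get_vitisai_path_encoder_cache()` helper added to `src/whisper.cpp` further down in this diff (the `vitisai_cache_path` name here is illustrative, not part of the patch):

```cpp
// Sketch of the cache-path rule from the README section above: strip the
// final extension from the ggml model path and append "-encoder-vitisai.rai".
// Mirrors whisper_get_vitisai_path_encoder_cache() in the whisper.cpp diff.
#include <iostream>
#include <string>

static std::string vitisai_cache_path(std::string path_bin) {
    const auto pos = path_bin.rfind('.');
    if (pos != std::string::npos) {
        path_bin = path_bin.substr(0, pos); // drop ".bin"
    }
    return path_bin + "-encoder-vitisai.rai";
}

int main() {
    // prints "models/ggml-small-encoder-vitisai.rai"
    std::cout << vitisai_cache_path("models/ggml-small.bin") << "\n";
}
```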
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 095a2791..6cba1c6e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
     find_package(OpenVINO REQUIRED COMPONENTS Runtime)
 endif()
 
+if (WHISPER_VITISAI)
+    find_package(FlexmlRT REQUIRED)
+endif()
+
 #
 # libraries
 #
@@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
     set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
 endif()
 
+if (WHISPER_VITISAI)
+    set(TARGET whisper.vitisai)
+
+    add_library(${TARGET} OBJECT
+        vitisai/whisper-vitisai-encoder.h
+        vitisai/whisper-vitisai-encoder.cpp
+        )
+
+    target_include_directories(${TARGET} PUBLIC
+        .
+        )
+
+    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)
+
+    # Add C++17 standard for MSVC
+    if (MSVC)
+        target_compile_options(${TARGET} PRIVATE /std:c++17)
+    endif()
+
+    target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)
+
+    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
+endif()
+
 # whisper
 
 add_library(whisper
@@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
     target_link_libraries(whisper PRIVATE whisper.openvino)
 endif()
 
+if (WHISPER_VITISAI)
+    target_link_libraries(whisper PRIVATE whisper.vitisai)
+endif()
+
 if (WHISPER_MKL)
     target_link_libraries(whisper PRIVATE MKL::MKL)
 endif()
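The encoder added in the next file memory-maps the compiled `.rai` cache rather than reading it into a heap buffer, using `MapViewOfFile` on Windows and `mmap` elsewhere. As a standalone illustration of the POSIX half of that pattern (the file name is hypothetical; the Windows branch follows the same shape with file and mapping handles):

```cpp
// Standalone illustration of the POSIX mmap pattern used by map_rai_file()
// below: map a read-only view of a file, use it, then unmap it.
#include <cstdio>
#include <sys/mman.h>
#include <sys/stat.h>

int main() {
    FILE * fp = std::fopen("model.rai", "rb"); // hypothetical file name
    if (!fp) {
        return 1;
    }

    struct stat st;
    if (fstat(fileno(fp), &st) != 0) {
        std::fclose(fp);
        return 1;
    }

    void * buf = mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fp), 0);
    std::fclose(fp); // the mapping stays valid after the stream is closed
    if (buf == MAP_FAILED) {
        return 1;
    }

    std::printf("mapped %lld bytes at %p\n", (long long) st.st_size, buf);

    munmap(buf, st.st_size);
    return 0;
}
```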
diff --git a/src/vitisai/whisper-vitisai-encoder.cpp b/src/vitisai/whisper-vitisai-encoder.cpp
new file mode 100644
index 00000000..580bcfe3
--- /dev/null
+++ b/src/vitisai/whisper-vitisai-encoder.cpp
@@ -0,0 +1,204 @@
+#include "vitisai/whisper-vitisai-encoder.h"
+#include "FlexMLClient.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <cstdio>
+#include <cstdlib>
+#ifdef _WIN32
+    #include <windows.h>
+#else
+    #include <sys/mman.h>
+    #include <sys/stat.h>
+    #include <unistd.h>
+#endif
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+struct whisper_vitisai_context {
+    std::string model_path;
+    std::shared_ptr<flexmlrt::client::Model> runner;
+    uint8_t * fbs_buffer      = nullptr;
+    size_t    fbs_buffer_size = 0;
+};
+
+// mmap the rai file on Linux, MapViewOfFile on Windows
+static bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
+#ifdef _WIN32
+    // open the file
+    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (hFile == INVALID_HANDLE_VALUE) {
+        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // get the file size
+    LARGE_INTEGER fileSize;
+    if (!GetFileSizeEx(hFile, &fileSize)) {
+        CloseHandle(hFile);
+        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // create a read-only file mapping object
+    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, fileSize.HighPart, fileSize.LowPart, NULL);
+    if (hMapping == NULL) {
+        CloseHandle(hFile);
+        std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // map the file
+    *buffer = (uint8_t *) MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, (SIZE_T) fileSize.QuadPart);
+    if (*buffer == NULL) {
+        CloseHandle(hMapping);
+        CloseHandle(hFile);
+        std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // the view keeps the mapping alive, so the handles can be closed now
+    CloseHandle(hMapping);
+    CloseHandle(hFile);
+
+    *size = (size_t) fileSize.QuadPart;
+    return true;
+#else
+    // open the file
+    FILE * fp = fopen(path, "rb");
+    if (!fp) {
+        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // get the file size
+    struct stat st;
+    if (fstat(fileno(fp), &st) == -1) {
+        fclose(fp);
+        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // mmap the file
+    *buffer = (uint8_t *) mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fp), 0);
+
+    // the mapping stays valid after the stream is closed
+    fclose(fp);
+
+    if (*buffer == MAP_FAILED) {
+        std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    *size = st.st_size;
+    return true;
+#endif // _WIN32
+}
+
+static void unmap_rai_file(uint8_t * buffer, size_t size) {
+#ifdef _WIN32
+    (void) size;
+    UnmapViewOfFile(buffer);
+#else
+    munmap(buffer, size);
+#endif // _WIN32
+}
+
+struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
+    if (!path_model) {
+        std::fprintf(stderr, "%s: path_model is null\n", __func__);
+        return nullptr;
+    }
+
+    auto * ctx = new whisper_vitisai_context;
+    ctx->model_path = path_model;
+
+    // override the model path with the environment variable if it is set
+    if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
+        if (env_model_path[0] != '\0') {
+            ctx->model_path = env_model_path;
+        }
+    }
+
+    // step 1: set up the model options
+    flexmlrt::client::Options options;
+    options.modelPath   = ctx->model_path;
+    options.deviceName  = "stx";
+    options.debug       = false;
+    options.executeMode = 2;
+    options.extOptions["enable_preemption"] = true;
+
+    // if model_path is a rai file, mmap it and pass the buffer to the options
+    if (ctx->model_path.find(".rai") != std::string::npos) {
+        if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
+            options.extOptions["fbs_buffer"]      = ctx->fbs_buffer;
+            options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
+            options.subgraphName = "vaiml_par_0";
+            options.extOptions["cache_dir"] = std::string(".");
+        } else {
+            std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
+            delete ctx;
+            return nullptr;
+        }
+    }
+
+    // step 2: create the runner
+    try {
+        ctx->runner = std::make_shared<flexmlrt::client::Model>(options);
+
+        if (!ctx->runner->good()) {
+            throw std::runtime_error("Runner creation ran into an error");
+        }
+    } catch (const std::exception & e) {
+        std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
+        if (ctx->fbs_buffer) {
+            unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
+        }
+        delete ctx;
+        return nullptr;
+    }
+
+    return ctx;
+}
+
+void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
+    if (!ctx) {
+        return;
+    }
+
+    std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
+
+    if (ctx->fbs_buffer) {
+        unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
+    }
+
+    delete ctx;
+}
+
+int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
+    if (!ctx || !mel || !out) {
+        std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
+        return 0;
+    }
+
+    if (ggml_n_dims(mel) != 2) {
+        std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
+        return 0;
+    }
+
+    if (ggml_n_dims(out) != 2) {
+        std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
+        return 0;
+    }
+
+    auto model = ctx->runner;
+
+    // get the model I/O descriptors as CPU tensors (hwTensor = false)
+    auto input_tensors  = model->getIOTensors("input",  false);
+    auto output_tensors = model->getIOTensors("output", false);
+
+    // TODO: add assert checks for tensor numbers and shapes
+    if (input_tensors.empty() || output_tensors.empty()) {
+        std::fprintf(stderr, "%s: model reported no input or output tensors\n", __func__);
+        return 0;
+    }
+
+    // bind the ggml tensor data directly, no copies
+    input_tensors[0].data  = mel->data;
+    output_tensors[0].data = out->data;
+
+    try {
+        model->forward(input_tensors, output_tensors);
+#if defined(WHISPER_DEBUG)
+        std::fprintf(stderr, "%s: Vitis AI model inference completed.\n", __func__);
+#endif
+    } catch (const std::exception & e) {
+        std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
+        return 0;
+    }
+
+    return 1;
+}
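Together with the header that follows, these three functions are the whole surface of the backend: init, encode, free. A minimal usage sketch, assuming a compiled cache for the `small` model is present; the tensor shapes below are illustrative placeholders, and in the real integration `whisper_init_state()` and `whisper_encode_internal()` do this wiring, as the `whisper.cpp` diff further down shows:

```cpp
// Illustrative lifecycle of the Vitis AI encoder API defined above.
// Shapes are placeholders for the "small" model (n_mel = 80,
// n_audio_ctx = 1500, n_audio_state = 768); whisper.cpp derives
// the real ones from the model hparams.
#include "vitisai/whisper-vitisai-encoder.h"
#include "ggml.h"

int main() {
    struct whisper_vitisai_context * vctx =
        whisper_vitisai_init("models/ggml-small-encoder-vitisai.rai");
    if (!vctx) {
        return 1;
    }

    struct ggml_init_params params = { 64u*1024*1024, nullptr, false };
    struct ggml_context * gctx = ggml_init(params);

    struct ggml_tensor * mel = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, 2*1500, 80); // mel spectrogram
    struct ggml_tensor * out = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, 768, 1500);  // encoder output

    const int ok = whisper_vitisai_encode(vctx, mel, out); // 1 on success, 0 on failure

    ggml_free(gctx);
    whisper_vitisai_free(vctx);

    return ok ? 0 : 1;
}
```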
diff --git a/src/vitisai/whisper-vitisai-encoder.h b/src/vitisai/whisper-vitisai-encoder.h
new file mode 100644
index 00000000..840ce694
--- /dev/null
+++ b/src/vitisai/whisper-vitisai-encoder.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct whisper_vitisai_context;
+
+struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
+void whisper_vitisai_free(struct whisper_vitisai_context * ctx);
+
+struct ggml_tensor;
+
+int whisper_vitisai_encode(
+        struct whisper_vitisai_context * ctx,
+        struct ggml_tensor * mel,
+        struct ggml_tensor * out);
+
+#ifdef __cplusplus
+}
+#endif
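One implementation detail worth calling out before the `whisper.cpp` integration: `whisper_vitisai_init()` honors an `OVERRIDE_VITISAI_MODEL_PATH` environment variable, which takes precedence over the path derived from the ggml model name and is handy for testing alternative compiled models. A short sketch of exercising it (the override path is hypothetical):

```cpp
// Sketch: redirecting the loader via the environment override honored
// in whisper_vitisai_init() (see the encoder implementation above).
#include <cstdlib>

#include "vitisai/whisper-vitisai-encoder.h"

int main() {
    // hypothetical alternative cache file
#ifdef _WIN32
    _putenv_s("OVERRIDE_VITISAI_MODEL_PATH", "models/alt-encoder-vitisai.rai");
#else
    setenv("OVERRIDE_VITISAI_MODEL_PATH", "models/alt-encoder-vitisai.rai", 1);
#endif

    // the path argument is now ignored in favor of the override
    struct whisper_vitisai_context * vctx =
        whisper_vitisai_init("models/ggml-small-encoder-vitisai.rai");

    const bool ok = vctx != nullptr;
    whisper_vitisai_free(vctx);

    return ok ? 0 : 1;
}
```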
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 2f356da0..c930b68e 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -14,6 +14,10 @@
 #include "openvino/whisper-openvino-encoder.h"
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+#include "vitisai/whisper-vitisai-encoder.h"
+#endif
+
 #include <atomic>
 #include <algorithm>
 #include <cassert>
@@ -903,6 +907,10 @@ struct whisper_state {
     whisper_openvino_context * ctx_openvino = nullptr;
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+    whisper_vitisai_context * ctx_vitisai = nullptr;
+#endif
+
     // [EXPERIMENTAL] token-level timestamps data
     int64_t t_beg  = 0;
     int64_t t_last = 0;
@@ -1970,7 +1978,13 @@ static bool whisper_encode_external(const whisper_state & wstate) {
     const bool use_openvino = wstate.ctx_openvino != nullptr;
 #endif
 
-    return use_coreml || use_openvino;
+#ifndef WHISPER_USE_VITISAI
+    const bool use_vitisai = false;
+#else
+    const bool use_vitisai = wstate.ctx_vitisai != nullptr;
+#endif
+
+    return use_coreml || use_openvino || use_vitisai;
 }
 
 static struct ggml_cgraph * whisper_build_graph_conv(
@@ -2411,6 +2425,8 @@ static bool whisper_encode_internal(
 #if defined(WHISPER_USE_COREML)
             whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
+#elif defined(WHISPER_USE_VITISAI)
+            whisper_vitisai_encode(wstate.ctx_vitisai, mel, wstate.embd_enc);
 #elif defined(WHISPER_USE_OPENVINO)
             whisper_openvino_encode(wstate.ctx_openvino, mel, wstate.embd_enc);
 #endif
@@ -3346,6 +3362,20 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
 }
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+// replace the extension with the Vitis AI encoder cache suffix
+static std::string whisper_get_vitisai_path_encoder_cache(std::string path_bin) {
+    auto pos = path_bin.rfind('.');
+    if (pos != std::string::npos) {
+        path_bin = path_bin.substr(0, pos);
+    }
+
+    path_bin += "-encoder-vitisai.rai";
+
+    return path_bin;
+}
+#endif
+
 #ifdef WHISPER_USE_OPENVINO
 // replace .bin with-encoder-openvino.xml
 static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
@@ -3455,6 +3485,19 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     }
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+    const auto path_vitisai = whisper_get_vitisai_path_encoder_cache(ctx->path_model);
+
+    state->ctx_vitisai = whisper_vitisai_init(path_vitisai.c_str());
+    if (!state->ctx_vitisai) {
+        WHISPER_LOG_ERROR("%s: failed to load Vitis AI model from '%s'\n", __func__, path_vitisai.c_str());
+        whisper_free_state(state);
+        return nullptr;
+    } else {
+        WHISPER_LOG_INFO("%s: Vitis AI model loaded\n", __func__);
+    }
+#endif
+
     state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
 
     state->batch = whisper_batch_init(ctx->model.hparams.n_text_ctx, WHISPER_MAX_DECODERS);
@@ -3821,6 +3864,13 @@ void whisper_free_state(struct whisper_state * state) {
     }
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+    if (state->ctx_vitisai != nullptr) {
+        whisper_vitisai_free(state->ctx_vitisai);
+        state->ctx_vitisai = nullptr;
+    }
+#endif
+
     whisper_batch_free(state->batch);
 
     ggml_backend_sched_free(state->sched_conv.sched);
@@ -4312,11 +4362,20 @@ static int whisper_has_openvino(void) {
 #endif
 }
 
+static int whisper_has_vitisai(void) {
+#ifdef WHISPER_USE_VITISAI
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 const char * whisper_print_system_info(void) {
     static std::string s;
 
     s  = "";
     s += "WHISPER : ";
+    s += "VITISAI = "  + std::to_string(whisper_has_vitisai())  + " | ";
     s += "COREML = "   + std::to_string(whisper_has_coreml())   + " | ";
     s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";