Merge 988a4af6f1 into fc674574ca
This commit is contained in:
commit
0eb40f1138
|
|
@ -91,6 +91,7 @@ endif()
|
|||
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
|
||||
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
|
||||
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
|
||||
option(WHISPER_VITISAI "whisper: support for AMD Vitis AI" OFF)
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
|
|
|||
43
README.md
43
README.md
|
|
@ -21,6 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
|
|||
- [Vulkan support](#vulkan-gpu-support)
|
||||
- Support for CPU-only inference
|
||||
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
|
||||
- [AMD Ryzen AI NPU Support](#amd-ryzen-ai-support-for-npu)
|
||||
- [OpenVINO Support](#openvino-support)
|
||||
- [Ascend NPU Support](#ascend-npu-support)
|
||||
- [Moore Threads GPU Support](#moore-threads-gpu-support)
|
||||
|
|
@ -312,6 +313,48 @@ This can result in significant speedup in encoder performance. Here are the inst
|
|||
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
|
||||
|
||||
## AMD Ryzen™ AI support for NPU
|
||||
|
||||
On AMD's Ryzen™ AI 300 Series with dedicated NPUs for acceleration, you can now run Whisper models with the ability to fully offload the encoder to NPU. This brings significant speedup compared to CPU-only.
|
||||
> **Note:**
|
||||
> **Ryzen™ AI NPU acceleration is currently supported on Windows only.** Linux support is planned for upcoming releases.
|
||||
> For the latest updates on Ryzen AI, check out [the official documentation](https://ryzenai.docs.amd.com/en/latest/).
|
||||
|
||||
### Setup environment (Windows only)
|
||||
|
||||
- **Driver:** Make sure you have NPU drivers version **.280 or newer** installed. [Download latest drivers from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip)
|
||||
- **Runtime libraries:** Download and install the necessary [runtime dependencies from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=flexmlrt1.7.0-win.zip).
|
||||
- **Environment:** Extract the runtime package and set up the environment:
|
||||
```powershell
|
||||
tar xvf flexmlrt1.7.0-win.zip
|
||||
flexmlrt\setup.bat
|
||||
```
|
||||
Your environment is now ready.
|
||||
|
||||
### Build Whisper.cpp for Ryzen™ AI support
|
||||
|
||||
```bash
|
||||
cmake -B build -DWHISPER_VITISAI=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
### Download NPU-optimized models
|
||||
|
||||
- All NPU-supported Whisper models and their compiled `.rai` cache files are available in this collection:
|
||||
https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
|
||||
- Download the pre-compiled `.rai` cache file matching your desired model, and place it in your `models/` directory alongside its corresponding `ggml-<...>.bin` file.
|
||||
The cache file must be named with the `-encoder-vitisai.rai` suffix. For example, if your model file is named `ggml-small.bin`, the cache file should be named `ggml-small-encoder-vitisai.rai`.
|
||||
|
||||
|
||||
> **Note:** The ".rai" models from Hugging Face are pre-optimized for Ryzen™ AI NPUs, delivering acceleration benefits from the very first run (aside from any initial CPU-side caching overhead).
|
||||
|
||||
Run the examples as usual:
|
||||
|
||||
```bash
|
||||
./build/bin/whisper-cli -m models/ggml-small.bin -f samples/jfk.wav
|
||||
```
|
||||
|
||||
|
||||
## NVIDIA GPU support
|
||||
|
||||
With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
|
||||
|
|
|
|||
|
|
@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
|
|||
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
|
||||
endif()
|
||||
|
||||
if (WHISPER_VITISAI)
|
||||
find_package(FlexmlRT REQUIRED)
|
||||
endif()
|
||||
|
||||
#
|
||||
# libraries
|
||||
#
|
||||
|
|
@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
|
|||
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
|
||||
endif()
|
||||
|
||||
if (WHISPER_VITISAI)
|
||||
set(TARGET whisper.vitisai)
|
||||
|
||||
add_library(${TARGET} OBJECT
|
||||
vitisai/whisper-vitisai-encoder.h
|
||||
vitisai/whisper-vitisai-encoder.cpp
|
||||
)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC
|
||||
.
|
||||
)
|
||||
|
||||
set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)
|
||||
|
||||
# Add C++17 standard for MSVC
|
||||
if (MSVC)
|
||||
target_compile_options(${TARGET} PRIVATE /std:c++17)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)
|
||||
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
|
||||
endif()
|
||||
|
||||
# whisper
|
||||
|
||||
add_library(whisper
|
||||
|
|
@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
|
|||
target_link_libraries(whisper PRIVATE whisper.openvino)
|
||||
endif()
|
||||
|
||||
if (WHISPER_VITISAI)
|
||||
target_link_libraries(whisper PRIVATE whisper.vitisai)
|
||||
endif()
|
||||
|
||||
if (WHISPER_MKL)
|
||||
target_link_libraries(whisper PRIVATE MKL::MKL)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,204 @@
|
|||
#include "vitisai/whisper-vitisai-encoder.h"
#include "FlexMLClient.h"
#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif
|
||||
|
||||
struct whisper_vitisai_context {
|
||||
std::string model_path;
|
||||
std::shared_ptr<flexmlrt::client::Model> runner;
|
||||
uint8_t * fbs_buffer;
|
||||
size_t fbs_buffer_size;
|
||||
};
|
||||
|
||||
// Function to mmap rai file for Linux and MapViewOfFile for Windows
|
||||
// Memory-map a .rai cache file read-only — MapViewOfFile on Windows, mmap on
// POSIX. On success, *buffer/*size describe the mapping (release it with
// unmap_rai_file()) and true is returned; on failure an error is printed to
// stderr and false is returned.
static bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
#ifdef _WIN32
    // Open the file
    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Get the file size
    LARGE_INTEGER fileSize;
    if (!GetFileSizeEx(hFile, &fileSize)) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Create a file mapping covering the whole file. Passing 0/0 for the
    // max-size dwords means "entire file" and avoids the previous bug of
    // stuffing a 64-bit QuadPart into the low-dword parameter (truncated
    // for files >= 4 GiB).
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    if (hMapping == NULL) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Map the whole file (bytes-to-map == 0 means the entire mapping)
    *buffer = (uint8_t *) MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);

    // The view keeps the mapping and file alive on its own; close both
    // handles immediately so they are not leaked for the lifetime of the map.
    CloseHandle(hMapping);
    CloseHandle(hFile);

    if (*buffer == NULL) {
        std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    *size = (size_t) fileSize.QuadPart;
    return true;
#else
    // Open the file
    FILE * fp = fopen(path, "rb");
    if (!fp) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Get the file size
    struct stat st;
    if (fstat(fileno(fp), &st) == -1) {
        fclose(fp);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Mmap the file
    *buffer = (uint8_t *) mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fp), 0);

    // POSIX guarantees the mapping stays valid after the descriptor is
    // closed; closing here fixes the FILE* leak on the success path.
    fclose(fp);

    if (*buffer == MAP_FAILED) {
        std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    *size = st.st_size;
    return true;
#endif // _WIN32
}
|
||||
|
||||
// Release a mapping previously created by map_rai_file().
static void unmap_rai_file(uint8_t * buffer, size_t size) {
#ifdef _WIN32
    (void) size; // Win32 unmaps the whole view from its base address
    UnmapViewOfFile(buffer);
#else
    munmap(buffer, size);
#endif // _WIN32
}
|
||||
|
||||
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
|
||||
if (!path_model) {
|
||||
std::fprintf(stderr, "%s: path_model is null\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * ctx = new whisper_vitisai_context;
|
||||
ctx->model_path = path_model;
|
||||
|
||||
// Override the model path with the environment variable if it is set
|
||||
if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
|
||||
if (env_model_path[0] != '\0') {
|
||||
ctx->model_path = env_model_path;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 1: Set up the model
|
||||
flexmlrt::client::Options options;
|
||||
options.modelPath = ctx->model_path;
|
||||
options.deviceName = "stx";
|
||||
options.debug = false;
|
||||
options.executeMode = 2;
|
||||
options.extOptions["enable_preemption"] = true;
|
||||
|
||||
// Check if model_path is rai file and if so, add fbs_buffer and fbs_buffer_size to the options
|
||||
if (ctx->model_path.find(".rai") != std::string::npos) {
|
||||
// mmap rai file for both Linux and Windows and pass the buffer to the options
|
||||
ctx->fbs_buffer = nullptr;
|
||||
ctx->fbs_buffer_size = 0;
|
||||
if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
|
||||
options.extOptions["fbs_buffer"] = ctx->fbs_buffer;
|
||||
options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
|
||||
options.subgraphName = "vaiml_par_0";
|
||||
options.extOptions["cache_dir"] = std::string(".");
|
||||
} else {
|
||||
std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
|
||||
delete ctx;
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
ctx->runner = std::make_shared<flexmlrt::client::Model>(options);
|
||||
|
||||
if (!ctx->runner->good()) {
|
||||
throw std::runtime_error("Runner creation ran into an error");
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
|
||||
delete ctx;
|
||||
return nullptr;
|
||||
}
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
|
||||
if (!ctx) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
|
||||
if (ctx->fbs_buffer) {
|
||||
unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
|
||||
}
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
|
||||
if (!ctx || !mel || !out) {
|
||||
std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ggml_n_dims(mel) != 2) {
|
||||
std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ggml_n_dims(out) != 2) {
|
||||
std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
|
||||
return 0;
|
||||
}
|
||||
|
||||
// setup input and output tensors for Vitis AI model
|
||||
std::vector<flexmlrt::client::ErtTensorType> input_tensors, output_tensors;
|
||||
auto model = ctx->runner;
|
||||
|
||||
// Get tensors as CPU tensors (hwTensor = false)
|
||||
input_tensors = model->getIOTensors("input", false);
|
||||
output_tensors = model->getIOTensors("output", false);
|
||||
|
||||
// TODO: add assert checks for tensor numbers and shapes
|
||||
|
||||
input_tensors[0].data = mel->data;
|
||||
output_tensors[0].data = out->data;
|
||||
|
||||
try {
|
||||
model->forward(input_tensors, output_tensors);
|
||||
#if defined(WHISPER_DEBUG)
|
||||
std::fprintf(stderr, "%s: Vitis AI model inference completed.\n", __func__);
|
||||
#endif
|
||||
} catch (const std::exception & e) {
|
||||
std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
#pragma once

// C-compatible interface to the AMD Vitis AI (Ryzen AI NPU) Whisper encoder.
// The implementation lives in vitisai/whisper-vitisai-encoder.cpp.
//
// Fixed relative to the original: use #ifdef (not #if) for __cplusplus, and
// do not feed C++-only headers (<cstddef>/<cstdbool>/<cstdint>) to a C
// translation unit — <cstdbool> is additionally removed in C++20 and unused.

#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
extern "C" {
#else
#include <stddef.h>
#include <stdint.h>
#endif

// opaque handle; allocated by whisper_vitisai_init, released by whisper_vitisai_free
struct whisper_vitisai_context;

// load the Vitis AI encoder model from 'path_model'; returns NULL on failure
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
// release the context; safe to call with NULL
void whisper_vitisai_free(struct whisper_vitisai_context * ctx);

struct ggml_tensor;

// run the encoder on a 2-D mel tensor, writing into the 2-D out tensor;
// returns 1 on success, 0 on failure
int whisper_vitisai_encode(
        struct whisper_vitisai_context * ctx,
        struct ggml_tensor * mel,
        struct ggml_tensor * out);

#ifdef __cplusplus
}
#endif
|
||||
|
|
@ -14,6 +14,10 @@
|
|||
#include "openvino/whisper-openvino-encoder.h"
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
#include "vitisai/whisper-vitisai-encoder.h"
|
||||
#endif
|
||||
|
||||
#include <atomic>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
|
@ -903,6 +907,10 @@ struct whisper_state {
|
|||
whisper_openvino_context * ctx_openvino = nullptr;
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
whisper_vitisai_context * ctx_vitisai = nullptr;
|
||||
#endif
|
||||
|
||||
// [EXPERIMENTAL] token-level timestamps data
|
||||
int64_t t_beg = 0;
|
||||
int64_t t_last = 0;
|
||||
|
|
@ -1970,7 +1978,13 @@ static bool whisper_encode_external(const whisper_state & wstate) {
|
|||
const bool use_openvino = wstate.ctx_openvino != nullptr;
|
||||
#endif
|
||||
|
||||
return use_coreml || use_openvino;
|
||||
#ifndef WHISPER_USE_VITISAI
|
||||
const bool use_vitisai = false;
|
||||
#else
|
||||
const bool use_vitisai = wstate.ctx_vitisai != nullptr;
|
||||
#endif
|
||||
|
||||
return use_coreml || use_openvino || use_vitisai;
|
||||
}
|
||||
|
||||
static struct ggml_cgraph * whisper_build_graph_conv(
|
||||
|
|
@ -2411,6 +2425,8 @@ static bool whisper_encode_internal(
|
|||
|
||||
#if defined(WHISPER_USE_COREML)
|
||||
whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
|
||||
#elif defined(WHISPER_USE_VITISAI)
|
||||
whisper_vitisai_encode(wstate.ctx_vitisai, mel, wstate.embd_enc);
|
||||
#elif defined(WHISPER_USE_OPENVINO)
|
||||
whisper_openvino_encode(wstate.ctx_openvino, mel, wstate.embd_enc);
|
||||
#endif
|
||||
|
|
@ -3346,6 +3362,20 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
// replace the trailing extension (if any) with the Vitis AI encoder
// cache suffix, e.g. "ggml-small.bin" -> "ggml-small-encoder-vitisai.rai"
static std::string whisper_get_vitisai_path_encoder_cache(std::string path_bin) {
    const auto dot = path_bin.rfind('.');
    const std::string base = (dot == std::string::npos)
        ? path_bin
        : path_bin.substr(0, dot);

    return base + "-encoder-vitisai.rai";
}
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_OPENVINO
|
||||
// replace .bin with -encoder-openvino.xml
|
||||
static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
|
||||
|
|
@ -3455,6 +3485,19 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
const auto path_vitisai = whisper_get_vitisai_path_encoder_cache(ctx->path_model);
|
||||
|
||||
state->ctx_vitisai = whisper_vitisai_init(path_vitisai.c_str());
|
||||
if (!state->ctx_vitisai) {
|
||||
WHISPER_LOG_ERROR("%s: failed to load Vitis AI model from '%s'\n", __func__, path_vitisai.c_str());
|
||||
whisper_free_state(state);
|
||||
return nullptr;
|
||||
} else {
|
||||
WHISPER_LOG_INFO("%s: Vitis AI model loaded\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
|
||||
|
||||
state->batch = whisper_batch_init(ctx->model.hparams.n_text_ctx, WHISPER_MAX_DECODERS);
|
||||
|
|
@ -3821,6 +3864,13 @@ void whisper_free_state(struct whisper_state * state) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
if (state->ctx_vitisai != nullptr) {
|
||||
whisper_vitisai_free(state->ctx_vitisai);
|
||||
state->ctx_vitisai = nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
whisper_batch_free(state->batch);
|
||||
|
||||
ggml_backend_sched_free(state->sched_conv.sched);
|
||||
|
|
@ -4312,11 +4362,20 @@ static int whisper_has_openvino(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
// report whether this build was compiled with Vitis AI support (1) or not (0)
static int whisper_has_vitisai(void) {
#ifdef WHISPER_USE_VITISAI
    const int has_vitisai = 1;
#else
    const int has_vitisai = 0;
#endif
    return has_vitisai;
}
|
||||
|
||||
const char * whisper_print_system_info(void) {
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
s += "WHISPER : ";
|
||||
s += "VITISAI = " + std::to_string(whisper_has_vitisai()) + " | ";
|
||||
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
|
||||
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue