Add VitisAI Plugin

* Added VitisAI encoder module placeholder files

* VitisAI build integration

* VitisAI encoder offload functional

* Clean up vitisai integration

* Add c++17 requirement for Windows

* Enabled preemption for windows runs

* Add model cache override option

* Remove vitisai premature log message

* Add rai support through file mapping

* Fixed flatbuffer loading

* Fixed Windows file mapping issue

* Update FlexmlRT resolution

* Use Flexmlrt wheel pkg to build VitisAI plugin

* Clean up

* Remove prints

* Change flexmlrt target from Shared to Interface

* Add c++17 requirement for Windows

* Enabled preemption for windows runs

* Add rai support through file mapping

* Fixed flatbuffer loading

* Fixed Windows file mapping issue

* Update FlexmlRT resolution

* Use Flexmlrt wheel pkg to build VitisAI plugin

* Clean up

* Remove prints

* Change flexmlrt target from Shared to Interface

* Cleanup FlexmlRT integration

* format fix

* Adding AMD Licenses

* Update CMakeLists.txt

Co-authored-by: Kumawat, Sachin <sachin.kumawat@amd.com>

* Update src/CMakeLists.txt

Co-authored-by: Kumawat, Sachin <sachin.kumawat@amd.com>

* Update whisper.cpp

* Added VitisAI encoder readme section

* Remove license headers from common files to whisper.cpp

---------

Co-authored-by: Sachin Kumawat <sachink@amd.com>
Co-authored-by: Jeff Lin <jeffylin@xilinx.com>
Co-authored-by: Lin <jefflin@amd.com>
Co-authored-by: Lin, Jeff (DCG-ENG) <jeff.lin@amd.com>
Co-authored-by: Iswarya Alex <iswaryaalex96@gmail.com>
Co-authored-by: Alex, Iswarya <Iswarya.Alex@amd.com>
This commit is contained in:
Kumawat, Sachin 2026-01-13 14:14:27 -08:00 committed by Sachin Kumawat
parent a96310871a
commit 66e882aeed
6 changed files with 358 additions and 1 deletions

View File

@ -91,6 +91,7 @@ endif()
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
option(WHISPER_VITISAI "whisper: support for AMD Vitis AI" OFF)
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

View File

@ -312,6 +312,35 @@ This can result in significant speedup in encoder performance. Here are the inst
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
## VitisAI encoder support
On AMD Ryzen AI NPU devices, you can run the Encoder via the VitisAI plugin to significantly accelerate the whisper models.
- Prepare the AMD runtime packages (required before building):
- Obtain the XRT package and the FlexmlRT package from AMD. Both are distributed as tarballs or wheels.
- Copy the downloaded archives to a local path, extract them, and run the setup script from each extracted package in your shell (for example `source /path/to/xrt/setup.sh` and `source /path/to/flexmlrt/setup.sh`). Run these in every new shell you use to build or run `whisper.cpp`.
- Fetch the prebuilt VitisAI encoder cache:
- Download the appropriate Whisper encoder `.rai` cache for your model size from the AMD collection on Hugging Face: https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
- Rename the downloaded `.rai` file to `<gguf_model>-encoder-vitisai.rai` and place it alongside your ggml model file `<gguf_model>.bin`.
- Build `whisper.cpp` with VitisAI support:
```bash
cmake -B build -DWHISPER_VITISAI=1
cmake --build build -j --config Release
```
- Run the examples as usual. For example:
```text
$ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
```
The VitisAI artifact from Hugging Face is already optimized for Ryzen AI NPUs, so no slow compilation step is needed. Apart from CPU caching overheads, the acceleration advantage should be visible from the very first run.
## NVIDIA GPU support
With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.

View File

@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
endif()
if (WHISPER_VITISAI)
find_package(FlexmlRT REQUIRED)
endif()
#
# libraries
#
@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif()
if (WHISPER_VITISAI)
    set(TARGET whisper.vitisai)

    # Object library holding the Vitis AI encoder offload glue.
    add_library(${TARGET} OBJECT
        vitisai/whisper-vitisai-encoder.h
        vitisai/whisper-vitisai-encoder.cpp
        )

    target_include_directories(${TARGET} PUBLIC
        .
        )

    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)

    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)

    # The encoder sources require C++17; enforce it on every compiler,
    # not only MSVC (previously /std:c++17 was added for MSVC alone and
    # other toolchains relied on their defaults).
    target_compile_features(${TARGET} PRIVATE cxx_std_17)
    if (MSVC)
        target_compile_options(${TARGET} PRIVATE /std:c++17)
    endif()

    target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)

    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif()
# whisper
add_library(whisper
@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
target_link_libraries(whisper PRIVATE whisper.openvino)
endif()
if (WHISPER_VITISAI)
target_link_libraries(whisper PRIVATE whisper.vitisai)
endif()
if (WHISPER_MKL)
target_link_libraries(whisper PRIVATE MKL::MKL)
endif()

View File

@ -0,0 +1,204 @@
// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
#include "vitisai/whisper-vitisai-encoder.h"
#include "FlexMLClient.h"
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>
#include <cstdlib>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif
#include <cstring>
#include <string>
// State for one Vitis AI encoder instance.
struct whisper_vitisai_context {
    std::string model_path;                          // resolved path to the model artifact (.rai cache or other)
    std::shared_ptr<flexmlrt::client::Model> runner; // FlexmlRT inference session
    // Default-initialize these: whisper_vitisai_free() unconditionally tests
    // fbs_buffer, but whisper_vitisai_init() only assigns it on the .rai path,
    // so without initializers a non-.rai model left them as garbage (UB).
    uint8_t * fbs_buffer = nullptr; // memory-mapped .rai flatbuffer (null when not used)
    size_t fbs_buffer_size = 0;     // size of the mapping in bytes
};
// Map a .rai file read-only into memory (mmap on POSIX, MapViewOfFile on Windows).
// On success, *buffer/*size describe the mapping and true is returned; the
// underlying file handle/descriptor is closed in every path (the mapping stays
// valid until unmap_rai_file() is called). On failure, false is returned and
// *buffer/*size are untouched.
bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
#ifdef _WIN32
    // Open the file
    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Get the file size
    LARGE_INTEGER fileSize;
    if (!GetFileSizeEx(hFile, &fileSize)) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Create a file mapping object. Passing 0/0 for dwMaximumSizeHigh/Low maps
    // the entire file; the old code passed fileSize.QuadPart as the *low* DWORD,
    // silently truncating sizes >= 4 GiB.
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    if (hMapping == NULL) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Map the whole file (length 0 == to the end of the mapping)
    uint8_t * view = (uint8_t *) MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
    // The view keeps the mapping alive, so both handles can be closed right
    // away; previously they were leaked on the success path.
    CloseHandle(hMapping);
    CloseHandle(hFile);
    if (view == NULL) {
        std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    *buffer = view;
    *size   = (size_t) fileSize.QuadPart;
    return true;
#else
    // Open the file
    FILE * fd = fopen(path, "rb");
    if (!fd) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Get the file size
    struct stat st;
    if (fstat(fileno(fd), &st) == -1) {
        fclose(fd);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Mmap the file
    uint8_t * view = (uint8_t *) mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fd), 0);
    // The mapping remains valid after the FILE is closed; close it in every
    // path so the descriptor is not leaked on success (it previously was).
    fclose(fd);
    if (view == MAP_FAILED) {
        std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    *buffer = view;
    *size   = (size_t) st.st_size;
    return true;
#endif // _WIN32
}
// Release a mapping created by map_rai_file (munmap on POSIX,
// UnmapViewOfFile on Windows). Calling with a null buffer is a no-op,
// e.g. when the model was not a .rai file and nothing was ever mapped.
void unmap_rai_file(uint8_t * buffer, size_t size) {
    if (buffer == nullptr) {
        return;
    }
#ifdef _WIN32
    (void) size; // Windows unmaps by base address only
    UnmapViewOfFile(buffer);
#else
    munmap(buffer, size);
#endif // _WIN32
}
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
if (!path_model) {
std::fprintf(stderr, "%s: path_model is null\n", __func__);
return nullptr;
}
auto * ctx = new whisper_vitisai_context;
ctx->model_path = path_model;
// Override the model path with the environment variable if it is set
if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
if (env_model_path[0] != '\0') {
ctx->model_path = env_model_path;
}
}
// Step 1: Set up the model
flexmlrt::client::Options options;
options.modelPath = ctx->model_path;
options.deviceName = "stx";
options.debug = false;
options.executeMode = 2;
options.extOptions["ai_analyzer_profiling"] = true; // Enable AIA profiling
options.extOptions["enable_preemption"] = true;
// Check if model_path is rai file and if so, add fbs_buffer and fbs_buffer_size to the options
if (ctx->model_path.find(".rai") != std::string::npos) {
// mmap rai file for both Linux and Windows and pass the buffer to the options
ctx->fbs_buffer = nullptr;
ctx->fbs_buffer_size = 0;
if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
options.extOptions["fbs_buffer"] = ctx->fbs_buffer;
options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
options.subgraphName = "vaiml_par_0";
options.extOptions["cache_dir"] = std::string(".");
} else {
std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
delete ctx;
return nullptr;
}
}
try {
ctx->runner = std::make_shared<flexmlrt::client::Model>(options);
if (!ctx->runner->good()) {
throw std::runtime_error("Runner creation ran into an error");
}
} catch (const std::exception & e) {
std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
delete ctx;
return nullptr;
}
return ctx;
}
// Tear down a Vitis AI encoder context: release the memory-mapped .rai
// buffer (when one was created) and delete the context. Null is a no-op.
void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
    if (ctx == nullptr) {
        return;
    }
    std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
    if (ctx->fbs_buffer != nullptr) {
        // drop the mapping established by whisper_vitisai_init()
        unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
    }
    delete ctx;
}
int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
if (!ctx || !mel || !out) {
std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
return 0;
}
if (ggml_n_dims(mel) != 2) {
std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
return 0;
}
if (ggml_n_dims(out) != 2) {
std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
return 0;
}
// setup input and output tensors for Vitis AI model
std::vector<flexmlrt::client::ErtTensorType> input_tensors, output_tensors;
auto model = ctx->runner;
// Get tensors as CPU tensors (hwTensor = false)
input_tensors = model->getIOTensors("input", false);
output_tensors = model->getIOTensors("output", false);
// TODO: add assert checks for tensor numbers and shapes
input_tensors[0].data = mel->data;
output_tensors[0].data = out->data;
try {
model->forward(input_tensors, output_tensors);
std::fprintf(stdout, "%s: Vitis AI model inference completed.\n", __func__);
} catch (const std::exception & e) {
std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
return 0;
}
return 1;
}

View File

@ -0,0 +1,32 @@
// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
#pragma once

// C-compatible interface to the Vitis AI Whisper encoder offload.
// Use #ifdef (not #if) so -Wundef builds stay clean, and pick headers per
// language: the previous unconditional C++ includes (<cstdbool> is deprecated
// in C++17 and removed in C++20) made the header unusable from C despite the
// extern "C" wrapper.
#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
extern "C" {
#else
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#endif

struct whisper_vitisai_context;

// Create an encoder context from a model path (typically a .rai cache).
// Returns NULL on failure.
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
void whisper_vitisai_free(struct whisper_vitisai_context * ctx);

// Function to mmap rai file for Linux and MapViewOfFile for Windows
bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size);

// Function to unmap rai file for Linux and UnmapViewOfFile for Windows
void unmap_rai_file(uint8_t * buffer, size_t size);

struct ggml_tensor;

// Run the encoder: mel (2-D) in, embeddings written into out (2-D).
// Returns 1 on success, 0 on failure.
int whisper_vitisai_encode(
    struct whisper_vitisai_context * ctx,
    struct ggml_tensor * mel,
    struct ggml_tensor * out);

#ifdef __cplusplus
}
#endif

View File

@ -14,6 +14,10 @@
#include "openvino/whisper-openvino-encoder.h"
#endif
#ifdef WHISPER_USE_VITISAI
#include "vitisai/whisper-vitisai-encoder.h"
#endif
#include <atomic>
#include <algorithm>
#include <cassert>
@ -903,6 +907,10 @@ struct whisper_state {
whisper_openvino_context * ctx_openvino = nullptr;
#endif
#ifdef WHISPER_USE_VITISAI
whisper_vitisai_context * ctx_vitisai = nullptr;
#endif
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg = 0;
int64_t t_last = 0;
@ -1970,7 +1978,13 @@ static bool whisper_encode_external(const whisper_state & wstate) {
const bool use_openvino = wstate.ctx_openvino != nullptr;
#endif
return use_coreml || use_openvino;
#ifndef WHISPER_USE_VITISAI
const bool use_vitisai = false;
#else
const bool use_vitisai = wstate.ctx_vitisai != nullptr;
#endif
return use_coreml || use_openvino || use_vitisai;
}
static struct ggml_cgraph * whisper_build_graph_conv(
@ -2411,6 +2425,8 @@ static bool whisper_encode_internal(
#if defined(WHISPER_USE_COREML)
whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
#elif defined(WHISPER_USE_VITISAI)
whisper_vitisai_encode(wstate.ctx_vitisai, mel, wstate.embd_enc);
#elif defined(WHISPER_USE_OPENVINO)
whisper_openvino_encode(wstate.ctx_openvino, mel, wstate.embd_enc);
#endif
@ -3346,6 +3362,20 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
}
#endif
#ifdef WHISPER_USE_VITISAI
// Derive the Vitis AI encoder cache path from the ggml model path: strip the
// file extension and append "-encoder-vitisai.rai". The extension is only
// stripped when the last '.' belongs to the file name itself — a bare rfind
// would otherwise truncate at a dot inside a directory component
// (e.g. "./models/mymodel" used to collapse to "-encoder-vitisai.rai").
static std::string whisper_get_vitisai_path_encoder_cache(std::string path_bin) {
    const auto pos_dot = path_bin.rfind('.');
    const auto pos_sep = path_bin.find_last_of("/\\");
    if (pos_dot != std::string::npos && (pos_sep == std::string::npos || pos_dot > pos_sep)) {
        path_bin.resize(pos_dot);
    }
    path_bin += "-encoder-vitisai.rai";
    return path_bin;
}
#endif
#ifdef WHISPER_USE_OPENVINO
// replace .bin with -encoder-openvino.xml
static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
@ -3455,6 +3485,19 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
}
#endif
#ifdef WHISPER_USE_VITISAI
const auto path_vitisai = whisper_get_vitisai_path_encoder_cache(ctx->path_model);
state->ctx_vitisai = whisper_vitisai_init(path_vitisai.c_str());
if (!state->ctx_vitisai) {
WHISPER_LOG_ERROR("%s: failed to load Vitis AI model from '%s'\n", __func__, path_vitisai.c_str());
whisper_free_state(state);
return nullptr;
} else {
WHISPER_LOG_INFO("%s: Vitis AI model loaded\n", __func__);
}
#endif
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
state->batch = whisper_batch_init(ctx->model.hparams.n_text_ctx, WHISPER_MAX_DECODERS);
@ -3821,6 +3864,13 @@ void whisper_free_state(struct whisper_state * state) {
}
#endif
#ifdef WHISPER_USE_VITISAI
if (state->ctx_vitisai != nullptr) {
whisper_vitisai_free(state->ctx_vitisai);
state->ctx_vitisai = nullptr;
}
#endif
whisper_batch_free(state->batch);
ggml_backend_sched_free(state->sched_conv.sched);
@ -4312,11 +4362,20 @@ static int whisper_has_openvino(void) {
#endif
}
// 1 when this build was compiled with Vitis AI support (WHISPER_USE_VITISAI defined), else 0.
static int whisper_has_vitisai(void) {
#ifdef WHISPER_USE_VITISAI
return 1;
#else
return 0;
#endif
}
const char * whisper_print_system_info(void) {
static std::string s;
s = "";
s += "WHISPER : ";
s += "VITISAI = " + std::to_string(whisper_has_vitisai()) + " | ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";