From 66e882aeedf387614316e0c94f7d59a815766c9b Mon Sep 17 00:00:00 2001
From: "Kumawat, Sachin" <sachin.kumawat@amd.com>
Date: Tue, 13 Jan 2026 14:14:27 -0800
Subject: [PATCH 1/4] Add VitisAI Plugin

* Added VitisAI encoder module placeholder files

* VitisAI build integration

* VitisAI encoder offload functional

* Clean up vitisai integration

* Add c++17 requirement for Windows

* Enabled preemption for windows runs

* Add model cache override option

* Remove vitisai premature log message

* Add rai support through file mapping

* Fixed flatbuffer loading

* Fixed Windows file mapping issue

* Update FlexmlRT resolution

* Use Flexmlrt wheel pkg to build VitisAI plugin

* Clean up

* Remove prints

* Change flexmlrt target from Shared to Interface

* Add c++17 requirement for Windows

* Enabled preemption for windows runs

* Add rai support through file mapping

* Fixed flatbuffer loading

* Fixed Windows file mapping issue

* Update FlexmlRT resolution

* Use Flexmlrt wheel pkg to build VitisAI plugin

* Clean up

* Remove prints

* Change flexmlrt target from Shared to Interface

* Cleanup FlexmlRT integration

* format fix

* Adding AMD Licenses

* Update CMakeLists.txt

Co-authored-by: Kumawat, Sachin <sachin.kumawat@amd.com>

* Update src/CMakeLists.txt

Co-authored-by: Kumawat, Sachin <sachin.kumawat@amd.com>

* Update whisper.cpp

* Added VitisAI encoder readme section

* Remove license headers from common files to whisper.cpp

---------

Co-authored-by: Sachin Kumawat <sachink@amd.com>
Co-authored-by: Jeff Lin <jeffylin@xilinx.com>
Co-authored-by: Lin <jefflin@amd.com>
Co-authored-by: Lin, Jeff (DCG-ENG) <jeff.lin@amd.com>
Co-authored-by: Iswarya Alex <iswaryaalex96@gmail.com>
Co-authored-by: Alex, Iswarya <Iswarya.Alex@amd.com>
---
 CMakeLists.txt                          |   1 +
 README.md                               |  29 ++++
 src/CMakeLists.txt                      |  32 ++++
 src/vitisai/whisper-vitisai-encoder.cpp | 204 ++++++++++++++++++++++++
 src/vitisai/whisper-vitisai-encoder.h   |  32 ++++
 src/whisper.cpp                         |  61 ++++++-
 6 files changed, 358 insertions(+), 1 deletion(-)
 create mode 100644 src/vitisai/whisper-vitisai-encoder.cpp
 create mode 100644 src/vitisai/whisper-vitisai-encoder.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b60bb045..a8c7347a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,6 +91,7 @@ endif()
 option(WHISPER_COREML                "whisper: enable Core ML framework"  OFF)
 option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 option(WHISPER_OPENVINO              "whisper: support for OpenVINO"      OFF)
+option(WHISPER_VITISAI               "whisper: support for AMD Vitis AI"  OFF)
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
diff --git a/README.md b/README.md
index 6d4988e6..0369f142 100644
--- a/README.md
+++ b/README.md
@@ -312,6 +312,35 @@ This can result in significant speedup in encoder performance. Here are the inst
 
 For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
 
+## VitisAI encoder support
+
+On AMD Ryzen AI NPU devices, you can run the Encoder via the VitisAI plugin to significantly accelerate the whisper models.
+
+- Prepare the AMD runtime packages (required before building):
+
+  - Obtain the XRT package and the FlexmlRT package from AMD. Both are distributed as tarballs or wheels.
+  - Copy the downloaded archives to a local path, extract them, and run the setup script from each extracted package in your shell (for example `source /path/to/xrt/setup.sh` and `source /path/to/flexmlrt/setup.sh`). Run these in every new shell you use to build or run `whisper.cpp`.
+
+- Fetch the prebuilt VitisAI encoder cache:
+
+  - Download the appropriate Whisper encoder `.rai` cache for your model size from the AMD collection on Hugging Face: https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
+  - Place and rename the downloaded `.rai` file as `<gguf_model>-encoder-vitisai.rai` alongside your ggml model files `<gguf_model>.bin`.
+
+- Build `whisper.cpp` with VitisAI support:
+
+  ```bash
+  cmake -B build -DWHISPER_VITISAI=1
+  cmake --build build -j --config Release
+  ```
+
+- Run the examples as usual. For example:
+
+  ```text
+  $ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
+  ```
+
+The VitisAI artifact from Huggingface is already optimized for Ryzen AI NPUs, there is no slow compilation needed. The acceleration advantage should be seen from first run itself apart from CPU caching overheads.
+
 ## NVIDIA GPU support
 
 With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 095a2791..6cba1c6e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
     find_package(OpenVINO REQUIRED COMPONENTS Runtime)
 endif()
 
+if (WHISPER_VITISAI)
+    find_package(FlexmlRT REQUIRED)
+endif()
+
 #
 # libraries
 #
@@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
     set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
 endif()
 
+if (WHISPER_VITISAI)
+    set(TARGET whisper.vitisai)
+
+    add_library(${TARGET} OBJECT
+        vitisai/whisper-vitisai-encoder.h
+        vitisai/whisper-vitisai-encoder.cpp
+        )
+
+    target_include_directories(${TARGET} PUBLIC
+        .
+        )
+
+    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
+    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)
+
+    # Add C++17 standard for MSVC
+    if (MSVC)
+        target_compile_options(${TARGET} PRIVATE /std:c++17)
+    endif()
+
+    target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)
+    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
+endif()
+
 # whisper
 
 add_library(whisper
@@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
     target_link_libraries(whisper PRIVATE whisper.openvino)
 endif()
 
+if (WHISPER_VITISAI)
+    target_link_libraries(whisper PRIVATE whisper.vitisai)
+endif()
+
 if (WHISPER_MKL)
     target_link_libraries(whisper PRIVATE MKL::MKL)
 endif()
diff --git a/src/vitisai/whisper-vitisai-encoder.cpp b/src/vitisai/whisper-vitisai-encoder.cpp
new file mode 100644
index 00000000..a6d20a88
--- /dev/null
+++ b/src/vitisai/whisper-vitisai-encoder.cpp
@@ -0,0 +1,204 @@
+// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#include "vitisai/whisper-vitisai-encoder.h"
+#include "FlexMLClient.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <cstdio>
+#include <cstdlib>
+#ifdef _WIN32
+    #include <windows.h>
+#else
+    #include <sys/mman.h>
+    #include <sys/stat.h>
+    #include <fcntl.h>
+#endif
+#include <cstring>
+#include <string>
+
+struct whisper_vitisai_context {
+    std::string model_path;                          // path of the encoder artifact actually loaded
+    std::shared_ptr<flexmlrt::client::Model> runner; // FlexmlRT model used by whisper_vitisai_encode
+    uint8_t * fbs_buffer = nullptr;                  // mapped .rai file; stays nullptr when nothing was mapped
+    size_t fbs_buffer_size = 0;                      // size in bytes of the mapping behind fbs_buffer
+};
+
+// Function to mmap rai file for Linux and MapViewOfFile for Windows
+bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
+#ifdef _WIN32
+    // Open the file
+    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (hFile == INVALID_HANDLE_VALUE) {
+        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // Get the file size
+    LARGE_INTEGER fileSize;
+    if (!GetFileSizeEx(hFile, &fileSize)) {
+        CloseHandle(hFile);
+        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // Create a file mapping object (maximum size 0/0 = current file size, avoids truncating a 64-bit size to a DWORD)
+    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+    if (hMapping == NULL) {
+        CloseHandle(hFile);
+        std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // Map the whole file; a mapped view keeps the mapping object alive, so both handles can be closed right away
+    *buffer = (uint8_t *)MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+    CloseHandle(hMapping);
+    CloseHandle(hFile);
+    if (*buffer == NULL) {
+        std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+    *size = fileSize.QuadPart;
+    return true;
+#else
+    // Open the file
+    FILE * fd = fopen(path, "rb");
+    if (!fd) {
+        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // Get the file size
+    struct stat st;
+    if (fstat(fileno(fd), &st) == -1) {
+        fclose(fd);
+        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+
+    // Mmap the file; the mapping stays valid after the stream is closed, so close it on every path
+    *buffer = (uint8_t *)mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fd), 0);
+    fclose(fd);
+    if (*buffer == MAP_FAILED) {
+        std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
+        return false;
+    }
+    *size = st.st_size;
+    return true;
+#endif // _WIN32
+}
+
+void unmap_rai_file(uint8_t * buffer, size_t size) {
+#ifdef _WIN32
+    UnmapViewOfFile(buffer);
+#else
+    munmap(buffer, size);
+#endif // _WIN32
+}
+
+struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
+    if (!path_model) {
+        std::fprintf(stderr, "%s: path_model is null\n", __func__);
+        return nullptr;
+    }
+
+    auto * ctx = new whisper_vitisai_context(); // value-initialize: fbs_buffer/fbs_buffer_size start as nullptr/0
+    ctx->model_path = path_model;
+
+    // Override the model path with the environment variable if it is set
+    if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
+        if (env_model_path[0] != '\0') {
+            ctx->model_path = env_model_path;
+        }
+    }
+
+    // Step 1: Set up the model
+    flexmlrt::client::Options options;
+    options.modelPath = ctx->model_path;
+    options.deviceName = "stx";
+    options.debug = false;
+    options.executeMode = 2;
+    options.extOptions["ai_analyzer_profiling"] = true; // Enable AIA profiling
+    options.extOptions["enable_preemption"] = true;
+
+    // Check if model_path is rai file and if so, add fbs_buffer and fbs_buffer_size to the options
+    if (ctx->model_path.find(".rai") != std::string::npos) {
+        // mmap rai file for both Linux and Windows and pass the buffer to the options
+        ctx->fbs_buffer = nullptr;
+        ctx->fbs_buffer_size = 0;
+        if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
+            options.extOptions["fbs_buffer"] = ctx->fbs_buffer;
+            options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
+            options.subgraphName = "vaiml_par_0";
+            options.extOptions["cache_dir"] = std::string(".");
+        } else {
+            std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
+            delete ctx; // nothing is mapped on this path (fbs_buffer may hold MAP_FAILED), so do not unmap
+            return nullptr;
+        }
+    }
+
+    try {
+        ctx->runner = std::make_shared<flexmlrt::client::Model>(options);
+
+        if (!ctx->runner->good()) {
+            throw std::runtime_error("Runner creation ran into an error");
+        }
+    } catch (const std::exception & e) {
+        std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
+        whisper_vitisai_free(ctx); // also unmaps the .rai file if it was mapped above (plain delete leaked it)
+        return nullptr;
+    }
+    return ctx;
+}
+
+void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
+    if (!ctx) {
+        return;
+    }
+    std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
+    ctx->runner.reset(); // release the runner first: it was handed a pointer into fbs_buffer at creation
+    if (ctx->fbs_buffer) {
+        unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
+    }
+    delete ctx;
+}
+
+int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
+    if (!ctx || !mel || !out) {
+        std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
+        return 0;
+    }
+
+    if (ggml_n_dims(mel) != 2) {
+        std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
+        return 0;
+    }
+
+    if (ggml_n_dims(out) != 2) {
+        std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
+        return 0;
+    }
+
+    // setup input and output tensors for Vitis AI model, fetched as CPU tensors (hwTensor = false)
+    std::vector<flexmlrt::client::ErtTensorType> input_tensors, output_tensors;
+    auto model = ctx->runner;
+    input_tensors = model->getIOTensors("input", false);
+    output_tensors = model->getIOTensors("output", false);
+    // TODO: add assert checks for tensor numbers and shapes; for now only guard the [0] accesses below
+    if (input_tensors.empty() || output_tensors.empty()) {
+        std::fprintf(stderr, "%s: Vitis AI model reported no input or output tensors\n", __func__);
+        return 0;
+    }
+    input_tensors[0].data = mel->data;
+    output_tensors[0].data = out->data;
+
+    try {
+        model->forward(input_tensors, output_tensors);
+        std::fprintf(stdout, "%s: Vitis AI model inference completed.\n", __func__);
+    } catch (const std::exception & e) {
+        std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
+        return 0;
+    }
+
+    return 1;
+}
diff --git a/src/vitisai/whisper-vitisai-encoder.h b/src/vitisai/whisper-vitisai-encoder.h
new file mode 100644
index 00000000..05dc812b
--- /dev/null
+++ b/src/vitisai/whisper-vitisai-encoder.h
@@ -0,0 +1,32 @@
+// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdbool>
+#include <cstdint>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+struct whisper_vitisai_context;
+
+struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
+void whisper_vitisai_free(struct whisper_vitisai_context * ctx);
+
+// Function to mmap rai file for Linux and MapViewOfFile for Windows
+bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size);
+// Function to unmap rai file for Linux and UnmapViewOfFile for Windows
+void unmap_rai_file(uint8_t * buffer, size_t size);
+
+struct ggml_tensor;
+
+int whisper_vitisai_encode(
+    struct whisper_vitisai_context * ctx,
+    struct ggml_tensor * mel,
+    struct ggml_tensor * out);
+
+#if __cplusplus
+}
+#endif
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 5b6e4b4b..59dd59c5 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -14,6 +14,10 @@
 #include "openvino/whisper-openvino-encoder.h"
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+#include "vitisai/whisper-vitisai-encoder.h"
+#endif
+
 #include <atomic>
 #include <algorithm>
 #include <cassert>
@@ -903,6 +907,10 @@ struct whisper_state {
     whisper_openvino_context * ctx_openvino = nullptr;
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+    whisper_vitisai_context * ctx_vitisai = nullptr;
+#endif
+
     // [EXPERIMENTAL] token-level timestamps data
     int64_t t_beg  = 0;
     int64_t t_last = 0;
@@ -1970,7 +1978,13 @@ static bool whisper_encode_external(const whisper_state & wstate) {
     const bool use_openvino = wstate.ctx_openvino != nullptr;
 #endif
 
-    return use_coreml || use_openvino;
+#ifndef WHISPER_USE_VITISAI
+    const bool use_vitisai = false;
+#else
+    const bool use_vitisai = wstate.ctx_vitisai != nullptr;
+#endif
+
+    return use_coreml || use_openvino || use_vitisai;
 }
 
 static struct ggml_cgraph * whisper_build_graph_conv(
@@ -2411,6 +2425,8 @@ static bool whisper_encode_internal(
 
 #if defined(WHISPER_USE_COREML)
             whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
+#elif defined(WHISPER_USE_VITISAI)
+            whisper_vitisai_encode(wstate.ctx_vitisai, mel, wstate.embd_enc);
 #elif defined(WHISPER_USE_OPENVINO)
             whisper_openvino_encode(wstate.ctx_openvino, mel, wstate.embd_enc);
 #endif
@@ -3346,6 +3362,20 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
 }
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+// replace extension with Vitis AI encoder artifact
+static std::string whisper_get_vitisai_path_encoder_cache(std::string path_bin) {
+    const auto sep = path_bin.find_last_of("/\\");
+    const auto dot = path_bin.rfind('.');
+    // strip the extension only when the '.' is in the file name, not in a parent directory
+    if (dot != std::string::npos && (sep == std::string::npos || dot > sep)) {
+        path_bin.resize(dot);
+    }
+    path_bin += "-encoder-vitisai.rai";
+    return path_bin;
+}
+#endif
+
 #ifdef WHISPER_USE_OPENVINO
 // replace .bin with-encoder-openvino.xml
 static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
@@ -3455,6 +3485,19 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     }
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+    const auto path_vitisai = whisper_get_vitisai_path_encoder_cache(ctx->path_model);
+
+    state->ctx_vitisai = whisper_vitisai_init(path_vitisai.c_str());
+    if (!state->ctx_vitisai) {
+        WHISPER_LOG_ERROR("%s: failed to load Vitis AI model from '%s'\n", __func__, path_vitisai.c_str());
+        whisper_free_state(state);
+        return nullptr;
+    } else {
+        WHISPER_LOG_INFO("%s: Vitis AI model loaded\n", __func__);
+    }
+#endif
+
     state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
 
     state->batch = whisper_batch_init(ctx->model.hparams.n_text_ctx, WHISPER_MAX_DECODERS);
@@ -3821,6 +3864,13 @@ void whisper_free_state(struct whisper_state * state) {
         }
 #endif
 
+#ifdef WHISPER_USE_VITISAI
+        if (state->ctx_vitisai != nullptr) {
+            whisper_vitisai_free(state->ctx_vitisai);
+            state->ctx_vitisai = nullptr;
+        }
+#endif
+
         whisper_batch_free(state->batch);
 
         ggml_backend_sched_free(state->sched_conv.sched);
@@ -4312,11 +4362,20 @@ static int whisper_has_openvino(void) {
 #endif
 }
 
+static int whisper_has_vitisai(void) {
+#ifdef WHISPER_USE_VITISAI
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 const char * whisper_print_system_info(void) {
     static std::string s;
 
     s  = "";
     s += "WHISPER : ";
+    s += "VITISAI = "   + std::to_string(whisper_has_vitisai())    + " | ";
     s += "COREML = "    + std::to_string(whisper_has_coreml())     + " | ";
     s += "OPENVINO = "  + std::to_string(whisper_has_openvino())   + " | ";
 

From 1a98960e5c21e8d2aadf5316d65b8615cfdb2eee Mon Sep 17 00:00:00 2001
From: Iswarya Alex <47045679+iswaryaalex@users.noreply.github.com>
Date: Tue, 27 Jan 2026 11:51:47 -0800
Subject: [PATCH 2/4] Update README.md

- RAI EULA Links
- Updated for RAI Whisper instructions
---
 README.md | 56 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 0369f142..91a4b114 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - [Vulkan support](#vulkan-gpu-support)
 - Support for CPU-only inference
 - [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
+- [AMD Ryzen AI NPU Support](#amd-ryzen-ai-support-for-npu)
 - [OpenVINO Support](#openvino-support)
 - [Ascend NPU Support](#ascend-npu-support)
 - [Moore Threads GPU Support](#moore-threads-gpu-support)
@@ -312,34 +313,47 @@ This can result in significant speedup in encoder performance. Here are the inst
 
 For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
 
-## VitisAI encoder support
+## AMD Ryzen™ AI support for NPU
 
-On AMD Ryzen AI NPU devices, you can run the Encoder via the VitisAI plugin to significantly accelerate the whisper models.
+On AMD Ryzen™ AI 300 Series processors with a dedicated NPU, you can now run Whisper models with the encoder fully offloaded to the NPU. This brings a significant speedup compared to CPU-only inference.
+> **Note:**  
+> **Ryzen™ AI NPU acceleration is currently supported on Windows only.** Linux support is planned for upcoming releases.  
+> For the latest updates on Ryzen AI, check out [the official documentation](https://ryzenai.docs.amd.com/en/latest/).
 
-- Prepare the AMD runtime packages (required before building):
+### Setup environment (Windows only)
 
-  - Obtain the XRT package and the FlexmlRT package from AMD. Both are distributed as tarballs or wheels.
-  - Copy the downloaded archives to a local path, extract them, and run the setup script from each extracted package in your shell (for example `source /path/to/xrt/setup.sh` and `source /path/to/flexmlrt/setup.sh`). Run these in every new shell you use to build or run `whisper.cpp`.
-
-- Fetch the prebuilt VitisAI encoder cache:
-
-  - Download the appropriate Whisper encoder `.rai` cache for your model size from the AMD collection on Hugging Face: https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
-  - Place and rename the downloaded `.rai` file as `<gguf_model>-encoder-vitisai.rai` alongside your ggml model files `<gguf_model>.bin`.
-
-- Build `whisper.cpp` with VitisAI support:
-
-  ```bash
-  cmake -B build -DWHISPER_VITISAI=1
-  cmake --build build -j --config Release
+- **Driver:** Make sure you have NPU drivers version **.280 or newer** installed. [Download latest drivers from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip)
+- **Runtime libraries:** Download and install the necessary [runtime dependencies from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=flexmlrt1.7.0-win.zip).
+- **Environment:** Extract the runtime package and set up the environment:
+  ```powershell
+  tar xvf flexmlrt1.7.0-win.zip
+  flexmlrt\setup.bat
   ```
+Your environment is now ready.
 
-- Run the examples as usual. For example:
+### Build Whisper.cpp for Ryzen™ AI support
 
-  ```text
-  $ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
-  ```
+```bash
+cmake -B build -DWHISPER_VITISAI=1
+cmake --build build -j --config Release
+```
+
+### Download NPU-optimized models
+
+- All NPU-supported Whisper models and their compiled `.rai` cache files are available in this collection:  
+  https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
+- Download the pre-compiled `.rai` cache file matching your desired model, and place it in your `models/` directory alongside its corresponding `ggml-<...>.bin` file.
+  The cache file must be named with the `-encoder-vitisai.rai` suffix. For example, if your model file is named `ggml-small.bin`, the cache file should be named `ggml-small-encoder-vitisai.rai`.
+
+
+> **Note:** The ".rai" models from Hugging Face are pre-optimized for Ryzen™ AI NPUs, delivering acceleration benefits from the very first run (aside from any initial CPU-side caching overhead).
+
+Run the examples as usual:
+
+```bash
+./build/bin/whisper-cli -m models/ggml-small.bin -f samples/jfk.wav
+```
 
-The VitisAI artifact from Huggingface is already optimized for Ryzen AI NPUs, there is no slow compilation needed. The acceleration advantage should be seen from first run itself apart from CPU caching overheads.
 
 ## NVIDIA GPU support
 

From 175b9a53451b13e103b37194c8cb9f66de5c91fa Mon Sep 17 00:00:00 2001
From: Sachin Kumawat <sachink@amd.com>
Date: Thu, 26 Feb 2026 12:42:08 -0800
Subject: [PATCH 3/4] Cleanup and add runtime print debug guard

---
 src/vitisai/whisper-vitisai-encoder.cpp | 9 +++++----
 src/vitisai/whisper-vitisai-encoder.h   | 7 -------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/vitisai/whisper-vitisai-encoder.cpp b/src/vitisai/whisper-vitisai-encoder.cpp
index a6d20a88..c10e1c37 100644
--- a/src/vitisai/whisper-vitisai-encoder.cpp
+++ b/src/vitisai/whisper-vitisai-encoder.cpp
@@ -1,4 +1,3 @@
-// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
 #include "vitisai/whisper-vitisai-encoder.h"
 #include "FlexMLClient.h"
 #include "ggml.h"
@@ -24,7 +23,7 @@ struct whisper_vitisai_context {
 };
 
 // Function to mmap rai file for Linux and MapViewOfFile for Windows
-bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
+static bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
 #ifdef _WIN32
     // Open the file
     HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
@@ -87,7 +86,7 @@ bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
 #endif // _WIN32
 }
 
-void unmap_rai_file(uint8_t * buffer, size_t size) {
+static void unmap_rai_file(uint8_t * buffer, size_t size) {
 #ifdef _WIN32
     UnmapViewOfFile(buffer);
 #else
@@ -194,7 +193,9 @@ int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_ten
 
     try {
         model->forward(input_tensors, output_tensors);
-        std::fprintf(stdout, "%s: Vitis AI model inference completed.\n", __func__);
+#if defined(WHISPER_DEBUG)
+        std::fprintf(stderr, "%s: Vitis AI model inference completed.\n", __func__);
+#endif
     } catch (const std::exception & e) {
         std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
         return 0;
diff --git a/src/vitisai/whisper-vitisai-encoder.h b/src/vitisai/whisper-vitisai-encoder.h
index 05dc812b..840ce694 100644
--- a/src/vitisai/whisper-vitisai-encoder.h
+++ b/src/vitisai/whisper-vitisai-encoder.h
@@ -1,5 +1,3 @@
-// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
-
 #pragma once
 
 #include <cstddef>
@@ -15,11 +13,6 @@ struct whisper_vitisai_context;
 struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
 void whisper_vitisai_free(struct whisper_vitisai_context * ctx);
 
-// Function to mmap rai file for Linux and MapViewOfFile for Windows
-bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size);
-// Function to unmap rai file for Linux and UnmapViewOfFile for Windows
-void unmap_rai_file(uint8_t * buffer, size_t size);
-
 struct ggml_tensor;
 
 int whisper_vitisai_encode(

From 988a4af6f1b7a1153b5d37e483d4e7f5caa605b4 Mon Sep 17 00:00:00 2001
From: Sachin Kumawat <sachink@amd.com>
Date: Tue, 3 Mar 2026 16:37:29 -0800
Subject: [PATCH 4/4] turn off profiling

---
 src/vitisai/whisper-vitisai-encoder.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/vitisai/whisper-vitisai-encoder.cpp b/src/vitisai/whisper-vitisai-encoder.cpp
index c10e1c37..580bcfe3 100644
--- a/src/vitisai/whisper-vitisai-encoder.cpp
+++ b/src/vitisai/whisper-vitisai-encoder.cpp
@@ -116,7 +116,6 @@ struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
     options.deviceName = "stx";
     options.debug = false;
     options.executeMode = 2;
-    options.extOptions["ai_analyzer_profiling"] = true; // Enable AIA profiling
     options.extOptions["enable_preemption"] = true;
 
     // Check if model_path is rai file and if so, add fbs_buffer and fbs_buffer_size to the options