Add VitisAI Plugin

* Added VitisAI encoder module placeholder files

* VitisAI build integration

* VitisAI encoder offload functional

* Clean up vitisai integration

* Add c++17 requirement for Windows

* Enabled preemption for windows runs

* Add model cache override option

* Remove vitisai premature log message

* Add rai support through file mapping

* Fixed flatbuffer loading

* Fixed Windows file mapping issue

* Update FlexmlRT resolution

* Use Flexmlrt wheel pkg to build VitisAI plugin

* Clean up

* Remove prints

* Change flexmlrt target from Shared to Interface

* Add c++17 requirement for Windows

* Enabled preemption for windows runs

* Add rai support through file mapping

* Fixed flatbuffer loading

* Fixed Windows file mapping issue

* Update FlexmlRT resolution

* Use Flexmlrt wheel pkg to build VitisAI plugin

* Clean up

* Remove prints

* Change flexmlrt target from Shared to Interface

* Cleanup FlexmlRT integration

* format fix

* Adding AMD Licenses

* Update CMakeLists.txt

Co-authored-by: Kumawat, Sachin <sachin.kumawat@amd.com>

* Update src/CMakeLists.txt

Co-authored-by: Kumawat, Sachin <sachin.kumawat@amd.com>

* Update whisper.cpp

* Added VitisAI encoder readme section

* Remove license headers from common files to whisper.cpp

---------

Co-authored-by: Sachin Kumawat <sachink@amd.com>
Co-authored-by: Jeff Lin <jeffylin@xilinx.com>
Co-authored-by: Lin <jefflin@amd.com>
Co-authored-by: Lin, Jeff (DCG-ENG) <jeff.lin@amd.com>
Co-authored-by: Iswarya Alex <iswaryaalex96@gmail.com>
Co-authored-by: Alex, Iswarya <Iswarya.Alex@amd.com>
This commit is contained in:
Kumawat, Sachin 2026-01-13 14:14:27 -08:00 committed by Sachin Kumawat
parent a96310871a
commit 66e882aeed
6 changed files with 358 additions and 1 deletions

View File

@ -91,6 +91,7 @@ endif()
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
option(WHISPER_VITISAI "whisper: support for AMD Vitis AI" OFF)
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

View File

@ -312,6 +312,35 @@ This can result in significant speedup in encoder performance. Here are the inst
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
## VitisAI encoder support
On AMD Ryzen AI NPU devices, you can run the Encoder via the VitisAI plugin to significantly accelerate the whisper models.
- Prepare the AMD runtime packages (required before building):
- Obtain the XRT package and the FlexmlRT package from AMD. Both are distributed as tarballs or wheels.
- Copy the downloaded archives to a local path, extract them, and run the setup script from each extracted package in your shell (for example `source /path/to/xrt/setup.sh` and `source /path/to/flexmlrt/setup.sh`). Run these in every new shell you use to build or run `whisper.cpp`.
- Fetch the prebuilt VitisAI encoder cache:
- Download the appropriate Whisper encoder `.rai` cache for your model size from the AMD collection on Hugging Face: https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
- Rename the downloaded `.rai` file to `<gguf_model>-encoder-vitisai.rai` and place it alongside your ggml model file `<gguf_model>.bin`.
- Build `whisper.cpp` with VitisAI support:
```bash
cmake -B build -DWHISPER_VITISAI=1
cmake --build build -j --config Release
```
- Run the examples as usual. For example:
```text
$ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
```
The VitisAI artifact from Hugging Face is already optimized for Ryzen AI NPUs, so no slow compilation step is needed. Apart from CPU caching overheads, the acceleration advantage should be visible from the very first run.
## NVIDIA GPU support
With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.

View File

@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
endif()
if (WHISPER_VITISAI)
find_package(FlexmlRT REQUIRED)
endif()
#
# libraries
#
@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif()
if (WHISPER_VITISAI)
    set(TARGET whisper.vitisai)

    # Object library holding the Vitis AI encoder offload glue.
    add_library(${TARGET} OBJECT
        vitisai/whisper-vitisai-encoder.h
        vitisai/whisper-vitisai-encoder.cpp
        )

    target_include_directories(${TARGET} PUBLIC
        .
        )

    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)

    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)

    # The encoder sources require C++17; enforce it on every compiler,
    # not only MSVC (previously /std:c++17 was added for MSVC alone and
    # other toolchains relied on their defaults).
    target_compile_features(${TARGET} PRIVATE cxx_std_17)
    if (MSVC)
        target_compile_options(${TARGET} PRIVATE /std:c++17)
    endif()

    target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)

    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif()
# whisper
add_library(whisper
@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
target_link_libraries(whisper PRIVATE whisper.openvino)
endif()
if (WHISPER_VITISAI)
target_link_libraries(whisper PRIVATE whisper.vitisai)
endif()
if (WHISPER_MKL)
target_link_libraries(whisper PRIVATE MKL::MKL)
endif()

View File

@ -0,0 +1,204 @@
// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
#include "vitisai/whisper-vitisai-encoder.h"
#include "FlexMLClient.h"
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>
#include <cstdlib>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif
#include <cstring>
#include <string>
// State for one Vitis AI encoder instance.
struct whisper_vitisai_context {
    std::string model_path;                          // resolved path to the model artifact (.rai cache or other)
    std::shared_ptr<flexmlrt::client::Model> runner; // FlexmlRT inference session
    // Default-initialize these: whisper_vitisai_free() unconditionally tests
    // fbs_buffer, but whisper_vitisai_init() only assigns it on the .rai path,
    // so without initializers a non-.rai model left them as garbage (UB).
    uint8_t * fbs_buffer = nullptr; // memory-mapped .rai flatbuffer (null when not used)
    size_t fbs_buffer_size = 0;     // size of the mapping in bytes
};
// Map a .rai file read-only into memory (mmap on POSIX, MapViewOfFile on Windows).
// On success, *buffer/*size describe the mapping and true is returned; the
// underlying file handle/descriptor is closed in every path (the mapping stays
// valid until unmap_rai_file() is called). On failure, false is returned and
// *buffer/*size are untouched.
bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
#ifdef _WIN32
    // Open the file
    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Get the file size
    LARGE_INTEGER fileSize;
    if (!GetFileSizeEx(hFile, &fileSize)) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Create a file mapping object. Passing 0/0 for dwMaximumSizeHigh/Low maps
    // the entire file; the old code passed fileSize.QuadPart as the *low* DWORD,
    // silently truncating sizes >= 4 GiB.
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    if (hMapping == NULL) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Map the whole file (length 0 == to the end of the mapping)
    uint8_t * view = (uint8_t *) MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
    // The view keeps the mapping alive, so both handles can be closed right
    // away; previously they were leaked on the success path.
    CloseHandle(hMapping);
    CloseHandle(hFile);
    if (view == NULL) {
        std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    *buffer = view;
    *size   = (size_t) fileSize.QuadPart;
    return true;
#else
    // Open the file
    FILE * fd = fopen(path, "rb");
    if (!fd) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Get the file size
    struct stat st;
    if (fstat(fileno(fd), &st) == -1) {
        fclose(fd);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    // Mmap the file
    uint8_t * view = (uint8_t *) mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fd), 0);
    // The mapping remains valid after the FILE is closed; close it in every
    // path so the descriptor is not leaked on success (it previously was).
    fclose(fd);
    if (view == MAP_FAILED) {
        std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    *buffer = view;
    *size   = (size_t) st.st_size;
    return true;
#endif // _WIN32
}
// Release a mapping created by map_rai_file (munmap on POSIX,
// UnmapViewOfFile on Windows). Calling with a null buffer is a no-op,
// e.g. when the model was not a .rai file and nothing was ever mapped.
void unmap_rai_file(uint8_t * buffer, size_t size) {
    if (buffer == nullptr) {
        return;
    }
#ifdef _WIN32
    (void) size; // Windows unmaps by base address only
    UnmapViewOfFile(buffer);
#else
    munmap(buffer, size);
#endif // _WIN32
}
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
if (!path_model) {
std::fprintf(stderr, "%s: path_model is null\n", __func__);
return nullptr;
}
auto * ctx = new whisper_vitisai_context;
ctx->model_path = path_model;
// Override the model path with the environment variable if it is set
if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
if (env_model_path[0] != '\0') {
ctx->model_path = env_model_path;
}
}
// Step 1: Set up the model
flexmlrt::client::Options options;
options.modelPath = ctx->model_path;
options.deviceName = "stx";
options.debug = false;
options.executeMode = 2;
options.extOptions["ai_analyzer_profiling"] = true; // Enable AIA profiling
options.extOptions["enable_preemption"] = true;
// Check if model_path is rai file and if so, add fbs_buffer and fbs_buffer_size to the options
if (ctx->model_path.find(".rai") != std::string::npos) {
// mmap rai file for both Linux and Windows and pass the buffer to the options
ctx->fbs_buffer = nullptr;
ctx->fbs_buffer_size = 0;
if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
options.extOptions["fbs_buffer"] = ctx->fbs_buffer;
options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
options.subgraphName = "vaiml_par_0";
options.extOptions["cache_dir"] = std::string(".");
} else {
std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
delete ctx;
return nullptr;
}
}
try {
ctx->runner = std::make_shared<flexmlrt::client::Model>(options);
if (!ctx->runner->good()) {
throw std::runtime_error("Runner creation ran into an error");
}
} catch (const std::exception & e) {
std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
delete ctx;
return nullptr;
}
return ctx;
}
// Tear down a Vitis AI encoder context: release the memory-mapped .rai
// buffer (when one was created) and delete the context. Null is a no-op.
void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
    if (ctx == nullptr) {
        return;
    }
    std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
    if (ctx->fbs_buffer != nullptr) {
        // drop the mapping established by whisper_vitisai_init()
        unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
    }
    delete ctx;
}
int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
if (!ctx || !mel || !out) {
std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
return 0;
}
if (ggml_n_dims(mel) != 2) {
std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
return 0;
}
if (ggml_n_dims(out) != 2) {
std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
return 0;
}
// setup input and output tensors for Vitis AI model
std::vector<flexmlrt::client::ErtTensorType> input_tensors, output_tensors;
auto model = ctx->runner;
// Get tensors as CPU tensors (hwTensor = false)
input_tensors = model->getIOTensors("input", false);
output_tensors = model->getIOTensors("output", false);
// TODO: add assert checks for tensor numbers and shapes
input_tensors[0].data = mel->data;
output_tensors[0].data = out->data;
try {
model->forward(input_tensors, output_tensors);
std::fprintf(stdout, "%s: Vitis AI model inference completed.\n", __func__);
} catch (const std::exception & e) {
std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
return 0;
}
return 1;
}

View File

@ -0,0 +1,32 @@
// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
#pragma once

// C-compatible interface to the Vitis AI Whisper encoder offload.
// Use #ifdef (not #if) so -Wundef builds stay clean, and pick headers per
// language: the previous unconditional C++ includes (<cstdbool> is deprecated
// in C++17 and removed in C++20) made the header unusable from C despite the
// extern "C" wrapper.
#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
extern "C" {
#else
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#endif

struct whisper_vitisai_context;

// Create an encoder context from a model path (typically a .rai cache).
// Returns NULL on failure.
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
void whisper_vitisai_free(struct whisper_vitisai_context * ctx);

// Function to mmap rai file for Linux and MapViewOfFile for Windows
bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size);

// Function to unmap rai file for Linux and UnmapViewOfFile for Windows
void unmap_rai_file(uint8_t * buffer, size_t size);

struct ggml_tensor;

// Run the encoder: mel (2-D) in, embeddings written into out (2-D).
// Returns 1 on success, 0 on failure.
int whisper_vitisai_encode(
    struct whisper_vitisai_context * ctx,
    struct ggml_tensor * mel,
    struct ggml_tensor * out);

#ifdef __cplusplus
}
#endif

View File

@ -14,6 +14,10 @@
#include "openvino/whisper-openvino-encoder.h"
#endif
#ifdef WHISPER_USE_VITISAI
#include "vitisai/whisper-vitisai-encoder.h"
#endif
#include <atomic>
#include <algorithm>
#include <cassert>
@ -903,6 +907,10 @@ struct whisper_state {
whisper_openvino_context * ctx_openvino = nullptr;
#endif
#ifdef WHISPER_USE_VITISAI
whisper_vitisai_context * ctx_vitisai = nullptr;
#endif
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg = 0;
int64_t t_last = 0;
@ -1970,7 +1978,13 @@ static bool whisper_encode_external(const whisper_state & wstate) {
const bool use_openvino = wstate.ctx_openvino != nullptr;
#endif
return use_coreml || use_openvino;
#ifndef WHISPER_USE_VITISAI
const bool use_vitisai = false;
#else
const bool use_vitisai = wstate.ctx_vitisai != nullptr;
#endif
return use_coreml || use_openvino || use_vitisai;
}
static struct ggml_cgraph * whisper_build_graph_conv(
@ -2411,6 +2425,8 @@ static bool whisper_encode_internal(
#if defined(WHISPER_USE_COREML)
whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
#elif defined(WHISPER_USE_VITISAI)
whisper_vitisai_encode(wstate.ctx_vitisai, mel, wstate.embd_enc);
#elif defined(WHISPER_USE_OPENVINO)
whisper_openvino_encode(wstate.ctx_openvino, mel, wstate.embd_enc);
#endif
@ -3346,6 +3362,20 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
}
#endif
#ifdef WHISPER_USE_VITISAI
// Derive the Vitis AI encoder cache path from the ggml model path: strip the
// file extension and append "-encoder-vitisai.rai". The extension is only
// stripped when the last '.' belongs to the file name itself — a bare rfind
// would otherwise truncate at a dot inside a directory component
// (e.g. "./models/mymodel" used to collapse to "-encoder-vitisai.rai").
static std::string whisper_get_vitisai_path_encoder_cache(std::string path_bin) {
    const auto pos_dot = path_bin.rfind('.');
    const auto pos_sep = path_bin.find_last_of("/\\");
    if (pos_dot != std::string::npos && (pos_sep == std::string::npos || pos_dot > pos_sep)) {
        path_bin.resize(pos_dot);
    }
    path_bin += "-encoder-vitisai.rai";
    return path_bin;
}
#endif
#ifdef WHISPER_USE_OPENVINO
// replace .bin with -encoder-openvino.xml
static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
@ -3455,6 +3485,19 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
}
#endif
#ifdef WHISPER_USE_VITISAI
const auto path_vitisai = whisper_get_vitisai_path_encoder_cache(ctx->path_model);
state->ctx_vitisai = whisper_vitisai_init(path_vitisai.c_str());
if (!state->ctx_vitisai) {
WHISPER_LOG_ERROR("%s: failed to load Vitis AI model from '%s'\n", __func__, path_vitisai.c_str());
whisper_free_state(state);
return nullptr;
} else {
WHISPER_LOG_INFO("%s: Vitis AI model loaded\n", __func__);
}
#endif
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
state->batch = whisper_batch_init(ctx->model.hparams.n_text_ctx, WHISPER_MAX_DECODERS);
@ -3821,6 +3864,13 @@ void whisper_free_state(struct whisper_state * state) {
}
#endif
#ifdef WHISPER_USE_VITISAI
if (state->ctx_vitisai != nullptr) {
whisper_vitisai_free(state->ctx_vitisai);
state->ctx_vitisai = nullptr;
}
#endif
whisper_batch_free(state->batch);
ggml_backend_sched_free(state->sched_conv.sched);
@ -4312,11 +4362,20 @@ static int whisper_has_openvino(void) {
#endif
}
// 1 when this build was compiled with Vitis AI support (WHISPER_USE_VITISAI defined), else 0.
static int whisper_has_vitisai(void) {
#ifdef WHISPER_USE_VITISAI
return 1;
#else
return 0;
#endif
}
const char * whisper_print_system_info(void) {
static std::string s;
s = "";
s += "WHISPER : ";
s += "VITISAI = " + std::to_string(whisper_has_vitisai()) + " | ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";