Sachin Kumawat 2026-04-23 18:51:45 +00:00 committed by GitHub
commit 0eb40f1138
6 changed files with 365 additions and 1 deletion

CMakeLists.txt

@@ -91,6 +91,7 @@ endif()
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
option(WHISPER_VITISAI "whisper: support for AMD Vitis AI" OFF)
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

README.md

@@ -21,6 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- [Vulkan support](#vulkan-gpu-support)
- Support for CPU-only inference
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
- [AMD Ryzen AI NPU Support](#amd-ryzen-ai-support-for-npu)
- [OpenVINO Support](#openvino-support)
- [Ascend NPU Support](#ascend-npu-support)
- [Moore Threads GPU Support](#moore-threads-gpu-support)
@@ -312,6 +313,48 @@ This can result in significant speedup in encoder performance. Here are the inst
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
## AMD Ryzen™ AI support for NPU
AMD's Ryzen™ AI 300 Series processors ship with a dedicated NPU, and you can now run Whisper models with the encoder fully offloaded to it. This brings a significant speedup compared to CPU-only inference.
> **Note:**
> **Ryzen™ AI NPU acceleration is currently supported on Windows only.** Linux support is planned for upcoming releases.
> For the latest updates on Ryzen AI, check out [the official documentation](https://ryzenai.docs.amd.com/en/latest/).
### Setup environment (Windows only)
- **Driver:** Make sure you have NPU driver version **.280 or newer** installed. [Download the latest drivers here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip)
- **Runtime libraries:** Download and install the necessary [runtime dependencies from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=flexmlrt1.7.0-win.zip).
- **Environment:** Extract the runtime package and set up the environment:
```powershell
tar xvf flexmlrt1.7.0-win.zip
flexmlrt\setup.bat
```
Your environment is now ready.
### Build whisper.cpp with Ryzen™ AI support
```bash
cmake -B build -DWHISPER_VITISAI=1
cmake --build build -j --config Release
```
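If CMake cannot locate the FlexML runtime (the build calls `find_package(FlexmlRT REQUIRED)`, as shown in the `src/CMakeLists.txt` change below), pointing `CMAKE_PREFIX_PATH` at the extracted runtime package is a reasonable first step; the path below is a placeholder:
```bash
# Placeholder path: substitute the directory where the flexmlrt package was extracted
cmake -B build -DWHISPER_VITISAI=1 -DCMAKE_PREFIX_PATH="C:/tools/flexmlrt"
cmake --build build -j --config Release
```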
### Download NPU-optimized models
- All NPU-supported Whisper models and their compiled `.rai` cache files are available in this collection:
https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
- Download the pre-compiled `.rai` cache file matching your desired model, and place it in your `models/` directory alongside its corresponding `ggml-<...>.bin` file.
The cache file must be named with the `-encoder-vitisai.rai` suffix. For example, if your model file is named `ggml-small.bin`, the cache file should be named `ggml-small-encoder-vitisai.rai` (see the layout sketch after the note below).
> **Note:** The ".rai" models from Hugging Face are pre-optimized for Ryzen™ AI NPUs, delivering acceleration benefits from the very first run (aside from any initial CPU-side caching overhead).
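With both files in place, the `models/` directory should look like this for the `small` model:
```
models/
├── ggml-small.bin                    # ggml model weights
└── ggml-small-encoder-vitisai.rai    # pre-compiled NPU encoder cache
```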
Run the examples as usual:
```bash
./build/bin/whisper-cli -m models/ggml-small.bin -f samples/jfk.wav
```
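With the Vitis AI build active, the startup log should include the markers added in this change (surrounding output and exact formatting will vary):
```
whisper_init_state: Vitis AI model loaded
WHISPER : VITISAI = 1 | COREML = 0 | OPENVINO = 0 | ...
```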
## NVIDIA GPU support
With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.

src/CMakeLists.txt

@@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
endif()
if (WHISPER_VITISAI)
find_package(FlexmlRT REQUIRED)
endif()
#
# libraries
#
@@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif()
if (WHISPER_VITISAI)
set(TARGET whisper.vitisai)
add_library(${TARGET} OBJECT
vitisai/whisper-vitisai-encoder.h
vitisai/whisper-vitisai-encoder.cpp
)
target_include_directories(${TARGET} PUBLIC
.
)
set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)
# Add C++17 standard for MSVC
if (MSVC)
target_compile_options(${TARGET} PRIVATE /std:c++17)
endif()
target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif()
# whisper
add_library(whisper
@@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
target_link_libraries(whisper PRIVATE whisper.openvino)
endif()
if (WHISPER_VITISAI)
target_link_libraries(whisper PRIVATE whisper.vitisai)
endif()
if (WHISPER_MKL)
target_link_libraries(whisper PRIVATE MKL::MKL)
endif()

src/vitisai/whisper-vitisai-encoder.cpp

@@ -0,0 +1,204 @@
#include "vitisai/whisper-vitisai-encoder.h"
#include "FlexMLClient.h"
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>
#include <cstdlib>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif
#include <cstring>
#include <string>
struct whisper_vitisai_context {
std::string model_path;
std::shared_ptr<flexmlrt::client::Model> runner;
// mmapped .rai cache buffer (if any); stays mapped for the runner's lifetime, released in whisper_vitisai_free
uint8_t * fbs_buffer = nullptr;
size_t fbs_buffer_size = 0;
};
// Function to mmap rai file for Linux and MapViewOfFile for Windows
static bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
#ifdef _WIN32
// Open the file
HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (hFile == INVALID_HANDLE_VALUE) {
std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
return false;
}
// Get the file size
LARGE_INTEGER fileSize;
if (!GetFileSizeEx(hFile, &fileSize)) {
CloseHandle(hFile);
std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
return false;
}
// Create a file mapping object covering the entire file (max size 0/0 means "whole file")
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
if (hMapping == NULL) {
CloseHandle(hFile);
std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
return false;
}
// Map the file (length 0 maps the full mapping)
*buffer = (uint8_t *)MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
if (*buffer == NULL) {
CloseHandle(hMapping);
CloseHandle(hFile);
std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
return false;
}
// The mapped view stays valid after both handles are closed, so close them here to avoid leaking them
CloseHandle(hMapping);
CloseHandle(hFile);
*size = fileSize.QuadPart;
return true;
#else
// Open the file
FILE * fd = fopen(path, "rb");
if (!fd) {
std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
return false;
}
// Get the file size
struct stat st;
if (fstat(fileno(fd), &st) == -1) {
fclose(fd);
std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
return false;
}
// Mmap the file; the mapping stays valid after the FILE handle is closed
*buffer = (uint8_t *)mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fd), 0);
if (*buffer == MAP_FAILED) {
fclose(fd);
std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
return false;
}
fclose(fd);
*size = st.st_size;
return true;
#endif // _WIN32
}
static void unmap_rai_file(uint8_t * buffer, size_t size) {
#ifdef _WIN32
UnmapViewOfFile(buffer);
#else
munmap(buffer, size);
#endif // _WIN32
}
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
if (!path_model) {
std::fprintf(stderr, "%s: path_model is null\n", __func__);
return nullptr;
}
auto * ctx = new whisper_vitisai_context;
ctx->model_path = path_model;
// Override the model path with the environment variable if it is set
if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
if (env_model_path[0] != '\0') {
ctx->model_path = env_model_path;
}
}
// Step 1: Set up the model
flexmlrt::client::Options options;
options.modelPath = ctx->model_path;
options.deviceName = "stx";
options.debug = false;
options.executeMode = 2;
options.extOptions["enable_preemption"] = true;
// Check whether model_path is a .rai cache file (suffix match, not substring) and if so,
// mmap it and pass fbs_buffer and fbs_buffer_size to the options
const std::string rai_ext = ".rai";
const bool is_rai_file = ctx->model_path.size() >= rai_ext.size() &&
ctx->model_path.compare(ctx->model_path.size() - rai_ext.size(), rai_ext.size(), rai_ext) == 0;
if (is_rai_file) {
// mmap rai file for both Linux and Windows and pass the buffer to the options
ctx->fbs_buffer = nullptr;
ctx->fbs_buffer_size = 0;
if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
options.extOptions["fbs_buffer"] = ctx->fbs_buffer;
options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
options.subgraphName = "vaiml_par_0";
options.extOptions["cache_dir"] = std::string(".");
} else {
std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
delete ctx;
return nullptr;
}
}
try {
ctx->runner = std::make_shared<flexmlrt::client::Model>(options);
if (!ctx->runner->good()) {
throw std::runtime_error("Runner creation ran into an error");
}
} catch (const std::exception & e) {
std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
delete ctx;
return nullptr;
}
return ctx;
}
void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
if (!ctx) {
return;
}
std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
if (ctx->fbs_buffer) {
unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
}
delete ctx;
}
int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
if (!ctx || !mel || !out) {
std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
return 0;
}
if (ggml_n_dims(mel) != 2) {
std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
return 0;
}
if (ggml_n_dims(out) != 2) {
std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
return 0;
}
// setup input and output tensors for Vitis AI model
std::vector<flexmlrt::client::ErtTensorType> input_tensors, output_tensors;
auto model = ctx->runner;
// Get tensors as CPU tensors (hwTensor = false)
input_tensors = model->getIOTensors("input", false);
output_tensors = model->getIOTensors("output", false);
if (input_tensors.empty() || output_tensors.empty()) {
std::fprintf(stderr, "%s: model returned no input/output tensors\n", __func__);
return 0;
}
// TODO: add assert checks for tensor shapes
input_tensors[0].data = mel->data;
output_tensors[0].data = out->data;
try {
model->forward(input_tensors, output_tensors);
#if defined(WHISPER_DEBUG)
std::fprintf(stderr, "%s: Vitis AI model inference completed.\n", __func__);
#endif
} catch (const std::exception & e) {
std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
return 0;
}
return 1;
}

src/vitisai/whisper-vitisai-encoder.h

@@ -0,0 +1,25 @@
#pragma once
// Use the C headers so this header stays usable from both C and C++
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
struct whisper_vitisai_context;
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
void whisper_vitisai_free(struct whisper_vitisai_context * ctx);
struct ggml_tensor;
int whisper_vitisai_encode(
struct whisper_vitisai_context * ctx,
struct ggml_tensor * mel,
struct ggml_tensor * out);
#ifdef __cplusplus
}
#endif
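For orientation, here is a minimal usage sketch of the API declared above; tensor allocation is elided and the model path is a placeholder (inside `whisper.cpp` itself the path is derived from the ggml model name, as shown below):
```cpp
#include "vitisai/whisper-vitisai-encoder.h"

// mel: 2-D mel spectrogram input, out: 2-D encoder output,
// both pre-allocated by the caller (whisper.cpp uses wstate.embd_enc for `out`)
static int encode_on_npu(struct ggml_tensor * mel, struct ggml_tensor * out) {
    struct whisper_vitisai_context * vctx =
        whisper_vitisai_init("models/ggml-small-encoder-vitisai.rai"); // placeholder path
    if (!vctx) {
        return 0; // missing cache file, driver, or runtime
    }
    const int ok = whisper_vitisai_encode(vctx, mel, out); // returns 1 on success, 0 on failure
    whisper_vitisai_free(vctx);
    return ok;
}
```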

src/whisper.cpp

@@ -14,6 +14,10 @@
#include "openvino/whisper-openvino-encoder.h"
#endif
#ifdef WHISPER_USE_VITISAI
#include "vitisai/whisper-vitisai-encoder.h"
#endif
#include <atomic>
#include <algorithm>
#include <cassert>
@@ -903,6 +907,10 @@ struct whisper_state {
whisper_openvino_context * ctx_openvino = nullptr;
#endif
#ifdef WHISPER_USE_VITISAI
whisper_vitisai_context * ctx_vitisai = nullptr;
#endif
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg = 0;
int64_t t_last = 0;
@@ -1970,7 +1978,13 @@ static bool whisper_encode_external(const whisper_state & wstate) {
const bool use_openvino = wstate.ctx_openvino != nullptr;
#endif
#ifndef WHISPER_USE_VITISAI
const bool use_vitisai = false;
#else
const bool use_vitisai = wstate.ctx_vitisai != nullptr;
#endif
return use_coreml || use_openvino || use_vitisai;
}
static struct ggml_cgraph * whisper_build_graph_conv(
@@ -2411,6 +2425,8 @@ static bool whisper_encode_internal(
#if defined(WHISPER_USE_COREML)
whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
#elif defined(WHISPER_USE_VITISAI)
whisper_vitisai_encode(wstate.ctx_vitisai, mel, wstate.embd_enc);
#elif defined(WHISPER_USE_OPENVINO)
whisper_openvino_encode(wstate.ctx_openvino, mel, wstate.embd_enc);
#endif
@@ -3346,6 +3362,20 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
}
#endif
#ifdef WHISPER_USE_VITISAI
// replace extension with Vitis AI encoder artifact
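// e.g. "models/ggml-small.bin" -> "models/ggml-small-encoder-vitisai.rai"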
static std::string whisper_get_vitisai_path_encoder_cache(std::string path_bin) {
auto pos = path_bin.rfind('.');
if (pos != std::string::npos) {
path_bin = path_bin.substr(0, pos);
}
path_bin += "-encoder-vitisai.rai";
return path_bin;
}
#endif
#ifdef WHISPER_USE_OPENVINO
// replace .bin with -encoder-openvino.xml
static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
@@ -3455,6 +3485,19 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
}
#endif
#ifdef WHISPER_USE_VITISAI
const auto path_vitisai = whisper_get_vitisai_path_encoder_cache(ctx->path_model);
state->ctx_vitisai = whisper_vitisai_init(path_vitisai.c_str());
if (!state->ctx_vitisai) {
WHISPER_LOG_ERROR("%s: failed to load Vitis AI model from '%s'\n", __func__, path_vitisai.c_str());
whisper_free_state(state);
return nullptr;
} else {
WHISPER_LOG_INFO("%s: Vitis AI model loaded\n", __func__);
}
#endif
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
state->batch = whisper_batch_init(ctx->model.hparams.n_text_ctx, WHISPER_MAX_DECODERS);
@@ -3821,6 +3864,13 @@ void whisper_free_state(struct whisper_state * state) {
}
#endif
#ifdef WHISPER_USE_VITISAI
if (state->ctx_vitisai != nullptr) {
whisper_vitisai_free(state->ctx_vitisai);
state->ctx_vitisai = nullptr;
}
#endif
whisper_batch_free(state->batch);
ggml_backend_sched_free(state->sched_conv.sched);
@@ -4312,11 +4362,20 @@ static int whisper_has_openvino(void) {
#endif
}
static int whisper_has_vitisai(void) {
#ifdef WHISPER_USE_VITISAI
return 1;
#else
return 0;
#endif
}
const char * whisper_print_system_info(void) {
static std::string s;
s = "";
s += "WHISPER : ";
s += "VITISAI = " + std::to_string(whisper_has_vitisai()) + " | ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";