Merge 988a4af6f1 into fc674574ca
This commit is contained in:
commit
0eb40f1138
|
|
@ -91,6 +91,7 @@ endif()
|
|||
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
|
||||
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
|
||||
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
|
||||
option(WHISPER_VITISAI "whisper: support for AMD Vitis AI" OFF)
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
|
|
|||
43
README.md
43
README.md
|
|
@ -21,6 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
|
|||
- [Vulkan support](#vulkan-gpu-support)
|
||||
- Support for CPU-only inference
|
||||
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
|
||||
- [AMD Ryzen AI NPU Support](#amd-ryzen-ai-support-for-npu)
|
||||
- [OpenVINO Support](#openvino-support)
|
||||
- [Ascend NPU Support](#ascend-npu-support)
|
||||
- [Moore Threads GPU Support](#moore-threads-gpu-support)
|
||||
|
|
@ -312,6 +313,48 @@ This can result in significant speedup in encoder performance. Here are the inst
|
|||
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
|
||||
|
||||
## AMD Ryzen™ AI support for NPU
|
||||
|
||||
On AMD's Ryzen™ AI 300 Series with dedicated NPUs for acceleration, you can now run Whisper models with the ability to fully offload the encoder to NPU. This brings significant speedup compared to CPU-only.
|
||||
> **Note:**
|
||||
> **Ryzen™ AI NPU acceleration is currently supported on Windows only.** Linux support is planned for upcoming releases.
|
||||
> For the latest updates on Ryzen AI, check out [the official documentation](https://ryzenai.docs.amd.com/en/latest/).
|
||||
|
||||
### Setup environment (Windows only)
|
||||
|
||||
- **Driver:** Make sure you have NPU drivers version **.280 or newer** installed. [Download latest drivers from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip)
|
||||
- **Runtime libraries:** Download and install the necessary [runtime dependencies from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=flexmlrt1.7.0-win.zip).
|
||||
- **Environment:** Extract the runtime package and set up the environment:
|
||||
```powershell
|
||||
tar xvf flexmlrt1.7.0-win.zip
|
||||
flexmlrt\setup.bat
|
||||
```
|
||||
Your environment is now ready.
|
||||
|
||||
### Build Whisper.cpp for Ryzen™ AI support
|
||||
|
||||
```bash
|
||||
cmake -B build -DWHISPER_VITISAI=1
|
||||
cmake --build build -j --config Release
|
||||
```
|
||||
|
||||
### Download NPU-optimized models
|
||||
|
||||
- All NPU-supported Whisper models and their compiled `.rai` cache files are available in this collection:
|
||||
https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
|
||||
- Download the pre-compiled `.rai` cache file matching your desired model, and place it in your `models/` directory alongside its corresponding `ggml-<...>.bin` file.
|
||||
The cache file must be named with the `-encoder-vitisai.rai` suffix. For example, if your model file is named `ggml-small.bin`, the cache file should be named `ggml-small-encoder-vitisai.rai`.
|
||||
|
||||
|
||||
> **Note:** The ".rai" models from Hugging Face are pre-optimized for Ryzen™ AI NPUs, delivering acceleration benefits from the very first run (aside from any initial CPU-side caching overhead).
|
||||
|
||||
Run the examples as usual:
|
||||
|
||||
```bash
|
||||
./build/bin/whisper-cli -m models/ggml-small.bin -f samples/jfk.wav
|
||||
```
|
||||
|
||||
|
||||
## NVIDIA GPU support
|
||||
|
||||
With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
|
||||
|
|
|
|||
|
|
@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
|
|||
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
|
||||
endif()
|
||||
|
||||
if (WHISPER_VITISAI)
|
||||
find_package(FlexmlRT REQUIRED)
|
||||
endif()
|
||||
|
||||
#
|
||||
# libraries
|
||||
#
|
||||
|
|
@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
|
|||
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
|
||||
endif()
|
||||
|
||||
if (WHISPER_VITISAI)
|
||||
set(TARGET whisper.vitisai)
|
||||
|
||||
add_library(${TARGET} OBJECT
|
||||
vitisai/whisper-vitisai-encoder.h
|
||||
vitisai/whisper-vitisai-encoder.cpp
|
||||
)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC
|
||||
.
|
||||
)
|
||||
|
||||
set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)
|
||||
|
||||
# Add C++17 standard for MSVC
|
||||
if (MSVC)
|
||||
target_compile_options(${TARGET} PRIVATE /std:c++17)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)
|
||||
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
|
||||
endif()
|
||||
|
||||
# whisper
|
||||
|
||||
add_library(whisper
|
||||
|
|
@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
|
|||
target_link_libraries(whisper PRIVATE whisper.openvino)
|
||||
endif()
|
||||
|
||||
if (WHISPER_VITISAI)
|
||||
target_link_libraries(whisper PRIVATE whisper.vitisai)
|
||||
endif()
|
||||
|
||||
if (WHISPER_MKL)
|
||||
target_link_libraries(whisper PRIVATE MKL::MKL)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,204 @@
|
|||
#include "vitisai/whisper-vitisai-encoder.h"
#include "FlexMLClient.h"
#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif
|
||||
|
||||
struct whisper_vitisai_context {
|
||||
std::string model_path;
|
||||
std::shared_ptr<flexmlrt::client::Model> runner;
|
||||
uint8_t * fbs_buffer;
|
||||
size_t fbs_buffer_size;
|
||||
};
|
||||
|
||||
// Function to mmap rai file for Linux and MapViewOfFile for Windows
|
||||
// Memory-map a .rai cache file read-only — MapViewOfFile on Windows, mmap on
// POSIX. On success, *buffer/*size describe the mapping (release it with
// unmap_rai_file()) and true is returned; on failure an error is printed to
// stderr and false is returned.
static bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
#ifdef _WIN32
    // Open the file
    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Get the file size
    LARGE_INTEGER fileSize;
    if (!GetFileSizeEx(hFile, &fileSize)) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Create a file mapping covering the whole file. Passing 0/0 for the
    // max-size dwords means "entire file" and avoids the previous bug of
    // stuffing a 64-bit QuadPart into the low-dword parameter (truncated
    // for files >= 4 GiB).
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    if (hMapping == NULL) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Map the whole file (bytes-to-map == 0 means the entire mapping)
    *buffer = (uint8_t *) MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);

    // The view keeps the mapping and file alive on its own; close both
    // handles immediately so they are not leaked for the lifetime of the map.
    CloseHandle(hMapping);
    CloseHandle(hFile);

    if (*buffer == NULL) {
        std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    *size = (size_t) fileSize.QuadPart;
    return true;
#else
    // Open the file
    FILE * fp = fopen(path, "rb");
    if (!fp) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Get the file size
    struct stat st;
    if (fstat(fileno(fp), &st) == -1) {
        fclose(fp);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Mmap the file
    *buffer = (uint8_t *) mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fp), 0);

    // POSIX guarantees the mapping stays valid after the descriptor is
    // closed; closing here fixes the FILE* leak on the success path.
    fclose(fp);

    if (*buffer == MAP_FAILED) {
        std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }
    *size = st.st_size;
    return true;
#endif // _WIN32
}
|
||||
|
||||
// Release a mapping previously created by map_rai_file().
static void unmap_rai_file(uint8_t * buffer, size_t size) {
#ifdef _WIN32
    (void) size; // Win32 unmaps the whole view from its base address
    UnmapViewOfFile(buffer);
#else
    munmap(buffer, size);
#endif // _WIN32
}
|
||||
|
||||
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
|
||||
if (!path_model) {
|
||||
std::fprintf(stderr, "%s: path_model is null\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * ctx = new whisper_vitisai_context;
|
||||
ctx->model_path = path_model;
|
||||
|
||||
// Override the model path with the environment variable if it is set
|
||||
if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
|
||||
if (env_model_path[0] != '\0') {
|
||||
ctx->model_path = env_model_path;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 1: Set up the model
|
||||
flexmlrt::client::Options options;
|
||||
options.modelPath = ctx->model_path;
|
||||
options.deviceName = "stx";
|
||||
options.debug = false;
|
||||
options.executeMode = 2;
|
||||
options.extOptions["enable_preemption"] = true;
|
||||
|
||||
// Check if model_path is rai file and if so, add fbs_buffer and fbs_buffer_size to the options
|
||||
if (ctx->model_path.find(".rai") != std::string::npos) {
|
||||
// mmap rai file for both Linux and Windows and pass the buffer to the options
|
||||
ctx->fbs_buffer = nullptr;
|
||||
ctx->fbs_buffer_size = 0;
|
||||
if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
|
||||
options.extOptions["fbs_buffer"] = ctx->fbs_buffer;
|
||||
options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
|
||||
options.subgraphName = "vaiml_par_0";
|
||||
options.extOptions["cache_dir"] = std::string(".");
|
||||
} else {
|
||||
std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
|
||||
delete ctx;
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
ctx->runner = std::make_shared<flexmlrt::client::Model>(options);
|
||||
|
||||
if (!ctx->runner->good()) {
|
||||
throw std::runtime_error("Runner creation ran into an error");
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
|
||||
delete ctx;
|
||||
return nullptr;
|
||||
}
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
|
||||
if (!ctx) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
|
||||
if (ctx->fbs_buffer) {
|
||||
unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
|
||||
}
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
|
||||
if (!ctx || !mel || !out) {
|
||||
std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ggml_n_dims(mel) != 2) {
|
||||
std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ggml_n_dims(out) != 2) {
|
||||
std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
|
||||
return 0;
|
||||
}
|
||||
|
||||
// setup input and output tensors for Vitis AI model
|
||||
std::vector<flexmlrt::client::ErtTensorType> input_tensors, output_tensors;
|
||||
auto model = ctx->runner;
|
||||
|
||||
// Get tensors as CPU tensors (hwTensor = false)
|
||||
input_tensors = model->getIOTensors("input", false);
|
||||
output_tensors = model->getIOTensors("output", false);
|
||||
|
||||
// TODO: add assert checks for tensor numbers and shapes
|
||||
|
||||
input_tensors[0].data = mel->data;
|
||||
output_tensors[0].data = out->data;
|
||||
|
||||
try {
|
||||
model->forward(input_tensors, output_tensors);
|
||||
#if defined(WHISPER_DEBUG)
|
||||
std::fprintf(stderr, "%s: Vitis AI model inference completed.\n", __func__);
|
||||
#endif
|
||||
} catch (const std::exception & e) {
|
||||
std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
#pragma once

// C-compatible interface to the AMD Vitis AI (Ryzen AI NPU) Whisper encoder.
// The implementation lives in vitisai/whisper-vitisai-encoder.cpp.
//
// Fixed relative to the original: use #ifdef (not #if) for __cplusplus, and
// do not feed C++-only headers (<cstddef>/<cstdbool>/<cstdint>) to a C
// translation unit — <cstdbool> is additionally removed in C++20 and unused.

#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
extern "C" {
#else
#include <stddef.h>
#include <stdint.h>
#endif

// opaque handle; allocated by whisper_vitisai_init, released by whisper_vitisai_free
struct whisper_vitisai_context;

// load the Vitis AI encoder model from 'path_model'; returns NULL on failure
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
// release the context; safe to call with NULL
void whisper_vitisai_free(struct whisper_vitisai_context * ctx);

struct ggml_tensor;

// run the encoder on a 2-D mel tensor, writing into the 2-D out tensor;
// returns 1 on success, 0 on failure
int whisper_vitisai_encode(
        struct whisper_vitisai_context * ctx,
        struct ggml_tensor * mel,
        struct ggml_tensor * out);

#ifdef __cplusplus
}
#endif
|
||||
|
|
@ -14,6 +14,10 @@
|
|||
#include "openvino/whisper-openvino-encoder.h"
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
#include "vitisai/whisper-vitisai-encoder.h"
|
||||
#endif
|
||||
|
||||
#include <atomic>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
|
@ -903,6 +907,10 @@ struct whisper_state {
|
|||
whisper_openvino_context * ctx_openvino = nullptr;
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
whisper_vitisai_context * ctx_vitisai = nullptr;
|
||||
#endif
|
||||
|
||||
// [EXPERIMENTAL] token-level timestamps data
|
||||
int64_t t_beg = 0;
|
||||
int64_t t_last = 0;
|
||||
|
|
@ -1970,7 +1978,13 @@ static bool whisper_encode_external(const whisper_state & wstate) {
|
|||
const bool use_openvino = wstate.ctx_openvino != nullptr;
|
||||
#endif
|
||||
|
||||
return use_coreml || use_openvino;
|
||||
#ifndef WHISPER_USE_VITISAI
|
||||
const bool use_vitisai = false;
|
||||
#else
|
||||
const bool use_vitisai = wstate.ctx_vitisai != nullptr;
|
||||
#endif
|
||||
|
||||
return use_coreml || use_openvino || use_vitisai;
|
||||
}
|
||||
|
||||
static struct ggml_cgraph * whisper_build_graph_conv(
|
||||
|
|
@ -2411,6 +2425,8 @@ static bool whisper_encode_internal(
|
|||
|
||||
#if defined(WHISPER_USE_COREML)
|
||||
whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
|
||||
#elif defined(WHISPER_USE_VITISAI)
|
||||
whisper_vitisai_encode(wstate.ctx_vitisai, mel, wstate.embd_enc);
|
||||
#elif defined(WHISPER_USE_OPENVINO)
|
||||
whisper_openvino_encode(wstate.ctx_openvino, mel, wstate.embd_enc);
|
||||
#endif
|
||||
|
|
@ -3346,6 +3362,20 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
// replace the trailing extension (if any) with the Vitis AI encoder
// cache suffix, e.g. "ggml-small.bin" -> "ggml-small-encoder-vitisai.rai"
static std::string whisper_get_vitisai_path_encoder_cache(std::string path_bin) {
    const auto dot = path_bin.rfind('.');
    const std::string base = (dot == std::string::npos)
        ? path_bin
        : path_bin.substr(0, dot);

    return base + "-encoder-vitisai.rai";
}
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_OPENVINO
|
||||
// replace .bin with -encoder-openvino.xml
|
||||
static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
|
||||
|
|
@ -3455,6 +3485,19 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
const auto path_vitisai = whisper_get_vitisai_path_encoder_cache(ctx->path_model);
|
||||
|
||||
state->ctx_vitisai = whisper_vitisai_init(path_vitisai.c_str());
|
||||
if (!state->ctx_vitisai) {
|
||||
WHISPER_LOG_ERROR("%s: failed to load Vitis AI model from '%s'\n", __func__, path_vitisai.c_str());
|
||||
whisper_free_state(state);
|
||||
return nullptr;
|
||||
} else {
|
||||
WHISPER_LOG_INFO("%s: Vitis AI model loaded\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
|
||||
|
||||
state->batch = whisper_batch_init(ctx->model.hparams.n_text_ctx, WHISPER_MAX_DECODERS);
|
||||
|
|
@ -3821,6 +3864,13 @@ void whisper_free_state(struct whisper_state * state) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef WHISPER_USE_VITISAI
|
||||
if (state->ctx_vitisai != nullptr) {
|
||||
whisper_vitisai_free(state->ctx_vitisai);
|
||||
state->ctx_vitisai = nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
whisper_batch_free(state->batch);
|
||||
|
||||
ggml_backend_sched_free(state->sched_conv.sched);
|
||||
|
|
@ -4312,11 +4362,20 @@ static int whisper_has_openvino(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
// report whether this build was compiled with Vitis AI support (1) or not (0)
static int whisper_has_vitisai(void) {
#ifdef WHISPER_USE_VITISAI
    const int has_vitisai = 1;
#else
    const int has_vitisai = 0;
#endif
    return has_vitisai;
}
|
||||
|
||||
const char * whisper_print_system_info(void) {
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
s += "WHISPER : ";
|
||||
s += "VITISAI = " + std::to_string(whisper_has_vitisai()) + " | ";
|
||||
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
|
||||
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue