SYCL: implement ggml_sycl_pool_vmm (llama/22862)

* SYCL: implement ggml_sycl_pool_vmm

* Add an option to bypass VMM with GGML_SYCL_DISABLE_VMM

* Clean up debugging logging

* document GGML_SYCL_DISABLE_VMM

* Multi-stream MoE optimization

* Revert "Multi-stream MoE optimization"

This reverts commit 938929c3f13a562ec67c59e87cc5d38595444cce.

* Update common.hpp

Co-authored-by: Neo Zhang <zhang.jianyu@outlook.com>

* Flip GGML_SYCL_DISABLE_VMM to GGML_SYCL_ENABLE_VMM

* add logging for GGML_SYCL_ENABLE_VMM when extension is not available (SYCL_EXT_ONEAPI_VIRTUAL_MEM macro)

* Apply suggestions from code review

Co-authored-by: Alexey Kopytko <alexey@kopytko.com>

* Apply suggestion from @sanmai

* Apply suggestion from @sanmai

---------

Co-authored-by: Neo Zhang <zhang.jianyu@outlook.com>
This commit is contained in:
Alexey Kopytko 2026-05-26 13:59:00 +09:00 committed by Georgi Gerganov
parent 00a5110b19
commit 049f0af339
2 changed files with 163 additions and 11 deletions

View File

@ -224,6 +224,7 @@ struct sycl_device_info {
int max_wg_per_cu; // max work groups per compute unit - refer to
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
bool vmm; // virtual memory support
size_t vmm_granularity; // granularity of virtual memory
size_t total_vram;
sycl_hw_info hw_info;
optimize_feature opt_feature;
@ -244,6 +245,8 @@ struct ggml_sycl_device_info {
const ggml_sycl_device_info & ggml_sycl_info();
static constexpr size_t SYCL_BUFFER_ALIGNMENT = 128;
struct ggml_sycl_pool {
virtual ~ggml_sycl_pool() = default;

View File

@ -19,6 +19,7 @@
#include <cstdlib>
#include <float.h>
#include <limits>
#include <optional>
#include <stdint.h>
#include <stdio.h>
#include <vector>
@ -37,6 +38,11 @@
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
#endif
#if SYCL_EXT_ONEAPI_VIRTUAL_MEM
# include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
# include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
# define GGML_SYCL_USE_VMM
#endif
#include <sycl/half_type.hpp>
#include "ggml.h"
@ -70,6 +76,7 @@ int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
int g_ggml_sycl_disable_graph = 0;
int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_enable_vmm = 1;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_use_async_mem_op = 0;
int g_ggml_sycl_use_async_mem_op_requested = 1;
@ -96,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() {
// GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
// #endif
for (int i = 0; i < info.device_count; ++i) {
info.devices[i].vmm = 0;
dpct::device_info prop;
auto & device = dpct::dev_mgr::instance().get_device(i);
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, device)));
#if !defined(GGML_SYCL_USE_VMM)
info.devices[i].vmm = 0;
#else
info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
if (info.devices[i].vmm) {
// NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
// but the L0 API requires a larger page size for allocs above 2 MiB and
// rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
// Here we clamp it to 2 MiB for simplicity, but other devices may require
// calling zeVirtualMemQueryPageSize or yet unexposed public API.
const size_t physical_page = 2ull << 20; // 2 MiB
info.devices[i].vmm_granularity = std::max<size_t>(
sycl::ext::oneapi::experimental::get_mem_granularity(
device, sycl::context(device)),
physical_page);
}
#endif
info.default_tensor_split[i] = total_vram;
total_vram += prop.get_global_mem_size();
@ -234,6 +258,7 @@ static void ggml_check_sycl() try {
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
@ -275,6 +300,11 @@ static void ggml_check_sycl() try {
#else
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
#endif
#if defined(GGML_SYCL_USE_VMM)
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
#else
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n");
#endif
GGML_LOG_INFO("Running with Environment Variables:\n");
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
@ -293,6 +323,11 @@ static void ggml_check_sycl() try {
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
#else
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
#endif
#if defined(GGML_SYCL_USE_VMM)
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
#else
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
#endif
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
@ -754,7 +789,7 @@ catch (sycl::exception const &exc) {
}
static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 128;
return SYCL_BUFFER_ALIGNMENT;
GGML_UNUSED(buft);
}
@ -1177,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
}
static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 128;
return SYCL_BUFFER_ALIGNMENT;
GGML_UNUSED(buft);
}
@ -1462,6 +1497,121 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
}
};
// pool with virtual memory management
#if defined(GGML_SYCL_USE_VMM)
struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
int device;
sycl::context ctx;
sycl::device dev;
uintptr_t pool_addr = 0;
size_t pool_used = 0;
size_t pool_size = 0;
size_t granularity;
// physical_mem owns the commits (unlike cuMemMap)
struct mapping {
sycl::ext::oneapi::experimental::physical_mem phys;
void * map_ptr;
};
std::vector<mapping> mappings;
explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
device(device_),
ctx(qptr_->get_context()),
dev(qptr_->get_device()),
granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
}
~ggml_sycl_pool_vmm() {
if (pool_addr == 0) {
return;
}
// Per spec, unmap must (a) match the exact (ptr, size) of an earlier
// physical_mem::map() call and (b) precede destruction of the
// physical_mem objects (their dtors won't unmap).
for (auto & m : mappings) {
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
m.map_ptr, m.phys.size(), ctx)));
}
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
}
void * alloc(size_t size, size_t * actual_size) override {
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
size_t avail = pool_size - pool_used;
if (size > avail) {
// round up to the next multiple of the granularity
size_t reserve_size = GGML_PAD(size - avail, granularity);
GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
// allocate more physical memory
std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
// reserve virtual address space (if not already reserved)
if (pool_addr == 0) {
SYCL_CHECK(CHECK_TRY_ERROR(
pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
SYCL_POOL_VMM_MAX_SIZE, ctx)));
}
// map at the end of the pool
void * map_ptr = nullptr;
SYCL_CHECK(CHECK_TRY_ERROR(
map_ptr = phys->map(pool_addr + pool_size, reserve_size,
sycl::ext::oneapi::experimental::address_access_mode::read_write)));
// stash these so we could unmap this exact range in dtor
mappings.push_back({
std::move(*phys),
map_ptr,
});
// add to the pool
pool_size += reserve_size;
#ifdef DEBUG_SYCL_MALLOC
GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
device, (unsigned long long) (pool_size/1024/1024),
(unsigned long long) (reserve_size/1024/1024));
#endif
}
GGML_ASSERT(pool_addr != 0);
void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
*actual_size = size;
pool_used += size;
#ifdef DEBUG_SYCL_MALLOC
GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
#endif
return ptr;
}
void free(void * ptr, size_t size) override {
#ifdef DEBUG_SYCL_MALLOC
GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
#endif
pool_used -= size;
// all deallocations must be in reverse order of the allocations
GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
}
};
#endif // defined(GGML_SYCL_USE_VMM)
struct ggml_sycl_pool_host : public ggml_sycl_pool {
queue_ptr qptr;
int device;
@ -1542,20 +1692,19 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
}
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
// TBD: NO VMM support
// if (ggml_sycl_info().devices[device].vmm) {
// return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
// }
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
#if defined(GGML_SYCL_USE_VMM)
if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
}
#endif // defined(GGML_SYCL_USE_VMM)
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
}
std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
}
// TBD pool with virtual memory management
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
/// kernels
typedef void (*ggml_sycl_op_mul_mat_t)(
ggml_backend_sycl_context & ctx,