sycl: Add optional USM system allocations (llama/22526)

This introduces an optional feature to allocate large GPU buffers (≥ 1GB)
using USM system allocations if supported by the device. It allows using
buffers from the system allocator then letting the system manage memory
migrations between host and device as necessary.

This feature is disabled by default and requires the GGML_SYCL_USM_SYSTEM
environment variable to enable. If USM system allocations are not supported
by the device or the system, we fallback to regular allocations.

This feature can allow VRAM overcommit. For example, the test below fails
on B580 due to lack of memory for allocation, but it passes when enabling
USM system allocations:

  ./examples/sycl/test.sh -m Qwen3.5-27B-Q3_K_M.gguf -lv 4

Signed-off-by: Francois Dugast <francois.dugast@intel.com>
This commit is contained in:
Francois Dugast 2026-06-17 07:54:21 +02:00 committed by Georgi Gerganov
parent 694579182f
commit dd1a6ca897
2 changed files with 68 additions and 25 deletions

View File

@ -230,6 +230,7 @@ struct sycl_device_info {
size_t total_vram;
sycl_hw_info hw_info;
optimize_feature opt_feature;
bool usm_system_support; // support for USM system allocations
};

View File

@ -72,6 +72,9 @@
#include "ggml-sycl/gated_delta_net.hpp"
#include "ggml-sycl/pool.hpp"
#define MEM_SIZE_2M 0x00200000
#define MEM_SIZE_1G 0x40000000
static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
@ -83,7 +86,7 @@ int g_ggml_sycl_use_async_mem_op = 0;
int g_ggml_sycl_use_async_mem_op_requested = 1;
int g_ggml_sycl_enable_level_zero = 0;
int g_ggml_sycl_enable_flash_attention = 1;
int g_ggml_sycl_usm_system = 0;
static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
@ -137,6 +140,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
info.devices[i].smpbo = prop.get_local_mem_size();
info.devices[i].warp_size = WARP_SIZE;
info.devices[i].usm_system_support = device.has(sycl::aspect::usm_system_allocations);
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
@ -274,6 +278,8 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_flash_attention = 0;
#endif
g_ggml_sycl_usm_system = ggml_sycl_get_env("GGML_SYCL_USM_SYSTEM", 0);
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
GGML_LOG_INFO("Build with Macros:\n");
@ -342,6 +348,8 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_flash_attention);
#endif
GGML_LOG_INFO(" GGML_SYCL_USM_SYSTEM: %d\n", g_ggml_sycl_usm_system);
/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
@ -417,6 +425,14 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
inline void free_aligned_mem_host(void * memblock) {
#ifdef _WIN32
_aligned_free(memblock);
#else
free(memblock);
#endif
}
// sycl buffer
struct ggml_backend_sycl_buffer_context {
@ -426,9 +442,10 @@ struct ggml_backend_sycl_buffer_context {
std::string name;
optimize_feature opt_feature;
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
bool is_usm_system;
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
device(device), dev_ptr(dev_ptr), stream(stream) {
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream, bool is_usm_system) :
device(device), dev_ptr(dev_ptr), stream(stream), is_usm_system(is_usm_system) {
check_allow_gpu_index(device);
name = (GGML_SYCL_NAME + std::to_string(device));
opt_feature = ggml_sycl_info().devices[device].opt_feature;
@ -437,7 +454,10 @@ struct ggml_backend_sycl_buffer_context {
~ggml_backend_sycl_buffer_context() {
if (dev_ptr != nullptr) {
ggml_sycl_set_device(device);
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
if (is_usm_system)
free_aligned_mem_host(dev_ptr);
else
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
}
//release extra used by tensors
@ -759,21 +779,59 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t
return ctx->name.c_str();
}
static bool check_usm_system(int device, size_t size) {
bool use_usm_system = g_ggml_sycl_usm_system && size >= MEM_SIZE_1G;
if (use_usm_system && !ggml_sycl_info().devices[device].usm_system_support) {
GGML_LOG_INFO("Device does not support USM system allocations\n");
use_usm_system = false;
}
return use_usm_system;
}
inline void * aligned_malloc_host(size_t alignment, size_t size) {
#ifdef _WIN32
return _aligned_malloc(size, alignment);
#else
return aligned_alloc(alignment, size);
#endif
}
static ggml_backend_buffer_t
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size_t size) try {
ggml_check_sycl();
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
ggml_sycl_set_device(buft_ctx->device);
const queue_ptr stream = buft_ctx->stream;
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
/*
Alignment below ensures best performance. While in theory it could lead to
wasting memory, this is acceptable because in practice only few buffers are
allocated and even less exceed the minimum size accepted here for USM system
allocations.
*/
size_t alignment = MEM_SIZE_2M;
size_t aligned_size = ((size + alignment - 1) / alignment) * alignment;
bool use_usm_system = check_usm_system(buft_ctx->device, aligned_size);
void * dev_ptr;
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
if (use_usm_system) {
dev_ptr = (void *)aligned_malloc_host(alignment, aligned_size);
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
return nullptr;
}
} else {
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
}
}
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream, use_usm_system);
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
}
catch (sycl::exception const &exc) {
@ -1300,22 +1358,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
GGML_UNUSED(buft);
}
inline void * aligned_malloc_host(size_t alignment, size_t size) {
#ifdef _WIN32
return _aligned_malloc(size, alignment);
#else
return aligned_alloc(alignment, size);
#endif
}
inline void free_aligned_mem_host(void * memblock) {
#ifdef _WIN32
_aligned_free(memblock);
#else
free(memblock);
#endif
}
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free_aligned_mem_host((void *)buffer->context);
}