diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 5fb1a1d6b..9ec94464b 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -230,6 +230,7 @@ struct sycl_device_info { size_t total_vram; sycl_hw_info hw_info; optimize_feature opt_feature; + bool usm_system_support; // support for USM system allocations }; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 0900fade6..f029f6325 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -72,6 +72,9 @@ #include "ggml-sycl/gated_delta_net.hpp" #include "ggml-sycl/pool.hpp" +#define MEM_SIZE_2M 0x00200000 +#define MEM_SIZE_1G 0x40000000 + static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; int g_ggml_sycl_disable_optimize = 0; @@ -83,7 +86,7 @@ int g_ggml_sycl_use_async_mem_op = 0; int g_ggml_sycl_use_async_mem_op_requested = 1; int g_ggml_sycl_enable_level_zero = 0; int g_ggml_sycl_enable_flash_attention = 1; - +int g_ggml_sycl_usm_system = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -137,6 +140,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); info.devices[i].smpbo = prop.get_local_mem_size(); info.devices[i].warp_size = WARP_SIZE; + info.devices[i].usm_system_support = device.has(sycl::aspect::usm_system_allocations); info.max_work_group_sizes[i] = prop.get_max_work_group_size(); info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units(); @@ -274,6 +278,8 @@ static void ggml_check_sycl() try { g_ggml_sycl_enable_flash_attention = 0; #endif + g_ggml_sycl_usm_system = ggml_sycl_get_env("GGML_SYCL_USM_SYSTEM", 0); + GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Build with Macros:\n"); @@ -342,6 +348,8 @@ static void ggml_check_sycl() try { g_ggml_sycl_enable_flash_attention); #endif + GGML_LOG_INFO(" GGML_SYCL_USM_SYSTEM: %d\n", g_ggml_sycl_usm_system); + /* NOT REMOVE, keep it for next optimize for XMX. #if defined(SYCL_USE_XMX) fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); @@ -417,6 +425,14 @@ catch (sycl::exception const &exc) { std::exit(1); } +inline void free_aligned_mem_host(void * memblock) { +#ifdef _WIN32 + _aligned_free(memblock); +#else + free(memblock); +#endif +} + // sycl buffer struct ggml_backend_sycl_buffer_context { @@ -426,9 +442,10 @@ struct ggml_backend_sycl_buffer_context { std::string name; optimize_feature opt_feature; std::vector tensor_extras; + bool is_usm_system; - ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) : - device(device), dev_ptr(dev_ptr), stream(stream) { + ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream, bool is_usm_system) : + device(device), dev_ptr(dev_ptr), stream(stream), is_usm_system(is_usm_system) { check_allow_gpu_index(device); name = (GGML_SYCL_NAME + std::to_string(device)); opt_feature = ggml_sycl_info().devices[device].opt_feature; @@ -437,7 +454,10 @@ struct ggml_backend_sycl_buffer_context { ~ggml_backend_sycl_buffer_context() { if (dev_ptr != nullptr) { ggml_sycl_set_device(device); - SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream))); + if (is_usm_system) + free_aligned_mem_host(dev_ptr); + else + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream))); } //release extra used by tensors @@ -759,21 +779,59 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t return ctx->name.c_str(); } +static bool check_usm_system(int device, size_t size) { + bool use_usm_system = g_ggml_sycl_usm_system && size >= MEM_SIZE_1G; + + if (use_usm_system && !ggml_sycl_info().devices[device].usm_system_support) { + GGML_LOG_INFO("Device does not support USM system allocations\n"); + use_usm_system = false; + } + + return use_usm_system; +} + +inline void * aligned_malloc_host(size_t alignment, size_t size) { +#ifdef _WIN32 + return _aligned_malloc(size, alignment); +#else + return aligned_alloc(alignment, size); +#endif +} + static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { + ggml_check_sycl(); + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; ggml_sycl_set_device(buft_ctx->device); const queue_ptr stream = buft_ctx->stream; size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 + /* + Alignment below ensures best performance. While in theory it could lead to + wasting memory, this is acceptable because in practice only few buffers are + allocated and even less exceed the minimum size accepted here for USM system + allocations. + */ + size_t alignment = MEM_SIZE_2M; + size_t aligned_size = ((size + alignment - 1) / alignment) * alignment; + bool use_usm_system = check_usm_system(buft_ctx->device, aligned_size); void * dev_ptr; - SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream))); - if (!dev_ptr) { - GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); - return nullptr; + if (use_usm_system) { + dev_ptr = (void *)aligned_malloc_host(alignment, aligned_size); + if (!dev_ptr) { + GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size); + return nullptr; + } + } else { + SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream))); + if (!dev_ptr) { + GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); + return nullptr; + } } - ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); + ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream, use_usm_system); return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); } catch (sycl::exception const &exc) { @@ -1300,22 +1358,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ GGML_UNUSED(buft); } -inline void * aligned_malloc_host(size_t alignment, size_t size) { -#ifdef _WIN32 - return _aligned_malloc(size, alignment); -#else - return aligned_alloc(alignment, size); -#endif -} - -inline void free_aligned_mem_host(void * memblock) { -#ifdef _WIN32 - _aligned_free(memblock); -#else - free(memblock); -#endif -} - static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { free_aligned_mem_host((void *)buffer->context); }