diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 6d1953821..31e26ff48 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -224,6 +224,7 @@ struct sycl_device_info { int max_wg_per_cu; // max work groups per compute unit - refer to // cudaOccupancyMaxActiveBlocksPerMultiprocessor bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory size_t total_vram; sycl_hw_info hw_info; optimize_feature opt_feature; @@ -244,6 +245,8 @@ struct ggml_sycl_device_info { const ggml_sycl_device_info & ggml_sycl_info(); +static constexpr size_t SYCL_BUFFER_ALIGNMENT = 128; + struct ggml_sycl_pool { virtual ~ggml_sycl_pool() = default; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index b3fbb6211..729a88b4d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,11 @@ #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC # include #endif +#if SYCL_EXT_ONEAPI_VIRTUAL_MEM +# include +# include +# define GGML_SYCL_USE_VMM +#endif #include #include "ggml.h" @@ -70,6 +76,7 @@ int g_ggml_sycl_debug = 0; int g_ggml_sycl_disable_optimize = 0; int g_ggml_sycl_disable_graph = 0; int g_ggml_sycl_disable_dnn = 0; +int g_ggml_sycl_enable_vmm = 1; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; int g_ggml_sycl_use_async_mem_op_requested = 1; @@ -96,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() { // GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__); // #endif for (int i = 0; i < info.device_count; ++i) { - info.devices[i].vmm = 0; dpct::device_info prop; auto & device = dpct::dev_mgr::instance().get_device(i); SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( prop, device))); +#if !defined(GGML_SYCL_USE_VMM) + info.devices[i].vmm = 0; +#else + info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem); + if (info.devices[i].vmm) { + // NB: SYCL's get_mem_granularity always returns the _minimum_ granularity, + // but the L0 API requires a larger page size for allocs above 2 MiB and + // rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic]. + // Here we clamp it to 2 MiB for simplicity, but other devices may require + // calling zeVirtualMemQueryPageSize or yet unexposed public API. + const size_t physical_page = 2ull << 20; // 2 MiB + info.devices[i].vmm_granularity = std::max( + sycl::ext::oneapi::experimental::get_mem_granularity( + device, sycl::context(device)), + physical_page); + } +#endif + info.default_tensor_split[i] = total_vram; total_vram += prop.get_global_mem_size(); @@ -234,6 +258,7 @@ static void ggml_check_sycl() try { g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); + g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1); g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero); @@ -275,6 +300,11 @@ static void ggml_check_sycl() try { #else GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n"); #endif +#if defined(GGML_SYCL_USE_VMM) + GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n"); +#else + GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n"); +#endif GGML_LOG_INFO("Running with Environment Variables:\n"); GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); @@ -293,6 +323,11 @@ static void ggml_check_sycl() try { GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn); #else GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n"); +#endif +#if defined(GGML_SYCL_USE_VMM) + GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm); +#else + GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n"); #endif GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv); g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1); @@ -754,7 +789,7 @@ catch (sycl::exception const &exc) { } static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; + return SYCL_BUFFER_ALIGNMENT; GGML_UNUSED(buft); } @@ -1177,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg } static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; + return SYCL_BUFFER_ALIGNMENT; GGML_UNUSED(buft); } @@ -1462,6 +1497,121 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { } }; +// pool with virtual memory management +#if defined(GGML_SYCL_USE_VMM) +struct ggml_sycl_pool_vmm : public ggml_sycl_pool { + static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + + int device; + sycl::context ctx; + sycl::device dev; + + uintptr_t pool_addr = 0; + size_t pool_used = 0; + size_t pool_size = 0; + size_t granularity; + + // physical_mem owns the commits (unlike cuMemMap) + struct mapping { + sycl::ext::oneapi::experimental::physical_mem phys; + void * map_ptr; + }; + std::vector mappings; + + explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) : + device(device_), + ctx(qptr_->get_context()), + dev(qptr_->get_device()), + granularity(ggml_sycl_info().devices[device_].vmm_granularity) { + } + + ~ggml_sycl_pool_vmm() { + if (pool_addr == 0) { + return; + } + + // Per spec, unmap must (a) match the exact (ptr, size) of an earlier + // physical_mem::map() call and (b) precede destruction of the + // physical_mem objects (their dtors won't unmap). + for (auto & m : mappings) { + SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap( + m.map_ptr, m.phys.size(), ctx))); + } + SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem( + pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx))); + } + + void * alloc(size_t size, size_t * actual_size) override { + // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types + size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT); + + size_t avail = pool_size - pool_used; + + if (size > avail) { + // round up to the next multiple of the granularity + size_t reserve_size = GGML_PAD(size - avail, granularity); + + GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE); + + // allocate more physical memory + std::optional phys; + SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size))); + + // reserve virtual address space (if not already reserved) + if (pool_addr == 0) { + SYCL_CHECK(CHECK_TRY_ERROR( + pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem( + SYCL_POOL_VMM_MAX_SIZE, ctx))); + } + + // map at the end of the pool + void * map_ptr = nullptr; + SYCL_CHECK(CHECK_TRY_ERROR( + map_ptr = phys->map(pool_addr + pool_size, reserve_size, + sycl::ext::oneapi::experimental::address_access_mode::read_write))); + + // stash these so we could unmap this exact range in dtor + mappings.push_back({ + std::move(*phys), + map_ptr, + }); + + // add to the pool + pool_size += reserve_size; + +#ifdef DEBUG_SYCL_MALLOC + GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + device, (unsigned long long) (pool_size/1024/1024), + (unsigned long long) (reserve_size/1024/1024)); +#endif + } + + GGML_ASSERT(pool_addr != 0); + + void * ptr = reinterpret_cast(pool_addr + pool_used); + *actual_size = size; + pool_used += size; + +#ifdef DEBUG_SYCL_MALLOC + GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr); +#endif + + return ptr; + } + + void free(void * ptr, size_t size) override { +#ifdef DEBUG_SYCL_MALLOC + GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr); +#endif + + pool_used -= size; + + // all deallocations must be in reverse order of the allocations + GGML_ASSERT(ptr == reinterpret_cast(pool_addr + pool_used)); + } +}; +#endif // defined(GGML_SYCL_USE_VMM) + struct ggml_sycl_pool_host : public ggml_sycl_pool { queue_ptr qptr; int device; @@ -1542,20 +1692,19 @@ std::unique_ptr ggml_backend_sycl_context::new_pool_for_host(que } std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) { - // TBD: NO VMM support - // if (ggml_sycl_info().devices[device].vmm) { - // return std::unique_ptr(new ggml_sycl_pool_vmm(device)); - // } - return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); +#if defined(GGML_SYCL_USE_VMM) + if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) { + return std::unique_ptr(new ggml_sycl_pool_vmm(qptr, device)); + } +#endif // defined(GGML_SYCL_USE_VMM) + return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); } + std::unique_ptr ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) { return std::unique_ptr(new ggml_sycl_fattn_kv_buffers(qptr, device)); } -// TBD pool with virtual memory management -// struct ggml_sycl_pool_vmm : public ggml_sycl_pool - /// kernels typedef void (*ggml_sycl_op_mul_mat_t)( ggml_backend_sycl_context & ctx,