SYCL: implement ggml_sycl_pool_vmm (llama/22862)
* SYCL: implement ggml_sycl_pool_vmm * Add an option to bypass VMM with GGML_SYCL_DISABLE_VMM * Clean up debugging logging * document GGML_SYCL_DISABLE_VMM * Multi-stream MoE optimization * Revert "Multi-stream MoE optimization" This reverts commit 938929c3f13a562ec67c59e87cc5d38595444cce. * Update common.hpp Co-authored-by: Neo Zhang <zhang.jianyu@outlook.com> * Flip GGML_SYCL_DISABLE_VMM to GGML_SYCL_ENABLE_VMM * add logging for GGML_SYCL_ENABLE_VMM when extension is not available (SYCL_EXT_ONEAPI_VIRTUAL_MEM macro) * Apply suggestions from code review Co-authored-by: Alexey Kopytko <alexey@kopytko.com> * Apply suggestion from @sanmai * Apply suggestion from @sanmai --------- Co-authored-by: Neo Zhang <zhang.jianyu@outlook.com>
This commit is contained in:
parent
00a5110b19
commit
049f0af339
|
|
@ -224,6 +224,7 @@ struct sycl_device_info {
|
|||
int max_wg_per_cu; // max work groups per compute unit - refer to
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
bool vmm; // virtual memory support
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
size_t total_vram;
|
||||
sycl_hw_info hw_info;
|
||||
optimize_feature opt_feature;
|
||||
|
|
@ -244,6 +245,8 @@ struct ggml_sycl_device_info {
|
|||
|
||||
const ggml_sycl_device_info & ggml_sycl_info();
|
||||
|
||||
static constexpr size_t SYCL_BUFFER_ALIGNMENT = 128;
|
||||
|
||||
struct ggml_sycl_pool {
|
||||
virtual ~ggml_sycl_pool() = default;
|
||||
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@
|
|||
#include <cstdlib>
|
||||
#include <float.h>
|
||||
#include <limits>
|
||||
#include <optional>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
|
@ -37,6 +38,11 @@
|
|||
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
||||
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
|
||||
#endif
|
||||
#if SYCL_EXT_ONEAPI_VIRTUAL_MEM
|
||||
# include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
|
||||
# include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
|
||||
# define GGML_SYCL_USE_VMM
|
||||
#endif
|
||||
#include <sycl/half_type.hpp>
|
||||
|
||||
#include "ggml.h"
|
||||
|
|
@ -70,6 +76,7 @@ int g_ggml_sycl_debug = 0;
|
|||
int g_ggml_sycl_disable_optimize = 0;
|
||||
int g_ggml_sycl_disable_graph = 0;
|
||||
int g_ggml_sycl_disable_dnn = 0;
|
||||
int g_ggml_sycl_enable_vmm = 1;
|
||||
int g_ggml_sycl_prioritize_dmmv = 0;
|
||||
int g_ggml_sycl_use_async_mem_op = 0;
|
||||
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
||||
|
|
@ -96,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
|||
// GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
|
||||
// #endif
|
||||
for (int i = 0; i < info.device_count; ++i) {
|
||||
info.devices[i].vmm = 0;
|
||||
dpct::device_info prop;
|
||||
auto & device = dpct::dev_mgr::instance().get_device(i);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, device)));
|
||||
|
||||
#if !defined(GGML_SYCL_USE_VMM)
|
||||
info.devices[i].vmm = 0;
|
||||
#else
|
||||
info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
|
||||
if (info.devices[i].vmm) {
|
||||
// NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
|
||||
// but the L0 API requires a larger page size for allocs above 2 MiB and
|
||||
// rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
|
||||
// Here we clamp it to 2 MiB for simplicity, but other devices may require
|
||||
// calling zeVirtualMemQueryPageSize or yet unexposed public API.
|
||||
const size_t physical_page = 2ull << 20; // 2 MiB
|
||||
info.devices[i].vmm_granularity = std::max<size_t>(
|
||||
sycl::ext::oneapi::experimental::get_mem_granularity(
|
||||
device, sycl::context(device)),
|
||||
physical_page);
|
||||
}
|
||||
#endif
|
||||
|
||||
info.default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
|
|
@ -234,6 +258,7 @@ static void ggml_check_sycl() try {
|
|||
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
|
||||
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
|
||||
|
|
@ -275,6 +300,11 @@ static void ggml_check_sycl() try {
|
|||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n");
|
||||
#endif
|
||||
|
||||
GGML_LOG_INFO("Running with Environment Variables:\n");
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
||||
|
|
@ -293,6 +323,11 @@ static void ggml_check_sycl() try {
|
|||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
|
||||
#endif
|
||||
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
||||
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
|
|
@ -754,7 +789,7 @@ catch (sycl::exception const &exc) {
|
|||
}
|
||||
|
||||
static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
return SYCL_BUFFER_ALIGNMENT;
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
|
|
@ -1177,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
|
|||
}
|
||||
|
||||
static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
return SYCL_BUFFER_ALIGNMENT;
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
|
|
@ -1462,6 +1497,121 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
|||
}
|
||||
};
|
||||
|
||||
// pool with virtual memory management
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
|
||||
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
int device;
|
||||
sycl::context ctx;
|
||||
sycl::device dev;
|
||||
|
||||
uintptr_t pool_addr = 0;
|
||||
size_t pool_used = 0;
|
||||
size_t pool_size = 0;
|
||||
size_t granularity;
|
||||
|
||||
// physical_mem owns the commits (unlike cuMemMap)
|
||||
struct mapping {
|
||||
sycl::ext::oneapi::experimental::physical_mem phys;
|
||||
void * map_ptr;
|
||||
};
|
||||
std::vector<mapping> mappings;
|
||||
|
||||
explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
|
||||
device(device_),
|
||||
ctx(qptr_->get_context()),
|
||||
dev(qptr_->get_device()),
|
||||
granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
|
||||
}
|
||||
|
||||
~ggml_sycl_pool_vmm() {
|
||||
if (pool_addr == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Per spec, unmap must (a) match the exact (ptr, size) of an earlier
|
||||
// physical_mem::map() call and (b) precede destruction of the
|
||||
// physical_mem objects (their dtors won't unmap).
|
||||
for (auto & m : mappings) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
|
||||
m.map_ptr, m.phys.size(), ctx)));
|
||||
}
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
|
||||
pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
||||
}
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
||||
size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
|
||||
|
||||
size_t avail = pool_size - pool_used;
|
||||
|
||||
if (size > avail) {
|
||||
// round up to the next multiple of the granularity
|
||||
size_t reserve_size = GGML_PAD(size - avail, granularity);
|
||||
|
||||
GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
|
||||
|
||||
// allocate more physical memory
|
||||
std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
|
||||
|
||||
// reserve virtual address space (if not already reserved)
|
||||
if (pool_addr == 0) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
|
||||
SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
||||
}
|
||||
|
||||
// map at the end of the pool
|
||||
void * map_ptr = nullptr;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
map_ptr = phys->map(pool_addr + pool_size, reserve_size,
|
||||
sycl::ext::oneapi::experimental::address_access_mode::read_write)));
|
||||
|
||||
// stash these so we could unmap this exact range in dtor
|
||||
mappings.push_back({
|
||||
std::move(*phys),
|
||||
map_ptr,
|
||||
});
|
||||
|
||||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||
device, (unsigned long long) (pool_size/1024/1024),
|
||||
(unsigned long long) (reserve_size/1024/1024));
|
||||
#endif
|
||||
}
|
||||
|
||||
GGML_ASSERT(pool_addr != 0);
|
||||
|
||||
void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
|
||||
*actual_size = size;
|
||||
pool_used += size;
|
||||
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
||||
#endif
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void free(void * ptr, size_t size) override {
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
||||
#endif
|
||||
|
||||
pool_used -= size;
|
||||
|
||||
// all deallocations must be in reverse order of the allocations
|
||||
GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
|
||||
}
|
||||
};
|
||||
#endif // defined(GGML_SYCL_USE_VMM)
|
||||
|
||||
struct ggml_sycl_pool_host : public ggml_sycl_pool {
|
||||
queue_ptr qptr;
|
||||
int device;
|
||||
|
|
@ -1542,20 +1692,19 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
|
|||
}
|
||||
|
||||
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
|
||||
// TBD: NO VMM support
|
||||
// if (ggml_sycl_info().devices[device].vmm) {
|
||||
// return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
|
||||
// }
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
|
||||
}
|
||||
#endif // defined(GGML_SYCL_USE_VMM)
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
}
|
||||
|
||||
|
||||
std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
|
||||
return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
|
||||
}
|
||||
|
||||
// TBD pool with virtual memory management
|
||||
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
|
||||
|
||||
/// kernels
|
||||
typedef void (*ggml_sycl_op_mul_mat_t)(
|
||||
ggml_backend_sycl_context & ctx,
|
||||
|
|
|
|||
Loading…
Reference in New Issue