Remove per-allocation Level Zero runtime checks (llama/23399)
* [SYCL] Centralize Level Zero detection in ggml_sycl_init * use the same wording * get back the warning * [SYCL] Remove per-allocation getenv() for GGML_SYCL_ENABLE_LEVEL_ZERO * bring back the comment * move it up to make sure devices call the shots * move the env detection early * replace g_ggml_sycl_enable_level_zero with a direct call to .ext_oneapi_level_zero * update the comment * switch back to g_ggml_sycl_enable_level_zero with a sentinel * remove the check * Reduce the diff * reword, move lower * move things aroudn * remove forward declaration if favor of a full replace * pre-cache results of zeDeviceGetProperties * put ggml_sycl_get_env back * replace get_sycl_env with ggml_sycl_get_env * add whitespace back * Apply suggestion from @sanmai
This commit is contained in:
parent
1b2d6d2c23
commit
5832e734d4
|
|
@ -59,7 +59,7 @@ bool gpu_has_xmx(sycl::device &dev) {
|
|||
return dev.has(sycl::aspect::ext_intel_matrix);
|
||||
}
|
||||
|
||||
static int ggml_sycl_get_env(const char *env_name, int default_val) {
|
||||
int ggml_sycl_get_env(const char *env_name, int default_val) {
|
||||
char *user_device_string = getenv(env_name);
|
||||
int user_number = default_val;
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
|
|||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
|
||||
return ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1) &&
|
||||
return g_ggml_sycl_enable_level_zero &&
|
||||
q.get_device().is_gpu() &&
|
||||
q.get_backend() == sycl::backend::ext_oneapi_level_zero;
|
||||
}
|
||||
|
|
@ -94,8 +94,6 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
|
|||
|
||||
// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
|
||||
// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
|
||||
// The decision is made from the queue and runtime env because large buffers can be
|
||||
// allocated before ggml_check_sycl() initializes g_ggml_sycl_enable_level_zero.
|
||||
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
if (ggml_sycl_use_level_zero_device_alloc(q)) {
|
||||
|
|
|
|||
|
|
@ -225,6 +225,7 @@ struct sycl_device_info {
|
|||
int max_wg_per_cu; // max work groups per compute unit - refer to
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
bool vmm; // virtual memory support
|
||||
bool l0_discrete_gpu; // Level Zero backend and not an integrated GPU
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
size_t total_vram;
|
||||
sycl_hw_info hw_info;
|
||||
|
|
@ -644,6 +645,8 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
|
|||
|
||||
bool gpu_has_xmx(sycl::device &dev);
|
||||
|
||||
int ggml_sycl_get_env(const char *env_name, int default_val);
|
||||
|
||||
template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return "";
|
||||
|
|
|
|||
|
|
@ -147,11 +147,31 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
|||
GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
|
||||
info.ext_oneapi_level_zero = false;
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) {
|
||||
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device.default_queue().get_device());
|
||||
ze_device_properties_t props = {};
|
||||
props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
|
||||
info.devices[i].l0_discrete_gpu = r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
info.default_tensor_split[id] /= total_vram;
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
// Large buffers can be allocated before ggml_check_sycl() initializes other
|
||||
// g_ggml_sycl_enable_* globals, so initialize this one as early as we can.
|
||||
g_ggml_sycl_enable_level_zero =
|
||||
info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
|
||||
#else
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
#endif
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
|
|
@ -236,38 +256,19 @@ void ggml_backend_sycl_print_sycl_devices() {
|
|||
print_device_opt_feature(device_count);
|
||||
}
|
||||
|
||||
static inline int get_sycl_env(const char *env_name, int default_val) {
|
||||
char *user_device_string = getenv(env_name);
|
||||
int user_number = default_val;
|
||||
|
||||
unsigned n;
|
||||
if (user_device_string != NULL &&
|
||||
sscanf(user_device_string, " %u", &n) == 1) {
|
||||
user_number = (int)n;
|
||||
} else {
|
||||
user_number = default_val;
|
||||
}
|
||||
return user_number;
|
||||
}
|
||||
|
||||
static void ggml_check_sycl() try {
|
||||
static bool initialized = false;
|
||||
|
||||
if (!initialized) {
|
||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
|
||||
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
|
||||
#else
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
#endif
|
||||
g_ggml_sycl_debug = ggml_sycl_get_env("GGML_SYCL_DEBUG", 0);
|
||||
g_ggml_sycl_disable_optimize = ggml_sycl_get_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_graph = ggml_sycl_get_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
g_ggml_sycl_disable_dnn = ggml_sycl_get_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_enable_vmm = ggml_sycl_get_env("GGML_SYCL_ENABLE_VMM", 1);
|
||||
g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
||||
g_ggml_sycl_enable_flash_attention = ggml_sycl_get_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
||||
#else
|
||||
g_ggml_sycl_enable_flash_attention = 0;
|
||||
#endif
|
||||
|
|
@ -330,7 +331,7 @@ static void ggml_check_sycl() try {
|
|||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
|
||||
#endif
|
||||
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
||||
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
g_ggml_sycl_use_async_mem_op_requested = ggml_sycl_get_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
|
|
@ -569,26 +570,18 @@ catch (sycl::exception const &exc) {
|
|||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
|
||||
if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
|
||||
ze_device_properties_t props = {};
|
||||
props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
|
||||
return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
|
||||
static bool ggml_sycl_is_l0_discrete_gpu(int device) {
|
||||
return ggml_sycl_info().devices[device].l0_discrete_gpu;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
||||
static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
|
||||
const void *ptr_src, size_t size) {
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
|
||||
const bool l0_copy_supported =
|
||||
ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
|
||||
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
|
||||
const bool l0_copy_supported = g_ggml_sycl_enable_level_zero &&
|
||||
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
|
||||
if (l0_copy_supported) {
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
|
||||
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
|
||||
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
|
||||
|
|
@ -651,7 +644,7 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
|||
size_t size = ggml_nbytes(src);
|
||||
|
||||
//todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs.
|
||||
dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size);
|
||||
dev2dev_memcpy(dst_ctx->device, *stream_dst, src_ctx->device, *stream_src, dst->data, src->data, size);
|
||||
|
||||
//todo, it's known issue:error in device2device cross GPUs. reused when the issue is fixed. DON"T remove
|
||||
#if 0
|
||||
|
|
@ -3056,7 +3049,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
src1_ddf_i_source += (i0 * ne11 + src1_col_0) * ne10;
|
||||
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source,
|
||||
CHECK_TRY_ERROR(dev2dev_memcpy(i, *stream, ctx.device, *main_stream, src1_ddf_i, src1_ddf_i_source,
|
||||
src1_ncols * ne10 * sizeof(float))));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue