add dev2dev memcpy by SYCL API (llama/24476)
* add dev2dev memcpy by SYCL API * mv GGML_SYCL_DEV2DEV_MEMCPY to runntime table * update the detect method for p2p comm * fix the erro created during fix confilct --------- Co-authored-by: Neo Zhang <NA>
This commit is contained in:
parent
01b1c3ada7
commit
6772827b26
|
|
@ -62,6 +62,7 @@ extern int g_ggml_sycl_debug;
|
|||
extern int g_ggml_sycl_disable_optimize;
|
||||
extern int g_ggml_sycl_prioritize_dmmv;
|
||||
extern int g_ggml_sycl_enable_flash_attention;
|
||||
extern int g_ggml_sycl_dev2dev_memcpy;
|
||||
|
||||
|
||||
#if defined(__clang__) && __has_builtin(__builtin_expect)
|
||||
|
|
@ -126,6 +127,11 @@ enum ggml_sycl_backend_gpu_mode {
|
|||
SYCL_MUL_GPU_MODE
|
||||
};
|
||||
|
||||
enum ggml_sycl_dev2dev_memcpy_mode {
|
||||
DEV2DEV_MEMCPY_SYCL = 0,
|
||||
DEV2DEV_MEMCPY_L0 = 1,
|
||||
};
|
||||
|
||||
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
static void crash() {
|
||||
|
|
|
|||
|
|
@ -13,14 +13,14 @@
|
|||
#ifndef GGML_SYCL_DPCT_HELPER_HPP
|
||||
#define GGML_SYCL_DPCT_HELPER_HPP
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/half_type.hpp>
|
||||
#include <oneapi/mkl.hpp>
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/mman.h>
|
||||
#elif defined(_WIN64)
|
||||
|
|
@ -43,6 +43,7 @@
|
|||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define DPCT_COMPATIBILITY_TEMP (900)
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
|
@ -59,6 +60,13 @@
|
|||
#define __dpct_noinline__ __attribute__((noinline))
|
||||
#endif
|
||||
|
||||
#define DPCT_UNUSED(x) (void)(x)
|
||||
|
||||
inline void _abort(const char * str) {
|
||||
std::cerr << str << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
|
||||
inline std::string get_device_type_name(const sycl::device &Device) {
|
||||
auto DeviceType = Device.get_info<sycl::info::device::device_type>();
|
||||
switch (DeviceType) {
|
||||
|
|
@ -1017,7 +1025,7 @@ namespace dpct
|
|||
if (backend == "opencl:cpu") return 4;
|
||||
if (backend == "opencl:acc") return 5;
|
||||
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
|
||||
GGML_ABORT("fatal error");
|
||||
_abort("fatal error");
|
||||
}
|
||||
static bool compare_backend(std::string &backend1, std::string &backend2) {
|
||||
return convert_backend_index(backend1) < convert_backend_index(backend2);
|
||||
|
|
@ -1426,7 +1434,7 @@ namespace dpct
|
|||
if (!size)
|
||||
return sycl::event{};
|
||||
return q.memcpy(to_ptr, from_ptr, size, dep_events);
|
||||
GGML_UNUSED(direction);
|
||||
DPCT_UNUSED(direction);
|
||||
}
|
||||
|
||||
// Get actual copy range and make sure it will not exceed range.
|
||||
|
|
@ -2092,7 +2100,7 @@ namespace dpct
|
|||
if (!size)
|
||||
return sycl::event{};
|
||||
return q.memcpy(to_ptr, from_ptr, size, dep_events);
|
||||
GGML_UNUSED(direction);
|
||||
DPCT_UNUSED(direction);
|
||||
}
|
||||
|
||||
// Get actual copy range and make sure it will not exceed range.
|
||||
|
|
|
|||
|
|
@ -86,6 +86,7 @@ int g_ggml_sycl_use_async_mem_op = 0;
|
|||
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
||||
int g_ggml_sycl_enable_level_zero = 0;
|
||||
int g_ggml_sycl_enable_flash_attention = 1;
|
||||
int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
|
||||
int g_ggml_sycl_usm_system = 0;
|
||||
|
||||
static ggml_sycl_device_info ggml_sycl_init() {
|
||||
|
|
@ -272,6 +273,11 @@ static void ggml_check_sycl() try {
|
|||
g_ggml_sycl_enable_vmm = ggml_sycl_get_env("GGML_SYCL_ENABLE_VMM", 1);
|
||||
g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
|
||||
g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL);
|
||||
if (g_ggml_sycl_enable_level_zero == 0) {
|
||||
g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
|
||||
}
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
g_ggml_sycl_enable_flash_attention = ggml_sycl_get_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
||||
#else
|
||||
|
|
@ -324,8 +330,11 @@ static void ggml_check_sycl() try {
|
|||
#endif
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n",
|
||||
g_ggml_sycl_dev2dev_memcpy);
|
||||
#endif
|
||||
#if GGML_SYCL_DNNL
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
|
||||
|
|
@ -598,27 +607,42 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) {
|
|||
|
||||
static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
|
||||
const void *ptr_src, size_t size) {
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
|
||||
const bool l0_copy_supported = g_ggml_sycl_enable_level_zero &&
|
||||
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
|
||||
if (l0_copy_supported) {
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
|
||||
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
|
||||
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
|
||||
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
|
||||
ze_command_list_handle_t cl;
|
||||
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
|
||||
if (r == ZE_RESULT_SUCCESS) {
|
||||
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
|
||||
zeCommandListDestroy(cl);
|
||||
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) {
|
||||
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
|
||||
const bool l0_copy_supported =
|
||||
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
|
||||
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
|
||||
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
|
||||
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
|
||||
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
|
||||
ze_command_list_handle_t cl;
|
||||
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
|
||||
if (r == ZE_RESULT_SUCCESS) {
|
||||
return;
|
||||
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by L0\n");
|
||||
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
|
||||
zeCommandListDestroy(cl);
|
||||
if (r == ZE_RESULT_SUCCESS) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_SYCL) {
|
||||
if (q_dst.get_device().ext_oneapi_can_access_peer(q_src.get_device(),
|
||||
sycl::ext::oneapi::peer_access::access_supported)) {
|
||||
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by SYCL\n");
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(q_dst.memcpy(ptr_dst, ptr_src, size).wait()));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Host-staged copy
|
||||
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by host forward\n");
|
||||
char *host_buf = (char *)malloc(size);
|
||||
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
|
||||
q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
|
||||
|
|
|
|||
Loading…
Reference in New Issue