opencl: add opt-in Adreno xmem F16xF32 GEMM for prefill (llama/22755)
* ggml-opencl: add Adreno xmem F16xF32 GEMM for prefill * ggml-opencl: address Adreno xmem review comments * ggml-opencl: align xmem gemm kernel naming --------- Co-authored-by: Your Name <your@email.com>
This commit is contained in:
parent
e8a7cd314f
commit
1caed1d2ba
|
|
@ -176,6 +176,10 @@ set(GGML_OPENCL_KERNELS
|
|||
flash_attn_f32
|
||||
)
|
||||
|
||||
if (GGML_OPENCL_USE_ADRENO_KERNELS)
|
||||
list(APPEND GGML_OPENCL_KERNELS gemm_xmem_f16_f32_os8)
|
||||
endif ()
|
||||
|
||||
foreach (K ${GGML_OPENCL_KERNELS})
|
||||
ggml_opencl_add_kernel(${K})
|
||||
endforeach()
|
||||
|
|
|
|||
|
|
@ -407,6 +407,8 @@ struct ggml_backend_opencl_context {
|
|||
|
||||
cl_bool non_uniform_workgroups;
|
||||
size_t image_max_buffer_size;
|
||||
size_t image2d_max_width;
|
||||
size_t image2d_max_height;
|
||||
|
||||
cl_context context;
|
||||
cl_command_queue queue;
|
||||
|
|
@ -420,6 +422,11 @@ struct ggml_backend_opencl_context {
|
|||
ggml_cl_buffer prealloc_src0;
|
||||
ggml_cl_buffer prealloc_src1;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
ggml_cl_buffer prealloc_adreno_xmem_const;
|
||||
bool adreno_xmem_gemm_enabled = false;
|
||||
#endif
|
||||
|
||||
// prealloc buffers for MoE router table preprocess
|
||||
bool toggle_reorder = false;
|
||||
ggml_cl_buffer prealloc_post_router;
|
||||
|
|
@ -538,6 +545,10 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_mul_mat_f16_f32;
|
||||
cl_kernel kernel_mul_mat_f16_f32_l4;
|
||||
cl_kernel kernel_mul_mat_f16_f32_tiled;
|
||||
cl_kernel kernel_adreno_xmem_pack_src_f32;
|
||||
cl_kernel kernel_adreno_xmem_prepack_weight_f16;
|
||||
cl_kernel kernel_gemm_xmem_f16_f32_os8;
|
||||
cl_kernel kernel_adreno_xmem_store_dst_f32;
|
||||
cl_kernel kernel_mul_mm_f16_f32_kqv;
|
||||
cl_kernel kernel_mul_mm_f16_f32_kq;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
||||
|
|
@ -1554,6 +1565,32 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// gemm_xmem_f16_f32_os8
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_xmem_f16_f32_os8.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_xmem_f16_f32_os8.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_adreno_xmem_pack_src_f32 =
|
||||
clCreateKernel(prog, "adreno_xmem_pack_src_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_adreno_xmem_prepack_weight_f16 =
|
||||
clCreateKernel(prog, "adreno_xmem_prepack_weight_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gemm_xmem_f16_f32_os8 =
|
||||
clCreateKernel(prog, "kernel_gemm_xmem_f16_f32_os8", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_adreno_xmem_store_dst_f32 =
|
||||
clCreateKernel(prog, "adreno_xmem_store_dst_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
// mul_mm_f32_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
|
|
@ -3473,6 +3510,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
|
||||
GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL);
|
||||
GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", backend_ctx->image2d_max_width, backend_ctx->image2d_max_height);
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
|
||||
GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
|
||||
|
||||
|
|
@ -3511,6 +3552,16 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
backend_ctx->adreno_xmem_gemm_enabled = getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr &&
|
||||
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
|
||||
if (getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr) {
|
||||
GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM %s\n",
|
||||
backend_ctx->adreno_xmem_gemm_enabled ?
|
||||
"enabled (temporary weight prepack)" : "requested but unsupported by this driver");
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
// determine whether to use large buffer for Adreno
|
||||
backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
|
||||
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
|
||||
|
|
@ -9920,6 +9971,169 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
|
|||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
static bool ggml_cl_can_use_adreno_xmem_gemm_f16_f32(
|
||||
const ggml_backend_opencl_context * backend_ctx,
|
||||
const ggml_tensor * src0,
|
||||
const ggml_tensor * src1,
|
||||
const ggml_tensor * dst) {
|
||||
if (!backend_ctx->adreno_xmem_gemm_enabled) {
|
||||
return false;
|
||||
}
|
||||
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
|
||||
return false;
|
||||
}
|
||||
if (src0->type != GGML_TYPE_F16 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
if (src0->ne[2] != 1 || src0->ne[3] != 1 ||
|
||||
src1->ne[2] != 1 || src1->ne[3] != 1 ||
|
||||
dst->ne[2] != 1 || dst->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
const int K = src0->ne[0];
|
||||
const int M = src0->ne[1];
|
||||
const int N = src1->ne[1];
|
||||
if (src1->ne[0] != K || dst->ne[0] != M || dst->ne[1] != N) {
|
||||
return false;
|
||||
}
|
||||
if (N <= 1 || M < 64 || N < 16 || K < 64) {
|
||||
return false;
|
||||
}
|
||||
if ((K % 8) != 0) {
|
||||
return false;
|
||||
}
|
||||
const int kpack = K / 4;
|
||||
const int npack = CEIL_DIV(M, 4);
|
||||
if (static_cast<size_t>(N) > backend_ctx->image2d_max_width ||
|
||||
static_cast<size_t>(kpack) > backend_ctx->image2d_max_height) {
|
||||
return false;
|
||||
}
|
||||
if (static_cast<size_t>(N) > backend_ctx->image2d_max_width ||
|
||||
static_cast<size_t>(npack) > backend_ctx->image2d_max_height) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f16_f32_adreno_xmem(
|
||||
ggml_backend_t backend,
|
||||
const ggml_tensor * src0,
|
||||
const ggml_tensor * src1,
|
||||
ggml_tensor * dst) {
|
||||
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
const cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
const cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
const cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int K = src0->ne[0];
|
||||
const int M = src0->ne[1];
|
||||
const int N = src1->ne[1];
|
||||
const int kpack = K / 4;
|
||||
const int npack = CEIL_DIV(M, 4);
|
||||
const int os = 8;
|
||||
|
||||
const size_t xmem_bytes = 6144;
|
||||
const size_t weight_bytes = static_cast<size_t>(kpack) * static_cast<size_t>(npack) * 4u * sizeof(cl_half4);
|
||||
|
||||
backend_ctx->prealloc_adreno_xmem_const.allocate(backend_ctx->context, xmem_bytes);
|
||||
|
||||
cl_int err = CL_SUCCESS;
|
||||
cl_image_format fmt = {};
|
||||
fmt.image_channel_order = CL_RGBA;
|
||||
fmt.image_channel_data_type = CL_HALF_FLOAT;
|
||||
|
||||
cl_image_desc desc_src = {};
|
||||
desc_src.image_type = CL_MEM_OBJECT_IMAGE2D;
|
||||
desc_src.image_width = static_cast<size_t>(N);
|
||||
desc_src.image_height = static_cast<size_t>(kpack);
|
||||
cl_mem src_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_src, nullptr, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_image_desc desc_dst = {};
|
||||
desc_dst.image_type = CL_MEM_OBJECT_IMAGE2D;
|
||||
desc_dst.image_width = static_cast<size_t>(N);
|
||||
desc_dst.image_height = static_cast<size_t>(npack);
|
||||
cl_mem dst_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_dst, nullptr, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem weights = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, weight_bytes, nullptr, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_kernel prepack = backend_ctx->kernel_adreno_xmem_prepack_weight_f16;
|
||||
CL_CHECK(clSetKernelArg(prepack, 0, sizeof(cl_mem), &weights));
|
||||
CL_CHECK(clSetKernelArg(prepack, 1, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(prepack, 2, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(prepack, 3, sizeof(int), &K));
|
||||
CL_CHECK(clSetKernelArg(prepack, 4, sizeof(int), &M));
|
||||
CL_CHECK(clSetKernelArg(prepack, 5, sizeof(int), &kpack));
|
||||
CL_CHECK(clSetKernelArg(prepack, 6, sizeof(int), &npack));
|
||||
CL_CHECK(clSetKernelArg(prepack, 7, sizeof(int), &os));
|
||||
size_t lws = 256;
|
||||
size_t max_wg = backend_ctx->get_kernel_workgroup_size(prepack);
|
||||
if (lws > max_wg) {
|
||||
lws = max_wg;
|
||||
}
|
||||
size_t gws = CEIL_DIV(static_cast<size_t>(kpack) * static_cast<size_t>(npack), lws) * lws;
|
||||
backend_ctx->enqueue_ndrange_kernel(prepack, 1, &gws, &lws, dst);
|
||||
|
||||
cl_kernel pack_src = backend_ctx->kernel_adreno_xmem_pack_src_f32;
|
||||
CL_CHECK(clSetKernelArg(pack_src, 0, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(pack_src, 1, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(pack_src, 2, sizeof(cl_mem), &src_img));
|
||||
CL_CHECK(clSetKernelArg(pack_src, 3, sizeof(int), &K));
|
||||
CL_CHECK(clSetKernelArg(pack_src, 4, sizeof(int), &N));
|
||||
size_t pack_src_lws[2] = { 16, 16 };
|
||||
size_t pack_src_gws[2] = {
|
||||
CEIL_DIV(static_cast<size_t>(N), pack_src_lws[0])*pack_src_lws[0],
|
||||
CEIL_DIV(static_cast<size_t>(kpack), pack_src_lws[1])*pack_src_lws[1]
|
||||
};
|
||||
backend_ctx->enqueue_ndrange_kernel(pack_src, 2, pack_src_gws, pack_src_lws, dst);
|
||||
|
||||
cl_kernel gemm = backend_ctx->kernel_gemm_xmem_f16_f32_os8;
|
||||
CL_CHECK(clSetKernelArg(gemm, 0, sizeof(cl_mem), &weights));
|
||||
CL_CHECK(clSetKernelArg(gemm, 1, sizeof(cl_mem), &backend_ctx->prealloc_adreno_xmem_const.buffer));
|
||||
CL_CHECK(clSetKernelArg(gemm, 2, sizeof(cl_mem), &src_img));
|
||||
CL_CHECK(clSetKernelArg(gemm, 3, sizeof(cl_mem), &dst_img));
|
||||
CL_CHECK(clSetKernelArg(gemm, 4, sizeof(int), &N));
|
||||
CL_CHECK(clSetKernelArg(gemm, 5, sizeof(int), &npack));
|
||||
CL_CHECK(clSetKernelArg(gemm, 6, sizeof(int), &kpack));
|
||||
const size_t z_values = CEIL_DIV(static_cast<size_t>(npack), static_cast<size_t>(os));
|
||||
size_t gemm_lws[3] = { 64, 1, 1 };
|
||||
size_t gemm_gws[3] = {
|
||||
z_values*gemm_lws[0],
|
||||
CEIL_DIV(static_cast<size_t>(N), gemm_lws[0]),
|
||||
1
|
||||
};
|
||||
backend_ctx->enqueue_ndrange_kernel(gemm, 3, gemm_gws, gemm_lws, dst);
|
||||
|
||||
cl_kernel store_dst = backend_ctx->kernel_adreno_xmem_store_dst_f32;
|
||||
CL_CHECK(clSetKernelArg(store_dst, 0, sizeof(cl_mem), &dst_img));
|
||||
CL_CHECK(clSetKernelArg(store_dst, 1, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(store_dst, 2, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(store_dst, 3, sizeof(int), &M));
|
||||
CL_CHECK(clSetKernelArg(store_dst, 4, sizeof(int), &N));
|
||||
size_t store_lws[2] = { 16, 16 };
|
||||
size_t store_gws[2] = {
|
||||
CEIL_DIV(static_cast<size_t>(N), store_lws[0])*store_lws[0],
|
||||
CEIL_DIV(static_cast<size_t>(npack), store_lws[1])*store_lws[1]
|
||||
};
|
||||
backend_ctx->enqueue_ndrange_kernel(store_dst, 2, store_gws, store_lws, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(weights));
|
||||
CL_CHECK(clReleaseMemObject(dst_img));
|
||||
CL_CHECK(clReleaseMemObject(src_img));
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
|
@ -11681,6 +11895,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
return;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (ggml_cl_can_use_adreno_xmem_gemm_f16_f32(backend_ctx, src0, src1, dst)) {
|
||||
ggml_cl_mul_mat_f16_f32_adreno_xmem(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
|
||||
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,233 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load : enable
|
||||
|
||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void adreno_xmem_pack_src_f32(
|
||||
__global const void * src_void,
|
||||
ulong offset,
|
||||
__write_only image2d_t src_img,
|
||||
int K,
|
||||
int N) {
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
const int kpack = K / 4;
|
||||
|
||||
if (x >= N || y >= kpack) {
|
||||
return;
|
||||
}
|
||||
|
||||
__global const float * src = (__global const float *)((__global const char *)src_void + offset);
|
||||
const int base = x*K + y*4;
|
||||
const half4 v = (half4)((half)src[base + 0], (half)src[base + 1], (half)src[base + 2], (half)src[base + 3]);
|
||||
write_imageh(src_img, (int2)(x, y), v);
|
||||
}
|
||||
|
||||
__kernel void adreno_xmem_prepack_weight_f16(
|
||||
__global half4 * dst,
|
||||
__global const void * src_void,
|
||||
ulong offset,
|
||||
int K,
|
||||
int M,
|
||||
int kpack,
|
||||
int npack,
|
||||
int os) {
|
||||
const int linear = get_global_id(0);
|
||||
const int total = kpack*npack;
|
||||
if (linear >= total) {
|
||||
return;
|
||||
}
|
||||
|
||||
__global const half * src = (__global const half *)((__global const char *)src_void + offset);
|
||||
|
||||
const int dst_ogroup = linear % os;
|
||||
const int dst_o_sp_i = linear / os;
|
||||
const int dst_i = dst_o_sp_i % kpack;
|
||||
const int dst_o = dst_o_sp_i / kpack;
|
||||
const int o_slice = dst_o*os + dst_ogroup;
|
||||
const int k_base = dst_i*4;
|
||||
|
||||
half4 w0 = (half4)(0.0h);
|
||||
half4 w1 = (half4)(0.0h);
|
||||
half4 w2 = (half4)(0.0h);
|
||||
half4 w3 = (half4)(0.0h);
|
||||
|
||||
const int o0 = o_slice*4 + 0;
|
||||
const int o1 = o_slice*4 + 1;
|
||||
const int o2 = o_slice*4 + 2;
|
||||
const int o3 = o_slice*4 + 3;
|
||||
|
||||
if (k_base + 0 < K) {
|
||||
if (o0 < M) w0.s0 = src[o0*K + k_base + 0];
|
||||
if (o1 < M) w0.s1 = src[o1*K + k_base + 0];
|
||||
if (o2 < M) w0.s2 = src[o2*K + k_base + 0];
|
||||
if (o3 < M) w0.s3 = src[o3*K + k_base + 0];
|
||||
}
|
||||
if (k_base + 1 < K) {
|
||||
if (o0 < M) w1.s0 = src[o0*K + k_base + 1];
|
||||
if (o1 < M) w1.s1 = src[o1*K + k_base + 1];
|
||||
if (o2 < M) w1.s2 = src[o2*K + k_base + 1];
|
||||
if (o3 < M) w1.s3 = src[o3*K + k_base + 1];
|
||||
}
|
||||
if (k_base + 2 < K) {
|
||||
if (o0 < M) w2.s0 = src[o0*K + k_base + 2];
|
||||
if (o1 < M) w2.s1 = src[o1*K + k_base + 2];
|
||||
if (o2 < M) w2.s2 = src[o2*K + k_base + 2];
|
||||
if (o3 < M) w2.s3 = src[o3*K + k_base + 2];
|
||||
}
|
||||
if (k_base + 3 < K) {
|
||||
if (o0 < M) w3.s0 = src[o0*K + k_base + 3];
|
||||
if (o1 < M) w3.s1 = src[o1*K + k_base + 3];
|
||||
if (o2 < M) w3.s2 = src[o2*K + k_base + 3];
|
||||
if (o3 < M) w3.s3 = src[o3*K + k_base + 3];
|
||||
}
|
||||
|
||||
dst[linear*4 + 0] = w0;
|
||||
dst[linear*4 + 1] = w1;
|
||||
dst[linear*4 + 2] = w2;
|
||||
dst[linear*4 + 3] = w3;
|
||||
}
|
||||
|
||||
__attribute__((qcom_max_concurrent_subgroups(12)))
|
||||
__kernel void kernel_gemm_xmem_f16_f32_os8(
|
||||
__constant half8 * weights_buffer __attribute__((sub_group_uniform)),
|
||||
__constant half8 * xmem_buffer __attribute__((max_constant_size((6144)))),
|
||||
__read_only image2d_t src_img,
|
||||
__write_only image2d_t dst_img,
|
||||
int N,
|
||||
int npack,
|
||||
int kpack) {
|
||||
const int X = get_group_id(1)*get_local_size(0) + get_local_id(0);
|
||||
const int Z = get_group_id(0)*get_local_size(2) + get_local_id(2);
|
||||
|
||||
if (X >= N || Z*8 >= npack) {
|
||||
return;
|
||||
}
|
||||
|
||||
half4 r0 = (half4)(0.0h);
|
||||
half4 r1 = (half4)(0.0h);
|
||||
half4 r2 = (half4)(0.0h);
|
||||
half4 r3 = (half4)(0.0h);
|
||||
half4 r4 = (half4)(0.0h);
|
||||
half4 r5 = (half4)(0.0h);
|
||||
half4 r6 = (half4)(0.0h);
|
||||
half4 r7 = (half4)(0.0h);
|
||||
|
||||
int f_offset = Z*kpack*32;
|
||||
int subgroup_id = (int)(0x1F & qcom_get_physical_sub_group_id());
|
||||
subgroup_id = subgroup_id % 12;
|
||||
const int c_offset = subgroup_id*32;
|
||||
__constant half16 * weights_cache = (__constant half16 *)&xmem_buffer[c_offset];
|
||||
|
||||
int coord_s = 0;
|
||||
do {
|
||||
const half4 src0 = read_imageh(src_img, smp_zero, (int2)(X, coord_s));
|
||||
coord_s++;
|
||||
const half4 src1 = read_imageh(src_img, smp_zero, (int2)(X, coord_s));
|
||||
coord_s++;
|
||||
|
||||
qcom_sub_group_constant_load8(xmem_buffer, weights_buffer, c_offset, f_offset >> 1, 32);
|
||||
f_offset += 64;
|
||||
qcom_sub_group_sync(QCOM_CLK_CONST_LOAD_SYNC);
|
||||
|
||||
r0 += src0.x * weights_cache[0].s0123;
|
||||
r0 += src0.y * weights_cache[0].s4567;
|
||||
r0 += src0.z * weights_cache[0].s89ab;
|
||||
r0 += src0.w * weights_cache[0].scdef;
|
||||
r1 += src0.x * weights_cache[1].s0123;
|
||||
r1 += src0.y * weights_cache[1].s4567;
|
||||
r1 += src0.z * weights_cache[1].s89ab;
|
||||
r1 += src0.w * weights_cache[1].scdef;
|
||||
r2 += src0.x * weights_cache[2].s0123;
|
||||
r2 += src0.y * weights_cache[2].s4567;
|
||||
r2 += src0.z * weights_cache[2].s89ab;
|
||||
r2 += src0.w * weights_cache[2].scdef;
|
||||
r3 += src0.x * weights_cache[3].s0123;
|
||||
r3 += src0.y * weights_cache[3].s4567;
|
||||
r3 += src0.z * weights_cache[3].s89ab;
|
||||
r3 += src0.w * weights_cache[3].scdef;
|
||||
r4 += src0.x * weights_cache[4].s0123;
|
||||
r4 += src0.y * weights_cache[4].s4567;
|
||||
r4 += src0.z * weights_cache[4].s89ab;
|
||||
r4 += src0.w * weights_cache[4].scdef;
|
||||
r5 += src0.x * weights_cache[5].s0123;
|
||||
r5 += src0.y * weights_cache[5].s4567;
|
||||
r5 += src0.z * weights_cache[5].s89ab;
|
||||
r5 += src0.w * weights_cache[5].scdef;
|
||||
r6 += src0.x * weights_cache[6].s0123;
|
||||
r6 += src0.y * weights_cache[6].s4567;
|
||||
r6 += src0.z * weights_cache[6].s89ab;
|
||||
r6 += src0.w * weights_cache[6].scdef;
|
||||
r7 += src0.x * weights_cache[7].s0123;
|
||||
r7 += src0.y * weights_cache[7].s4567;
|
||||
r7 += src0.z * weights_cache[7].s89ab;
|
||||
r7 += src0.w * weights_cache[7].scdef;
|
||||
|
||||
r0 += src1.x * weights_cache[8].s0123;
|
||||
r0 += src1.y * weights_cache[8].s4567;
|
||||
r0 += src1.z * weights_cache[8].s89ab;
|
||||
r0 += src1.w * weights_cache[8].scdef;
|
||||
r1 += src1.x * weights_cache[9].s0123;
|
||||
r1 += src1.y * weights_cache[9].s4567;
|
||||
r1 += src1.z * weights_cache[9].s89ab;
|
||||
r1 += src1.w * weights_cache[9].scdef;
|
||||
r2 += src1.x * weights_cache[10].s0123;
|
||||
r2 += src1.y * weights_cache[10].s4567;
|
||||
r2 += src1.z * weights_cache[10].s89ab;
|
||||
r2 += src1.w * weights_cache[10].scdef;
|
||||
r3 += src1.x * weights_cache[11].s0123;
|
||||
r3 += src1.y * weights_cache[11].s4567;
|
||||
r3 += src1.z * weights_cache[11].s89ab;
|
||||
r3 += src1.w * weights_cache[11].scdef;
|
||||
r4 += src1.x * weights_cache[12].s0123;
|
||||
r4 += src1.y * weights_cache[12].s4567;
|
||||
r4 += src1.z * weights_cache[12].s89ab;
|
||||
r4 += src1.w * weights_cache[12].scdef;
|
||||
r5 += src1.x * weights_cache[13].s0123;
|
||||
r5 += src1.y * weights_cache[13].s4567;
|
||||
r5 += src1.z * weights_cache[13].s89ab;
|
||||
r5 += src1.w * weights_cache[13].scdef;
|
||||
r6 += src1.x * weights_cache[14].s0123;
|
||||
r6 += src1.y * weights_cache[14].s4567;
|
||||
r6 += src1.z * weights_cache[14].s89ab;
|
||||
r6 += src1.w * weights_cache[14].scdef;
|
||||
r7 += src1.x * weights_cache[15].s0123;
|
||||
r7 += src1.y * weights_cache[15].s4567;
|
||||
r7 += src1.z * weights_cache[15].s89ab;
|
||||
r7 += src1.w * weights_cache[15].scdef;
|
||||
} while (coord_s < kpack);
|
||||
|
||||
int coord_s_out = Z*8;
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r0); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r1); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r2); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r3); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r4); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r5); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r6); coord_s_out++; }
|
||||
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r7); }
|
||||
}
|
||||
|
||||
__kernel void adreno_xmem_store_dst_f32(
|
||||
__read_only image2d_t dst_img,
|
||||
__global void * dst_void,
|
||||
ulong offset,
|
||||
int M,
|
||||
int N) {
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
const int npack = (M + 3) / 4;
|
||||
|
||||
if (x >= N || y >= npack) {
|
||||
return;
|
||||
}
|
||||
|
||||
__global float * dst = (__global float *)((__global char *)dst_void + offset);
|
||||
const half4 hv = read_imageh(dst_img, smp_zero, (int2)(x, y));
|
||||
const int m = y*4;
|
||||
if (m + 0 < M) dst[x*M + m + 0] = (float)hv.s0;
|
||||
if (m + 1 < M) dst[x*M + m + 1] = (float)hv.s1;
|
||||
if (m + 2 < M) dst[x*M + m + 2] = (float)hv.s2;
|
||||
if (m + 3 < M) dst[x*M + m + 3] = (float)hv.s3;
|
||||
}
|
||||
Loading…
Reference in New Issue