opencl: optimize mul_mat_f16_f32_l4 for decode (llama/24504)

This commit is contained in:
lhez 2026-06-16 23:21:26 -07:00 committed by Georgi Gerganov
parent 5fa14e9931
commit 9f785839a3
2 changed files with 348 additions and 6 deletions

View File

@ -564,6 +564,9 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_f16_f32_1row;
cl_kernel kernel_mul_mat_f16_f32;
cl_kernel kernel_mul_mat_f16_f32_l4;
cl_kernel kernel_mul_mat_f16_f32_l4_dr;
cl_kernel kernel_mul_mat_f16_f32_l4_dr_ls;
cl_kernel kernel_mul_mat_f16_f32_l4_dr_lq;
cl_kernel kernel_mul_mat_f16_f32_tiled;
cl_kernel kernel_adreno_xmem_pack_src_f32;
cl_kernel kernel_adreno_xmem_prepack_weight_f16;
@ -1787,6 +1790,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr", &err), err));
if (backend_ctx->gpu_family == ADRENO) {
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_ls", &err), err));
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_lq", &err), err));
}
GGML_LOG_CONT(".");
}
@ -14570,11 +14578,31 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}
if (src1t == GGML_TYPE_F32) {
// heuristic for packing more work for Adreno
const bool adreno_use_lane_split =
backend_ctx->gpu_family == ADRENO &&
ne11 == 1 &&
ne01 >= 8 &&
ne00 % 4 == 0 &&
r3 == 1 && r2 >= 1 && r2 <= 8 &&
(ne12 % r2) == 0;
if (ne11 * ne12 < 4) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
} else if (adreno_use_lane_split && ne00 >= 64 && ne00 <= 128) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq;
nrows = 1;
} else if (adreno_use_lane_split && r2 >= 2 && ne00 > 128 && ne00 <= 256) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls;
nrows = 1;
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
nrows = ne11;
if (ne11 == 1) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr;
nrows = 1; // not used by this kernel
} else {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
nrows = ne11;
}
} else {
kernel = backend_ctx->kernel_mul_mat_f16_f32;
nrows = 4;
@ -15353,12 +15381,30 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
} else {
int64_t ny = (ne11 + nrows - 1)/nrows;
if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr) {
const int NDST_DR = 4;
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, NDST_DR)*nth0, (size_t)nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
} else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls) {
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 2)*nth0, (size_t)nth1, (size_t)ne02*ne03};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
} else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq) {
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 4)*nth0, (size_t)nth1, (size_t)ne02*ne03};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
} else {
int64_t ny = (ne11 + nrows - 1)/nrows;
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
}
}

View File

@ -82,3 +82,299 @@ kernel void kernel_mul_mat_f16_f32_l4(
}
}
}
// Decode variant of kernel_mul_mat_f16_f32_l4: f16 src0 times f32 src1,
// assuming ne11 == 1 (src1 is addressed without any i11 term).
//
// Work split:
//   - get_group_id(0) selects MUL_MAT_F16_F32_L4_DR_NDST consecutive src0 rows;
//   - get_group_id(2) is the flattened (i12, i13) batch index of src1/dst;
//   - the lanes of a sub-group stride over the ne00/4 four-element chunks of
//     the rows and the per-lane sums are combined with sub_group_reduce_add,
//     so each sub-group produces MUL_MAT_F16_F32_L4_DR_NDST outputs.
// src0 is broadcast over the batch dims through r2/r3 (i12/r2, i13/r3).
// Only ne00/4 whole chunks are processed, so ne00 is expected to be a
// multiple of 4. ne02, nb00, ne10, ne11, nb10 and nb11 are accepted for
// signature compatibility but are not read.
#define MUL_MAT_F16_F32_L4_DR_NDST 4
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f32_l4_dr(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src0 = (global char*)((global char*)src0 + offset0);
    src1 = (global char*)((global char*)src1 + offset1);
    dst  = (global float*)((global char*)dst + offsetd);

    const int r0_base = get_group_id(0) * MUL_MAT_F16_F32_L4_DR_NDST;

    const int im  = get_group_id(2);
    const int i12 = im % ne12;
    const int i13 = im / ne12;

    // assume ne11 == 1
    const ulong offset_src1 = i12*nb12 + i13*nb13;
    global float4 * y4 = (global float4 *)(src1 + offset_src1);

    global half4 * x4[MUL_MAT_F16_F32_L4_DR_NDST];
    float sumf[MUL_MAT_F16_F32_L4_DR_NDST];

    // Byte offset of the (broadcast) src0 matrix used by this batch entry.
    const ulong k_head_off = (i12/r2)*nb02 + (i13/r3)*nb03;

    #pragma unroll
    for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
        int r0  = r0_base + n;
        // Rows past ne01 read row 0 instead, so every lane stays in the
        // reduction; their result is discarded at the store below.
        int r0c = r0 < ne01 ? r0 : 0;
        ulong off = (ulong)r0c*nb01 + k_head_off;
        x4[n]   = (global half4 *)(src0 + off);
        sumf[n] = 0.0f;
    }

    const int n_chunks = ne00 / 4;
    // Stride by the number of lanes actually present in this sub-group.
    // Striding by get_max_sub_group_size() would skip chunks whenever the
    // sub-group is partial (work-group size not a multiple of the sub-group
    // size); for full sub-groups both values are identical.
    const int sg_size = get_sub_group_size();
    const int lid     = get_sub_group_local_id();

    for (int i = lid; i < n_chunks; i += sg_size) {
        // One src1 chunk is loaded once and reused for all NDST rows.
        float4 q = y4[i];
        #pragma unroll
        for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
            float4 k = convert_float4(x4[n][i]);
            sumf[n] = mad(k.s0, q.s0, sumf[n]);
            sumf[n] = mad(k.s1, q.s1, sumf[n]);
            sumf[n] = mad(k.s2, q.s2, sumf[n]);
            sumf[n] = mad(k.s3, q.s3, sumf[n]);
        }
    }

    #pragma unroll
    for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
        // The reduction runs unconditionally so that all lanes take part.
        float reduced = sub_group_reduce_add(sumf[n]);
        int r0 = r0_base + n;
        if (lid == 0 && r0 < ne01) {
            // Index in 64 bits: im*ne1*ne0 can exceed the int range.
            dst[(ulong)im*ne1*ne0 + r0] = reduced;
        }
    }
}
// Kernels for decoding, Adreno only for now
// (kernel_mul_mat_f16_f32_l4_dr_ls and kernel_mul_mat_f16_f32_l4_dr_lq).
// Compile-time upper bound on r2 for those kernels: it sizes their per-lane
// src1 pointer and accumulator arrays and bounds their unrolled loops; slots
// with q >= r2 are skipped at run time.
#define MUL_MAT_F16_F32_L4_DR_LS_R2_MAX 8
#ifdef ADRENO_GPU
#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
// Short alias for the Qualcomm shuffle builtin with a fixed width argument.
// NOTE(review): the trailing 0.0f is presumably the value returned for an
// out-of-range source lane, which would tie this macro to float operands --
// confirm against the cl_qcom_subgroup_shuffle documentation.
#define sub_group_shuffle_xor(val, mask) qcom_sub_group_shuffle_xor((val), (mask), CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.0f)
// Lane-split decode kernel, two src0 rows per sub-group.
// The sub-group is split into two 32-lane halves (lid >> 5); each half owns
// one src0 row (r0_base + subhalf) and accumulates its dot product against up
// to r2 src1 vectors, so every src0 chunk that is loaded is reused r2 times.
// get_group_id(0) selects the row pair; get_group_id(2) is the flattened
// (i12_kv, i13_kv) index over src0's ne02 x ne03 matrices.
// Expectations visible in the code:
//   - r2 <= MUL_MAT_F16_F32_L4_DR_LS_R2_MAX (larger q are never computed);
//   - ne11 == 1 (src1 is addressed without any i11 term);
//   - src0 and src1 share the same index in dim 3 (r3 is not read);
//   - only ne00/4 whole chunks are processed.
// NOTE(review): subhalf is 0 or 1 only for a 64-lane sub-group, which is
// presumably what REQD_SUBGROUP_SIZE_64 enforces -- confirm.
// nb00, ne10, ne11, nb10, nb11 and r3 are accepted but not read.
REQD_SUBGROUP_SIZE_64
kernel void kernel_mul_mat_f16_f32_l4_dr_ls(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
// Apply the byte offsets of the three buffers.
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
// First of the two src0 rows handled by this work-group.
const int r0_base = get_group_id(0) * 2;
const int kv_grp = get_group_id(2); // KV head group; im = kv_grp*r2 + q
const int i12_kv = kv_grp % ne02;
const int i13_kv = kv_grp / ne02;
const int lid = get_sub_group_local_id();
const int subhalf = lid >> 5; // 0 or 1 (which K row in the WG)
const int intra = lid & 31; // 0..31 (lane within the half)
const int r0 = r0_base + subhalf;
const int r0c = r0 < ne01 ? r0 : 0; // clamp OOB to row 0; skip write below
// K row pointer for this lane (one K row per half-wave).
const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03;
global half4 * x4 = (global half4 *)(src0 + k_off);
// One src1 pointer per q slot. Pointers are formed for all R2_MAX slots, but
// only those with q < r2 are dereferenced in the loops below.
global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
const int i12_q = i12_kv*r2 + q;
const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13;
y4[q] = (global float4 *)(src1 + q_off);
}
// Per-lane partial dot product for each q slot.
float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
partial[q] = 0.0f;
}
const int n_chunks = ne00 / 4;
// The 32 lanes of a half stride over the row in 4-element chunks; the src0
// chunk is converted once and reused for every active q.
for (int i = intra; i < n_chunks; i += 32) {
float4 k = convert_float4(x4[i]);
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
float4 v = y4[q][i];
partial[q] = mad(k.s0, v.s0, partial[q]);
partial[q] = mad(k.s1, v.s1, partial[q]);
partial[q] = mad(k.s2, v.s2, partial[q]);
partial[q] = mad(k.s3, v.s3, partial[q]);
}
}
}
// half-wave reduction
// XOR masks 1..16 only exchange data between lanes whose ids differ in the low
// five bits, so the two halves never mix. The q < r2 guard depends only on a
// kernel argument, so all lanes take the same branch around the shuffles.
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
partial[q] += sub_group_shuffle_xor(partial[q], 1u);
partial[q] += sub_group_shuffle_xor(partial[q], 2u);
partial[q] += sub_group_shuffle_xor(partial[q], 4u);
partial[q] += sub_group_shuffle_xor(partial[q], 8u);
partial[q] += sub_group_shuffle_xor(partial[q], 16u);
}
}
// The first lane of each half stores the r2 results of its row; halves whose
// row was clamped (r0 >= ne01) store nothing.
if (intra == 0 && r0 < ne01) {
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
const int im = i12_kv*r2 + q + i13_kv*ne12;
// NOTE(review): this index is evaluated in int arithmetic.
dst[im*ne1*ne0 + r0] = partial[q];
}
}
}
}
// Lane-split decode kernel, four src0 rows per sub-group.
// Same scheme as kernel_mul_mat_f16_f32_l4_dr_ls, but the sub-group is split
// into 16-lane quarters (lid >> 4): each quarter owns one src0 row
// (r0_base + subq) and accumulates its dot product against up to r2 src1
// vectors. get_group_id(0) selects the group of four rows; get_group_id(2) is
// the flattened (i12_kv, i13_kv) index over src0's ne02 x ne03 matrices.
// Expectations visible in the code:
//   - r2 <= MUL_MAT_F16_F32_L4_DR_LS_R2_MAX (larger q are never computed);
//   - ne11 == 1 (src1 is addressed without any i11 term);
//   - src0 and src1 share the same index in dim 3 (r3 is not read);
//   - only ne00/4 whole chunks are processed.
// NOTE(review): subq is 0..3 only for a 64-lane sub-group, which is
// presumably what REQD_SUBGROUP_SIZE_64 enforces -- confirm.
// nb00, ne10, ne11, nb10, nb11 and r3 are accepted but not read.
REQD_SUBGROUP_SIZE_64
kernel void kernel_mul_mat_f16_f32_l4_dr_lq(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
// Apply the byte offsets of the three buffers.
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
// First of the four src0 rows handled by this work-group.
const int r0_base = get_group_id(0) * 4;
const int kv_grp = get_group_id(2);
const int i12_kv = kv_grp % ne02;
const int i13_kv = kv_grp / ne02;
const int lid = get_sub_group_local_id();
const int subq = lid >> 4; // 0..3 (which K row)
const int intra = lid & 15; // 0..15 (lane within quarter)
const int r0 = r0_base + subq;
// Rows past ne01 read row 0 instead; their result is not stored.
const int r0c = r0 < ne01 ? r0 : 0;
// src0 row pointer for this lane (one row per quarter).
const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03;
global half4 * x4 = (global half4 *)(src0 + k_off);
// One src1 pointer per q slot. Pointers are formed for all R2_MAX slots, but
// only those with q < r2 are dereferenced in the loops below.
global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
const int i12_q = i12_kv*r2 + q;
const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13;
y4[q] = (global float4 *)(src1 + q_off);
}
// Per-lane partial dot product for each q slot.
float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
partial[q] = 0.0f;
}
const int n_chunks = ne00 / 4;
// The 16 lanes of a quarter stride over the row in 4-element chunks; the src0
// chunk is converted once and reused for every active q.
for (int i = intra; i < n_chunks; i += 16) {
float4 k = convert_float4(x4[i]);
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
float4 v = y4[q][i];
partial[q] = mad(k.s0, v.s0, partial[q]);
partial[q] = mad(k.s1, v.s1, partial[q]);
partial[q] = mad(k.s2, v.s2, partial[q]);
partial[q] = mad(k.s3, v.s3, partial[q]);
}
}
}
// quarter-wave reduction
// XOR masks 1..8 only exchange data between lanes whose ids differ in the low
// four bits, so the quarters never mix. The q < r2 guard depends only on a
// kernel argument, so all lanes take the same branch around the shuffles.
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
partial[q] += sub_group_shuffle_xor(partial[q], 1u);
partial[q] += sub_group_shuffle_xor(partial[q], 2u);
partial[q] += sub_group_shuffle_xor(partial[q], 4u);
partial[q] += sub_group_shuffle_xor(partial[q], 8u);
}
}
// The first lane of each quarter stores the r2 results of its row; quarters
// whose row was clamped (r0 >= ne01) store nothing.
if (intra == 0 && r0 < ne01) {
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
const int im = i12_kv*r2 + q + i13_kv*ne12;
// NOTE(review): this index is evaluated in int arithmetic.
dst[im*ne1*ne0 + r0] = partial[q];
}
}
}
}
#endif // ADRENO_GPU