From 9f785839a306dfa83cccca49a709a4f95acd8d95 Mon Sep 17 00:00:00 2001
From: lhez
Date: Tue, 16 Jun 2026 23:21:26 -0700
Subject: [PATCH] opencl: optimize mul_mat_f16_f32_l4 for decode (llama/24504)

---
Reviewer notes (free-form text placed here is not part of the applied change):
  * Adds three variants of kernel_mul_mat_f16_f32_l4 that the host selects
    only when ne11 == 1.
  * _l4_dr: every work-group covers 4 consecutive src0 rows
    (MUL_MAT_F16_F32_L4_DR_NDST); the host launches CEIL_DIV(ne01, 4) groups
    in x with the same constant (NDST_DR).
  * _l4_dr_ls / _l4_dr_lq: created and selected only for ADRENO. Lanes are
    split by lid >> 5 (2 rows) or lid >> 4 (4 rows) per work-group, and each
    src0 row is combined with up to 8 src1 batches (R2_MAX). The launch grid
    z-dimension is ne02*ne03 for these two, ne12*ne13 for the others.
  * NOTE(review): the lane split presumably relies on REQD_SUBGROUP_SIZE_64
    fixing a 64-lane subgroup; the macro definition is outside this patch.

 ggml/src/ggml-opencl/ggml-opencl.cpp          |  58 +++-
 .../ggml-opencl/kernels/mul_mv_f16_f32_l4.cl  | 296 ++++++++++++++++++
 2 files changed, 348 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index ca2002424..5ad8d76fa 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -564,6 +564,9 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32_1row;
     cl_kernel kernel_mul_mat_f16_f32;
     cl_kernel kernel_mul_mat_f16_f32_l4;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr_ls;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr_lq;
     cl_kernel kernel_mul_mat_f16_f32_tiled;
     cl_kernel kernel_adreno_xmem_pack_src_f32;
     cl_kernel kernel_adreno_xmem_prepack_weight_f16;
@@ -1787,6 +1790,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 
         CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr", &err), err));
+        if (backend_ctx->gpu_family == ADRENO) { // the _ls/_lq kernels sit inside #ifdef ADRENO_GPU in the .cl source
+            CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_ls", &err), err));
+            CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_lq", &err), err));
+        }
         GGML_LOG_CONT(".");
     }
 
@@ -14570,11 +14578,31 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             }
 
             if (src1t == GGML_TYPE_F32) {
+                // heuristic for packing more work for Adreno
+                const bool adreno_use_lane_split =
+                    backend_ctx->gpu_family == ADRENO &&
+                    ne11 == 1 &&
+                    ne01 >= 8 &&
+                    ne00 % 4 == 0 &&
+                    r3 == 1 && r2 >= 1 && r2 <= 8 && // upper bound equals MUL_MAT_F16_F32_L4_DR_LS_R2_MAX in the .cl source
+                    (ne12 % r2) == 0;
+
                 if (ne11 * ne12 < 4) {
                     kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
+                } else if (adreno_use_lane_split && ne00 >= 64 && ne00 <= 128) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq;
+                    nrows = 1;
+                } else if (adreno_use_lane_split && r2 >= 2 && ne00 > 128 && ne00 <= 256) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls;
+                    nrows = 1;
                 } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
-                    nrows = ne11;
+                    if (ne11 == 1) {
+                        kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr;
+                        nrows = 1; // not used by this kernel
+                    } else {
+                        kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
+                        nrows = ne11;
+                    }
                 } else {
                     kernel = backend_ctx->kernel_mul_mat_f16_f32;
                     nrows = 4;
@@ -15353,12 +15381,30 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
-        int64_t ny = (ne11 + nrows - 1)/nrows;
+        if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr) {
+            const int NDST_DR = 4; // rows per work-group; same value as MUL_MAT_F16_F32_L4_DR_NDST in the kernel source
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, NDST_DR)*nth0, (size_t)nth1, (size_t)ne12*ne13};
+            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-        size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
-        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls) {
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 2)*nth0, (size_t)nth1, (size_t)ne02*ne03}; // kernel takes 2 rows per group in x and decodes z with ne02
+            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq) {
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 4)*nth0, (size_t)nth1, (size_t)ne02*ne03}; // kernel takes 4 rows per group in x and decodes z with ne02
+            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else {
+            int64_t ny = (ne11 + nrows - 1)/nrows;
+
+            size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
+            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        }
     }
 }
 
diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl b/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl
index cdf8197c4..a639ec664 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl
@@ -82,3 +82,299 @@ kernel void kernel_mul_mat_f16_f32_l4(
         }
     }
 }
+
+// Each subgroup produces DR_NDST outputs, assumes ne11 == 1
+#define MUL_MAT_F16_F32_L4_DR_NDST 4
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_f16_f32_l4_dr(
+    global char * src0,
+    ulong offset0,
+    global char * src1,
+    ulong offset1,
+    global float * dst,
+    ulong offsetd,
+    int ne00,
+    int ne01,
+    int ne02,
+    ulong nb00,
+    ulong nb01,
+    ulong nb02,
+    ulong nb03,
+    int ne10,
+    int ne11,
+    int ne12,
+    ulong nb10,
+    ulong nb11,
+    ulong nb12,
+    ulong nb13,
+    int ne0,
+    int ne1,
+    int r2,
+    int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    const int r0_base = get_group_id(0) * MUL_MAT_F16_F32_L4_DR_NDST; // first src0 row handled by this work-group
+    const int im = get_group_id(2); // flattened src1 batch index, split into i12/i13 below
+
+    const int i12 = im % ne12;
+    const int i13 = im / ne12;
+
+    // assume ne11 == 1
+    const ulong offset_src1 = i12*nb12 + i13*nb13;
+    global float4 * y4 = (global float4 *)(src1 + offset_src1);
+
+    global half4 * x4[MUL_MAT_F16_F32_L4_DR_NDST];
+    float sumf[MUL_MAT_F16_F32_L4_DR_NDST];
+
+    const ulong k_head_off = (i12/r2)*nb02 + (i13/r3)*nb03; // r2 (r3) consecutive src1 batches map to the same src0 batch
+
+    #pragma unroll
+    for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
+        int r0 = r0_base + n;
+        int r0c = r0 < ne01 ? r0 : 0; // rows past ne01 read row 0 instead; their result is never stored
+        ulong off = (ulong)r0c*nb01 + k_head_off;
+        x4[n] = (global half4 *)(src0 + off);
+        sumf[n] = 0.0f;
+    }
+
+    const int n_chunks = ne00 / 4; // row length counted in 4-element vectors
+    const int sg_size = get_max_sub_group_size();
+    const int lid = get_sub_group_local_id();
+
+    for (int i = lid; i < n_chunks; i += sg_size) { // lanes interleave over the chunks
+        float4 q = y4[i]; // loaded once, reused for every one of the DR_NDST rows
+        #pragma unroll
+        for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
+            float4 k = convert_float4(x4[n][i]);
+            sumf[n] = mad(k.s0, q.s0, sumf[n]);
+            sumf[n] = mad(k.s1, q.s1, sumf[n]);
+            sumf[n] = mad(k.s2, q.s2, sumf[n]);
+            sumf[n] = mad(k.s3, q.s3, sumf[n]);
+        }
+    }
+
+    #pragma unroll
+    for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
+        float reduced = sub_group_reduce_add(sumf[n]);
+        int r0 = r0_base + n;
+        if (lid == 0 && r0 < ne01) { // lane 0 stores; rows past ne01 are dropped
+            dst[im*ne1*ne0 + r0] = reduced;
+        }
+    }
+}
+
+// Kernels for decoding, Adreno only for now
+#define MUL_MAT_F16_F32_L4_DR_LS_R2_MAX 8
+
+#ifdef ADRENO_GPU
+#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
+#define sub_group_shuffle_xor(val, mask) qcom_sub_group_shuffle_xor((val), (mask), CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.0f)
+
+REQD_SUBGROUP_SIZE_64
+kernel void kernel_mul_mat_f16_f32_l4_dr_ls(
+    global char * src0,
+    ulong offset0,
+    global char * src1,
+    ulong offset1,
+    global float * dst,
+    ulong offsetd,
+    int ne00,
+    int ne01,
+    int ne02,
+    ulong nb00,
+    ulong nb01,
+    ulong nb02,
+    ulong nb03,
+    int ne10,
+    int ne11,
+    int ne12,
+    ulong nb10,
+    ulong nb11,
+    ulong nb12,
+    ulong nb13,
+    int ne0,
+    int ne1,
+    int r2,
+    int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    const int r0_base = get_group_id(0) * 2; // 2 src0 rows per work-group, one per value of subhalf
+    const int kv_grp = get_group_id(2); // KV head group; im = kv_grp*r2 + q
+
+    const int i12_kv = kv_grp % ne02;
+    const int i13_kv = kv_grp / ne02;
+
+    const int lid = get_sub_group_local_id();
+    const int subhalf = lid >> 5; // 0 or 1 (which K row in the WG)
+    const int intra = lid & 31; // 0..31 (lane within the half)
+
+    const int r0 = r0_base + subhalf;
+    const int r0c = r0 < ne01 ? r0 : 0; // clamp OOB to row 0; skip write below
+
+    // K row pointer for this lane (one K row per half-wave).
+    const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03;
+    global half4 * x4 = (global half4 *)(src0 + k_off);
+
+    global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        const int i12_q = i12_kv*r2 + q; // pointers are formed for all R2_MAX slots; only q < r2 is dereferenced below
+        const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13;
+        y4[q] = (global float4 *)(src1 + q_off);
+    }
+
+    float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        partial[q] = 0.0f;
+    }
+
+    const int n_chunks = ne00 / 4; // row length counted in 4-element vectors
+
+    for (int i = intra; i < n_chunks; i += 32) { // the 32 lanes of a half interleave over one row
+        float4 k = convert_float4(x4[i]); // one src0 load is reused for up to r2 src1 batches
+
+        #pragma unroll
+        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+            if (q < r2) { // r2 is a runtime value; the loop bound is the compile-time maximum
+                float4 v = y4[q][i];
+                partial[q] = mad(k.s0, v.s0, partial[q]);
+                partial[q] = mad(k.s1, v.s1, partial[q]);
+                partial[q] = mad(k.s2, v.s2, partial[q]);
+                partial[q] = mad(k.s3, v.s3, partial[q]);
+            }
+        }
+    }
+
+    // half-wave reduction
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        if (q < r2) {
+            partial[q] += sub_group_shuffle_xor(partial[q], 1u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 2u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 4u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 8u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 16u); // masks below 32 leave subhalf unchanged
+        }
+    }
+
+    if (intra == 0 && r0 < ne01) {
+        #pragma unroll
+        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+            if (q < r2) {
+                const int im = i12_kv*r2 + q + i13_kv*ne12; // flattened dst batch index
+                dst[im*ne1*ne0 + r0] = partial[q];
+            }
+        }
+    }
+}
+
+REQD_SUBGROUP_SIZE_64
+kernel void kernel_mul_mat_f16_f32_l4_dr_lq(
+    global char * src0,
+    ulong offset0,
+    global char * src1,
+    ulong offset1,
+    global float * dst,
+    ulong offsetd,
+    int ne00,
+    int ne01,
+    int ne02,
+    ulong nb00,
+    ulong nb01,
+    ulong nb02,
+    ulong nb03,
+    int ne10,
+    int ne11,
+    int ne12,
+    ulong nb10,
+    ulong nb11,
+    ulong nb12,
+    ulong nb13,
+    int ne0,
+    int ne1,
+    int r2,
+    int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    const int r0_base = get_group_id(0) * 4; // 4 src0 rows per work-group, one per value of subq
+    const int kv_grp = get_group_id(2);
+
+    const int i12_kv = kv_grp % ne02;
+    const int i13_kv = kv_grp / ne02;
+
+    const int lid = get_sub_group_local_id();
+    const int subq = lid >> 4; // 0..3 (which K row)
+    const int intra = lid & 15; // 0..15 (lane within quarter)
+
+    const int r0 = r0_base + subq;
+    const int r0c = r0 < ne01 ? r0 : 0; // rows past ne01 read row 0 instead; the store below is skipped
+
+    const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03;
+    global half4 * x4 = (global half4 *)(src0 + k_off);
+
+    global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        const int i12_q = i12_kv*r2 + q; // pointers are formed for all R2_MAX slots; only q < r2 is dereferenced below
+        const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13;
+        y4[q] = (global float4 *)(src1 + q_off);
+    }
+
+    float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        partial[q] = 0.0f;
+    }
+
+    const int n_chunks = ne00 / 4; // row length counted in 4-element vectors
+
+    for (int i = intra; i < n_chunks; i += 16) { // the 16 lanes of a quarter interleave over one row
+        float4 k = convert_float4(x4[i]); // one src0 load is reused for up to r2 src1 batches
+
+        #pragma unroll
+        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+            if (q < r2) {
+                float4 v = y4[q][i];
+                partial[q] = mad(k.s0, v.s0, partial[q]);
+                partial[q] = mad(k.s1, v.s1, partial[q]);
+                partial[q] = mad(k.s2, v.s2, partial[q]);
+                partial[q] = mad(k.s3, v.s3, partial[q]);
+            }
+        }
+    }
+
+    // quarter-wave reduction
+    #pragma unroll
+    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+        if (q < r2) {
+            partial[q] += sub_group_shuffle_xor(partial[q], 1u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 2u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 4u);
+            partial[q] += sub_group_shuffle_xor(partial[q], 8u); // masks below 16 leave subq unchanged
+        }
+    }
+
+    if (intra == 0 && r0 < ne01) {
+        #pragma unroll
+        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
+            if (q < r2) {
+                const int im = i12_kv*r2 + q + i13_kv*ne12; // flattened dst batch index
+                dst[im*ne1*ne0 + r0] = partial[q];
+            }
+        }
+    }
+}
+#endif // ADRENO_GPU