opencl: optimize mul_mat_f16_f32_l4 for decode (llama/24504)
This commit is contained in:
parent
5fa14e9931
commit
9f785839a3
|
|
@ -564,6 +564,9 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_mul_mat_f16_f32_1row;
|
||||
cl_kernel kernel_mul_mat_f16_f32;
|
||||
cl_kernel kernel_mul_mat_f16_f32_l4;
|
||||
cl_kernel kernel_mul_mat_f16_f32_l4_dr;
|
||||
cl_kernel kernel_mul_mat_f16_f32_l4_dr_ls;
|
||||
cl_kernel kernel_mul_mat_f16_f32_l4_dr_lq;
|
||||
cl_kernel kernel_mul_mat_f16_f32_tiled;
|
||||
cl_kernel kernel_adreno_xmem_pack_src_f32;
|
||||
cl_kernel kernel_adreno_xmem_prepack_weight_f16;
|
||||
|
|
@ -1787,6 +1790,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
|||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr", &err), err));
|
||||
if (backend_ctx->gpu_family == ADRENO) {
|
||||
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_ls", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_lq", &err), err));
|
||||
}
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
|
|
@ -14570,11 +14578,31 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
}
|
||||
|
||||
if (src1t == GGML_TYPE_F32) {
|
||||
// heuristic for packing more work for Adreno
|
||||
const bool adreno_use_lane_split =
|
||||
backend_ctx->gpu_family == ADRENO &&
|
||||
ne11 == 1 &&
|
||||
ne01 >= 8 &&
|
||||
ne00 % 4 == 0 &&
|
||||
r3 == 1 && r2 >= 1 && r2 <= 8 &&
|
||||
(ne12 % r2) == 0;
|
||||
|
||||
if (ne11 * ne12 < 4) {
|
||||
kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
|
||||
} else if (adreno_use_lane_split && ne00 >= 64 && ne00 <= 128) {
|
||||
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq;
|
||||
nrows = 1;
|
||||
} else if (adreno_use_lane_split && r2 >= 2 && ne00 > 128 && ne00 <= 256) {
|
||||
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls;
|
||||
nrows = 1;
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
|
||||
nrows = ne11;
|
||||
if (ne11 == 1) {
|
||||
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr;
|
||||
nrows = 1; // not used by this kernel
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
|
||||
nrows = ne11;
|
||||
}
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_mul_mat_f16_f32;
|
||||
nrows = 4;
|
||||
|
|
@ -15353,12 +15381,30 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
} else {
|
||||
int64_t ny = (ne11 + nrows - 1)/nrows;
|
||||
if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr) {
|
||||
const int NDST_DR = 4;
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, NDST_DR)*nth0, (size_t)nth1, (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
} else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls) {
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 2)*nth0, (size_t)nth1, (size_t)ne02*ne03};
|
||||
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
} else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq) {
|
||||
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 4)*nth0, (size_t)nth1, (size_t)ne02*ne03};
|
||||
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
} else {
|
||||
int64_t ny = (ne11 + nrows - 1)/nrows;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -82,3 +82,299 @@ kernel void kernel_mul_mat_f16_f32_l4(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Variant of kernel_mul_mat_f16_f32_l4 for the case ne11 == 1 (src1 has a
// single row per batch plane). Each subgroup accumulates
// MUL_MAT_F16_F32_L4_DR_NDST consecutive src0 rows against the same src1
// vector, so every float4 of src1 is loaded once and reused for all rows.
#define MUL_MAT_F16_F32_L4_DR_NDST 4

#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f32_l4_dr(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    // Shift the base pointers by the byte offsets supplied by the host.
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = (global float *)((global char *)dst + offsetd);

    // The row base comes from the work-group id along dim 0.
    // NOTE(review): this presumably relies on one subgroup per work-group;
    // confirm against the host-side launch configuration.
    const int first_row = get_group_id(0) * MUL_MAT_F16_F32_L4_DR_NDST;
    const int im        = get_group_id(2);

    // Split the flattened batch index into the src1 dim-2 / dim-3 indices.
    const int i13 = im / ne12;
    const int i12 = im % ne12;

    // ne11 == 1 is assumed, so no row term enters the src1 offset.
    global float4 * y4 = (global float4 *)(src1 + (i12*nb12 + i13*nb13));

    // src0 plane for this batch: r2 (r3) consecutive src1 planes share one
    // src0 plane along dim 2 (dim 3).
    const ulong src0_plane_off = (i12/r2)*nb02 + (i13/r3)*nb03;

    global half4 * x4[MUL_MAT_F16_F32_L4_DR_NDST];
    float sumf[MUL_MAT_F16_F32_L4_DR_NDST];

    #pragma unroll
    for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
        const int row = first_row + n;
        // Rows past ne01 are redirected to row 0 so the loads stay in
        // bounds; their sums are never written back.
        int row_rd = row;
        if (row >= ne01) {
            row_rd = 0;
        }
        x4[n]   = (global half4 *)(src0 + ((ulong)row_rd*nb01 + src0_plane_off));
        sumf[n] = 0.0f;
    }

    const int lid    = get_sub_group_local_id();
    const int stride = get_max_sub_group_size();
    const int n_vec4 = ne00 / 4;

    // Each lane walks the row in float4 chunks, strided by the subgroup size.
    for (int i = lid; i < n_vec4; i += stride) {
        const float4 q = y4[i];
        #pragma unroll
        for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
            const float4 k = convert_float4(x4[n][i]);
            // Accumulate components in s0..s3 order.
            sumf[n] = mad(k.s3, q.s3,
                      mad(k.s2, q.s2,
                      mad(k.s1, q.s1,
                      mad(k.s0, q.s0, sumf[n]))));
        }
    }

    #pragma unroll
    for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
        // The reduction is executed by every lane; only lane 0 stores.
        const float total = sub_group_reduce_add(sumf[n]);
        const int   row   = first_row + n;
        if (row < ne01 && lid == 0) {
            dst[im*ne1*ne0 + row] = total;
        }
    }
}
|
||||
|
||||
// Lane-split kernels for decoding (single src1 row); Adreno only for now.
|
||||
// Compile-time upper bound on r2 for the lane-split kernels: it sizes their
// per-query src1 pointer arrays and accumulator arrays, and bounds their
// unrolled loops over q.
#define MUL_MAT_F16_F32_L4_DR_LS_R2_MAX 8
|
||||
|
||||
// Everything from here to the matching #endif is compiled only when
// ADRENO_GPU is defined.
#ifdef ADRENO_GPU
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
|
||||
// Map sub_group_shuffle_xor(val, mask) onto the Qualcomm extension builtin.
// NOTE(review): the two trailing arguments look like the shuffle width (whole
// wave) and a fallback value -- confirm against the cl_qcom_subgroup_shuffle
// extension documentation.
#define sub_group_shuffle_xor(val, mask) qcom_sub_group_shuffle_xor((val), (mask), CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.0f)
|
||||
|
||||
// Lane-split kernel, two src0 rows per subgroup.
//
// The subgroup is split into two groups of 32 lanes; lanes 0..31 work on row
// get_group_id(0)*2 and lanes 32..63 on the following row. Each src0 chunk is
// loaded once and multiplied against up to r2 src1 vectors (the src1 planes
// that map onto the same src0 plane), giving r2 outputs per row.
// Work-group dim 2 enumerates src0 planes (ne02 * ne03), not src1 planes.
// Assumes ne11 == 1 and r2 <= MUL_MAT_F16_F32_L4_DR_LS_R2_MAX.
// NOTE(review): r3 is not used in the body; the host presumably restricts
// this kernel to r3 == 1 -- confirm at the call site.
REQD_SUBGROUP_SIZE_64
kernel void kernel_mul_mat_f16_f32_l4_dr_ls(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    // Shift the base pointers by the byte offsets supplied by the host.
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = (global float *)((global char *)dst + offsetd);

    const int first_row = get_group_id(0) * 2;
    const int kv_grp    = get_group_id(2); // src0 plane index; im = kv_grp*r2 + q

    const int i13_kv = kv_grp / ne02;
    const int i12_kv = kv_grp % ne02;

    const int lid       = get_sub_group_local_id();
    const int half_id   = lid / 32; // 0 or 1: which of the two rows
    const int half_lane = lid % 32; // 0..31: lane inside the half

    const int row = first_row + half_id;
    // Out-of-range rows read row 0 instead; their result is never stored.
    int row_rd = row;
    if (row >= ne01) {
        row_rd = 0;
    }

    // One src0 row per half-wave.
    global half4 * x4 = (global half4 *)(src0 +
        ((ulong)row_rd*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03));

    // One src1 pointer and one accumulator per query slot. Slots with
    // q >= r2 get a pointer computed but are never dereferenced.
    global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
    float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
    #pragma unroll
    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
        const int i12_q = i12_kv*r2 + q;
        y4[q]      = (global float4 *)(src1 + ((ulong)i12_q*nb12 + (ulong)i13_kv*nb13));
        partial[q] = 0.0f;
    }

    const int n_vec4 = ne00 / 4;

    // Each lane of a half strides over its row in steps of 32 chunks.
    for (int i = half_lane; i < n_vec4; i += 32) {
        const float4 k = convert_float4(x4[i]);

        #pragma unroll
        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
            if (q >= r2) {
                continue;
            }
            const float4 v = y4[q][i];
            // Accumulate components in s0..s3 order.
            partial[q] = mad(k.s3, v.s3,
                         mad(k.s2, v.s2,
                         mad(k.s1, v.s1,
                         mad(k.s0, v.s0, partial[q]))));
        }
    }

    // Half-wave reduction: masks 1..16 only flip the low five lane-id bits,
    // so the two 32-lane halves are summed independently.
    #pragma unroll
    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
        if (q >= r2) {
            continue;
        }
        float acc = partial[q];
        acc += sub_group_shuffle_xor(acc, 1u);
        acc += sub_group_shuffle_xor(acc, 2u);
        acc += sub_group_shuffle_xor(acc, 4u);
        acc += sub_group_shuffle_xor(acc, 8u);
        acc += sub_group_shuffle_xor(acc, 16u);
        partial[q] = acc;
    }

    // Only the first lane of each half stores, and only for valid rows.
    if (half_lane != 0 || row >= ne01) {
        return;
    }

    #pragma unroll
    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
        if (q >= r2) {
            continue;
        }
        const int im = i12_kv*r2 + q + i13_kv*ne12;
        dst[im*ne1*ne0 + row] = partial[q];
    }
}
|
||||
|
||||
// Lane-split kernel, four src0 rows per subgroup.
//
// The subgroup is split into four groups of 16 lanes; group g (lanes
// 16*g .. 16*g+15) works on row get_group_id(0)*4 + g. Each src0 chunk is
// loaded once and multiplied against up to r2 src1 vectors (the src1 planes
// that map onto the same src0 plane), giving r2 outputs per row.
// Work-group dim 2 enumerates src0 planes (ne02 * ne03), not src1 planes.
// Assumes ne11 == 1 and r2 <= MUL_MAT_F16_F32_L4_DR_LS_R2_MAX.
// NOTE(review): r3 is not used in the body; the host presumably restricts
// this kernel to r3 == 1 -- confirm at the call site.
REQD_SUBGROUP_SIZE_64
kernel void kernel_mul_mat_f16_f32_l4_dr_lq(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    // Shift the base pointers by the byte offsets supplied by the host.
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = (global float *)((global char *)dst + offsetd);

    const int first_row = get_group_id(0) * 4;
    const int kv_grp    = get_group_id(2); // src0 plane index

    const int i13_kv = kv_grp / ne02;
    const int i12_kv = kv_grp % ne02;

    const int lid          = get_sub_group_local_id();
    const int quarter_id   = lid / 16; // 0..3: which of the four rows
    const int quarter_lane = lid % 16; // 0..15: lane inside the quarter

    const int row = first_row + quarter_id;
    // Out-of-range rows read row 0 instead; their result is never stored.
    int row_rd = row;
    if (row >= ne01) {
        row_rd = 0;
    }

    // One src0 row per quarter-wave.
    global half4 * x4 = (global half4 *)(src0 +
        ((ulong)row_rd*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03));

    // One src1 pointer and one accumulator per query slot. Slots with
    // q >= r2 get a pointer computed but are never dereferenced.
    global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
    float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
    #pragma unroll
    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
        const int i12_q = i12_kv*r2 + q;
        y4[q]      = (global float4 *)(src1 + ((ulong)i12_q*nb12 + (ulong)i13_kv*nb13));
        partial[q] = 0.0f;
    }

    const int n_vec4 = ne00 / 4;

    // Each lane of a quarter strides over its row in steps of 16 chunks.
    for (int i = quarter_lane; i < n_vec4; i += 16) {
        const float4 k = convert_float4(x4[i]);

        #pragma unroll
        for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
            if (q >= r2) {
                continue;
            }
            const float4 v = y4[q][i];
            // Accumulate components in s0..s3 order.
            partial[q] = mad(k.s3, v.s3,
                         mad(k.s2, v.s2,
                         mad(k.s1, v.s1,
                         mad(k.s0, v.s0, partial[q]))));
        }
    }

    // Quarter-wave reduction: masks 1..8 only flip the low four lane-id
    // bits, so each group of 16 lanes is summed independently.
    #pragma unroll
    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
        if (q >= r2) {
            continue;
        }
        float acc = partial[q];
        acc += sub_group_shuffle_xor(acc, 1u);
        acc += sub_group_shuffle_xor(acc, 2u);
        acc += sub_group_shuffle_xor(acc, 4u);
        acc += sub_group_shuffle_xor(acc, 8u);
        partial[q] = acc;
    }

    // Only the first lane of each quarter stores, and only for valid rows.
    if (quarter_lane != 0 || row >= ne01) {
        return;
    }

    #pragma unroll
    for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
        if (q >= r2) {
            continue;
        }
        const int im = i12_kv*r2 + q + i13_kv*ne12;
        dst[im*ne1*ne0 + row] = partial[q];
    }
}
|
||||
#endif // ADRENO_GPU
|
||||
|
|
|
|||
Loading…
Reference in New Issue