opencl: improve get_rows, cpy, concat and q6_k flat gemv (llama/24160)
* opencl: allow multiple workgroups for large rows
* opencl: improve small cpy
* opencl: packed concat for small input
* opencl: tweak flat q6_K gemv, increase N_DST and remap threads
This commit is contained in:
parent
5a1feed8ca
commit
a87e950a06
|
|
@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
|
||||
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
||||
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
||||
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
|
||||
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
|
||||
cl_kernel kernel_mul_mat_f32_f32;
|
||||
cl_kernel kernel_mul_mat_f16_f16;
|
||||
cl_kernel kernel_mul_mat_f16_f32_1row;
|
||||
|
|
@ -639,7 +639,7 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
|
||||
cl_kernel kernel_upscale;
|
||||
cl_kernel kernel_upscale_bilinear;
|
||||
cl_kernel kernel_concat_f32;
|
||||
cl_kernel kernel_concat_f32, kernel_concat_f32_pack;
|
||||
cl_kernel kernel_conv_2d_f16;
|
||||
cl_kernel kernel_conv_2d_f32;
|
||||
cl_kernel kernel_conv_2d_f16_f32;
|
||||
|
|
@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
|||
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
|
@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
|||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
|
@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|||
nth *= 2;
|
||||
}
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
|
||||
int nchunks = 1;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
const int chunk_target = nth * 4;
|
||||
nchunks = (ne00 + chunk_target - 1) / chunk_target;
|
||||
nchunks = MAX(1, MIN(nchunks, 64));
|
||||
}
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
|
@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|||
|
||||
int nth = MIN(64, ne0);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_concat_f32;
|
||||
const bool concat_pack = (dim == 0 && ne0 < 32);
|
||||
cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
|
||||
: backend_ctx->kernel_concat_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
if (concat_pack) {
|
||||
// packed kernel needs the dst dims to unflatten its 1-D row index.
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
const int base = MIN(64, maxwg);
|
||||
const int tpr = MIN(ne0, base); // threads per row
|
||||
const int rpw = MAX(1, base / tpr); // rows per workgroup
|
||||
const int lsz = tpr * rpw;
|
||||
const int nrows = ne1*ne2*ne3;
|
||||
const int nwg = (nrows + rpw - 1) / rpw;
|
||||
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
|
||||
size_t local_work_size[] = {(size_t)lsz, 1, 1};
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
|
||||
} else {
|
||||
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
|
|
@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
nth1 = 2;
|
||||
ndst = 4;
|
||||
ndst = 16;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
|
@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
kernel = backend_ctx->kernel_cpy_f32_f16;
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_cpy_f32_f32;
|
||||
kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
|
||||
: backend_ctx->kernel_cpy_f32_f32;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
|
|
@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
|
||||
|
||||
const int nth = MIN(64, ne00);
|
||||
if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
|
||||
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
const int base = MIN(64, maxwg);
|
||||
const int tpr = MIN(ne00, base); // threads per row
|
||||
const int rpw = MAX(1, base / tpr); // rows per workgroup
|
||||
const int lsz = tpr * rpw; // <= base <= maxwg
|
||||
const int nrows = ne01*ne02*ne03;
|
||||
const int nwg = (nrows + rpw - 1) / rpw;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
|
||||
size_t local_work_size[] = {(size_t)lsz, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
|
||||
} else {
|
||||
const int nth = MIN(64, ne00);
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
|
|
|
|||
|
|
@ -49,3 +49,70 @@ kernel void kernel_concat_f32(
|
|||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_concat_f32_pack(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global const char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
int dim,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int lsz = get_local_size(0);
|
||||
int tpr = min(ne0, lsz); // threads per row
|
||||
int rpw = lsz / tpr; // rows per workgroup
|
||||
int lid = get_local_id(0);
|
||||
int row = get_group_id(0)*rpw + lid / tpr;
|
||||
int lane = lid - (lid / tpr) * tpr;
|
||||
|
||||
int nrows = ne1*ne2*ne3;
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i1 = row % ne1;
|
||||
int t = row / ne1;
|
||||
int i2 = t % ne2;
|
||||
int i3 = t / ne2;
|
||||
|
||||
int o[4] = {0, 0, 0, 0};
|
||||
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
||||
|
||||
for (int i0 = lane; i0 < ne0; i0 += tpr) {
|
||||
global const float * x;
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||
} else {
|
||||
x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
|
||||
}
|
||||
|
||||
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32(
|
|||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_f32_f32_pack(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int lsz = get_local_size(0);
|
||||
int tpr = min(ne00, lsz); // threads per row
|
||||
int rpw = lsz / tpr; // rows per workgroup
|
||||
int lid = get_local_id(0);
|
||||
int row = get_group_id(0)*rpw + lid / tpr;
|
||||
int lane = lid - (lid / tpr) * tpr;
|
||||
|
||||
int nrows = ne01*ne02*ne03;
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i01 = row % ne01;
|
||||
int t = row / ne01;
|
||||
int i02 = t % ne02;
|
||||
int i03 = t / ne02;
|
||||
|
||||
// linear index of the first element of this row, unflattened over dst dims
|
||||
long n = (long)row * ne00;
|
||||
int i3 = (int)(n / ((long)ne2*ne1*ne0));
|
||||
long rm = n - (long)i3*ne2*ne1*ne0;
|
||||
int i2 = (int)(rm / ((long)ne1*ne0));
|
||||
rm -= (long)i2*ne1*ne0;
|
||||
int i1 = (int)(rm / ne0);
|
||||
int i0 = (int)(rm - (long)i1*ne0);
|
||||
|
||||
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int i00 = lane; i00 < ne00; i00 += tpr) {
|
||||
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_i32_i32(
|
||||
global int * src0,
|
||||
ulong offset0,
|
||||
|
|
|
|||
|
|
@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32(
|
|||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i10 = get_group_id(0);
|
||||
int i11 = get_group_id(1);
|
||||
int i12 = get_group_id(2);
|
||||
int nchunks = get_num_groups(0) / ne10;
|
||||
int g = get_group_id(0);
|
||||
int i10 = g / nchunks;
|
||||
int chunk = g - i10 * nchunks;
|
||||
int i11 = get_group_id(1);
|
||||
int i12 = get_group_id(2);
|
||||
|
||||
int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
int i02 = i11;
|
||||
int i03 = i12;
|
||||
|
||||
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
|
||||
if (ind >= ne00) {
|
||||
return;
|
||||
}
|
||||
((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
|
||||
((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
|
||||
global float * dst_row = (global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1);
|
||||
global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
int span = (ne00 + nchunks - 1) / nchunks;
|
||||
int start = chunk * span;
|
||||
int end = min(start + span, ne00);
|
||||
|
||||
for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) {
|
||||
dst_row[ind] = src_row[ind];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,13 +33,15 @@ inline float block_q_6_K_dot_y_flat(
|
|||
global uchar * blk_qh,
|
||||
global char * blk_scales,
|
||||
global half * blk_d,
|
||||
global float * yy,
|
||||
int ib,
|
||||
int ip,
|
||||
int is,
|
||||
int l0
|
||||
int l0,
|
||||
float4 y0,
|
||||
float4 y1,
|
||||
float4 y2,
|
||||
float4 y3
|
||||
) {
|
||||
int y_offset = 128*ip + l0;
|
||||
int q_offset_l = 64*ip + l0;
|
||||
int q_offset_h = 32*ip + l0;
|
||||
|
||||
|
|
@ -48,36 +50,28 @@ inline float block_q_6_K_dot_y_flat(
|
|||
global uchar * qh = blk_qh + ib*64 + q_offset_h;
|
||||
global char * sc = blk_scales + ib*16 + is;
|
||||
|
||||
global float * y = yy + ib * QK_K + y_offset;
|
||||
|
||||
float dall = blk_d[ib];
|
||||
|
||||
float sumf = 0;
|
||||
float4 sums = {0.f, 0.f, 0.f, 0.f};
|
||||
// Vectorized loads: 3 uchar4 weight loads instead of 12 scalar byte reads.
|
||||
// q_offset_l/h are 4-aligned, so these are aligned vector loads.
|
||||
uchar4 q1v = vload4(0, q1);
|
||||
uchar4 q2v = vload4(0, q2);
|
||||
uchar4 qhv = vload4(0, qh);
|
||||
|
||||
sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f);
|
||||
sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f);
|
||||
sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f);
|
||||
sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f);
|
||||
int4 q1i = convert_int4(q1v);
|
||||
int4 q2i = convert_int4(q2v);
|
||||
int4 qhi = convert_int4(qhv);
|
||||
|
||||
sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f);
|
||||
sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f);
|
||||
sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f);
|
||||
sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f);
|
||||
// Reconstruct the four 6-bit weight groups (low/high nibble of ql OR'd with the
|
||||
// matching 2-bit plane of qh), same arithmetic as the scalar version, then dot()
|
||||
// against the cached activation lanes.
|
||||
float4 w0 = convert_float4((q1i & 0xF) | ((qhi & Q6_K_MASK1) << 4)) - 32.f;
|
||||
float4 w1 = convert_float4((q2i & 0xF) | ((qhi & Q6_K_MASK2) << 2)) - 32.f;
|
||||
float4 w2 = convert_float4((q1i >> 4) | ((qhi & Q6_K_MASK3) )) - 32.f;
|
||||
float4 w3 = convert_float4((q2i >> 4) | ((qhi & Q6_K_MASK4) >> 2)) - 32.f;
|
||||
|
||||
sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f);
|
||||
sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f);
|
||||
sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f);
|
||||
sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f);
|
||||
|
||||
sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f);
|
||||
sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f);
|
||||
sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f);
|
||||
sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f);
|
||||
|
||||
sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
|
||||
|
||||
return sumf;
|
||||
return dall * (dot(y0, w0) * sc[0] + dot(y1, w1) * sc[2] +
|
||||
dot(y2, w2) * sc[4] + dot(y3, w3) * sc[6]);
|
||||
}
|
||||
|
||||
#undef N_DST
|
||||
|
|
@ -89,7 +83,7 @@ inline float block_q_6_K_dot_y_flat(
|
|||
#define N_SIMDGROUP 2
|
||||
#define N_SIMDWIDTH 16
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 4
|
||||
#define N_DST 16
|
||||
#define N_SIMDGROUP 2
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
|
@ -146,49 +140,39 @@ kernel void kernel_mul_mv_q6_K_f32_flat(
|
|||
global half * blk_d = (global half *) src0_d + offset_src0_d;
|
||||
global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
|
||||
int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
|
||||
int tid = get_sub_group_local_id()%(N_SIMDWIDTH/BLOCK_STRIDE); // within-super-block part, 0..15
|
||||
int ix = get_sub_group_local_id()/(N_SIMDWIDTH/BLOCK_STRIDE); // super-block selector, 0..BLOCK_STRIDE-1
|
||||
int ip = tid/8; // first or second half of (super) block (0 or 1)
|
||||
int il = tid%8; // each half has 8 parts, one per scale
|
||||
int n = 4; // 4 scales at a time (and 4 sums)
|
||||
int l0 = n*il; // offset into half-block, 0..28
|
||||
int is = 8*ip + l0/16; // 0, 1, 8, 9
|
||||
|
||||
float4 sumf = 0;
|
||||
float sumf[N_DST];
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
sumf[row] = 0.f;
|
||||
}
|
||||
|
||||
for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
|
||||
if (first_row + 0 < ne01) {
|
||||
sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0);
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0);
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0);
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0);
|
||||
global float * y = yy + ib * QK_K + 128*ip + l0;
|
||||
float4 y0 = vload4(0, y + 0);
|
||||
float4 y1 = vload4(0, y + 32);
|
||||
float4 y2 = vload4(0, y + 64);
|
||||
float4 y3 = vload4(0, y + 96);
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
if (first_row + row < ne01) {
|
||||
sumf[row] += block_q_6_K_dot_y_flat(
|
||||
blk_ql + row*nb*128, blk_qh + row*nb*64, blk_scales + row*nb*16, blk_d + row*nb,
|
||||
ib, ip, is, l0, y0, y1, y2, y3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float4 tot = (float4)(
|
||||
sub_group_reduce_add(sumf.s0),
|
||||
sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2),
|
||||
sub_group_reduce_add(sumf.s3)
|
||||
);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
float tot = sub_group_reduce_add(sumf[row]);
|
||||
if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue