opencl: add opt-in Adreno xmem F16xF32 GEMM for prefill (llama/22755)

* ggml-opencl: add Adreno xmem F16xF32 GEMM for prefill

* ggml-opencl: address Adreno xmem review comments

* ggml-opencl: align xmem gemm kernel naming

---------

Co-authored-by: Your Name <your@email.com>
This commit is contained in:
yzyyzyhhh 2026-05-13 04:10:37 +08:00 committed by Georgi Gerganov
parent e8a7cd314f
commit 1caed1d2ba
3 changed files with 457 additions and 0 deletions

View File

@ -176,6 +176,10 @@ set(GGML_OPENCL_KERNELS
flash_attn_f32
)
if (GGML_OPENCL_USE_ADRENO_KERNELS)
list(APPEND GGML_OPENCL_KERNELS gemm_xmem_f16_f32_os8)
endif ()
foreach (K ${GGML_OPENCL_KERNELS})
ggml_opencl_add_kernel(${K})
endforeach()

View File

@ -407,6 +407,8 @@ struct ggml_backend_opencl_context {
cl_bool non_uniform_workgroups;
size_t image_max_buffer_size;
size_t image2d_max_width;
size_t image2d_max_height;
cl_context context;
cl_command_queue queue;
@ -420,6 +422,11 @@ struct ggml_backend_opencl_context {
ggml_cl_buffer prealloc_src0;
ggml_cl_buffer prealloc_src1;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
ggml_cl_buffer prealloc_adreno_xmem_const;
bool adreno_xmem_gemm_enabled = false;
#endif
// prealloc buffers for MoE router table preprocess
bool toggle_reorder = false;
ggml_cl_buffer prealloc_post_router;
@ -538,6 +545,10 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_f16_f32;
cl_kernel kernel_mul_mat_f16_f32_l4;
cl_kernel kernel_mul_mat_f16_f32_tiled;
cl_kernel kernel_adreno_xmem_pack_src_f32;
cl_kernel kernel_adreno_xmem_prepack_weight_f16;
cl_kernel kernel_gemm_xmem_f16_f32_os8;
cl_kernel kernel_adreno_xmem_store_dst_f32;
cl_kernel kernel_mul_mm_f16_f32_kqv;
cl_kernel kernel_mul_mm_f16_f32_kq;
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
@ -1554,6 +1565,32 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// gemm_xmem_f16_f32_os8
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemm_xmem_f16_f32_os8.cl.h"
};
#else
const std::string kernel_src = read_file("gemm_xmem_f16_f32_os8.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_adreno_xmem_pack_src_f32 =
clCreateKernel(prog, "adreno_xmem_pack_src_f32", &err), err));
CL_CHECK((backend_ctx->kernel_adreno_xmem_prepack_weight_f16 =
clCreateKernel(prog, "adreno_xmem_prepack_weight_f16", &err), err));
CL_CHECK((backend_ctx->kernel_gemm_xmem_f16_f32_os8 =
clCreateKernel(prog, "kernel_gemm_xmem_f16_f32_os8", &err), err));
CL_CHECK((backend_ctx->kernel_adreno_xmem_store_dst_f32 =
clCreateKernel(prog, "adreno_xmem_store_dst_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
// mul_mm_f32_f32_l4_lm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@ -3473,6 +3510,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL);
GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", backend_ctx->image2d_max_width, backend_ctx->image2d_max_height);
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
@ -3511,6 +3552,16 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
backend_ctx->adreno_xmem_gemm_enabled = getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr &&
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
if (getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr) {
GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM %s\n",
backend_ctx->adreno_xmem_gemm_enabled ?
"enabled (temporary weight prepack)" : "requested but unsupported by this driver");
}
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
// determine whether to use large buffer for Adreno
backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
@ -9920,6 +9971,169 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
}
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
static bool ggml_cl_can_use_adreno_xmem_gemm_f16_f32(
const ggml_backend_opencl_context * backend_ctx,
const ggml_tensor * src0,
const ggml_tensor * src1,
const ggml_tensor * dst) {
if (!backend_ctx->adreno_xmem_gemm_enabled) {
return false;
}
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
return false;
}
if (src0->type != GGML_TYPE_F16 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
return false;
}
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
return false;
}
if (src0->ne[2] != 1 || src0->ne[3] != 1 ||
src1->ne[2] != 1 || src1->ne[3] != 1 ||
dst->ne[2] != 1 || dst->ne[3] != 1) {
return false;
}
const int K = src0->ne[0];
const int M = src0->ne[1];
const int N = src1->ne[1];
if (src1->ne[0] != K || dst->ne[0] != M || dst->ne[1] != N) {
return false;
}
if (N <= 1 || M < 64 || N < 16 || K < 64) {
return false;
}
if ((K % 8) != 0) {
return false;
}
const int kpack = K / 4;
const int npack = CEIL_DIV(M, 4);
if (static_cast<size_t>(N) > backend_ctx->image2d_max_width ||
static_cast<size_t>(kpack) > backend_ctx->image2d_max_height) {
return false;
}
if (static_cast<size_t>(N) > backend_ctx->image2d_max_width ||
static_cast<size_t>(npack) > backend_ctx->image2d_max_height) {
return false;
}
return true;
}
static void ggml_cl_mul_mat_f16_f32_adreno_xmem(
ggml_backend_t backend,
const ggml_tensor * src0,
const ggml_tensor * src1,
ggml_tensor * dst) {
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
const cl_ulong offset0 = extra0->offset + src0->view_offs;
const cl_ulong offset1 = extra1->offset + src1->view_offs;
const cl_ulong offsetd = extrad->offset + dst->view_offs;
const int K = src0->ne[0];
const int M = src0->ne[1];
const int N = src1->ne[1];
const int kpack = K / 4;
const int npack = CEIL_DIV(M, 4);
const int os = 8;
const size_t xmem_bytes = 6144;
const size_t weight_bytes = static_cast<size_t>(kpack) * static_cast<size_t>(npack) * 4u * sizeof(cl_half4);
backend_ctx->prealloc_adreno_xmem_const.allocate(backend_ctx->context, xmem_bytes);
cl_int err = CL_SUCCESS;
cl_image_format fmt = {};
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_HALF_FLOAT;
cl_image_desc desc_src = {};
desc_src.image_type = CL_MEM_OBJECT_IMAGE2D;
desc_src.image_width = static_cast<size_t>(N);
desc_src.image_height = static_cast<size_t>(kpack);
cl_mem src_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_src, nullptr, &err);
CL_CHECK(err);
cl_image_desc desc_dst = {};
desc_dst.image_type = CL_MEM_OBJECT_IMAGE2D;
desc_dst.image_width = static_cast<size_t>(N);
desc_dst.image_height = static_cast<size_t>(npack);
cl_mem dst_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_dst, nullptr, &err);
CL_CHECK(err);
cl_mem weights = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, weight_bytes, nullptr, &err);
CL_CHECK(err);
cl_kernel prepack = backend_ctx->kernel_adreno_xmem_prepack_weight_f16;
CL_CHECK(clSetKernelArg(prepack, 0, sizeof(cl_mem), &weights));
CL_CHECK(clSetKernelArg(prepack, 1, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(prepack, 2, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(prepack, 3, sizeof(int), &K));
CL_CHECK(clSetKernelArg(prepack, 4, sizeof(int), &M));
CL_CHECK(clSetKernelArg(prepack, 5, sizeof(int), &kpack));
CL_CHECK(clSetKernelArg(prepack, 6, sizeof(int), &npack));
CL_CHECK(clSetKernelArg(prepack, 7, sizeof(int), &os));
size_t lws = 256;
size_t max_wg = backend_ctx->get_kernel_workgroup_size(prepack);
if (lws > max_wg) {
lws = max_wg;
}
size_t gws = CEIL_DIV(static_cast<size_t>(kpack) * static_cast<size_t>(npack), lws) * lws;
backend_ctx->enqueue_ndrange_kernel(prepack, 1, &gws, &lws, dst);
cl_kernel pack_src = backend_ctx->kernel_adreno_xmem_pack_src_f32;
CL_CHECK(clSetKernelArg(pack_src, 0, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(pack_src, 1, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(pack_src, 2, sizeof(cl_mem), &src_img));
CL_CHECK(clSetKernelArg(pack_src, 3, sizeof(int), &K));
CL_CHECK(clSetKernelArg(pack_src, 4, sizeof(int), &N));
size_t pack_src_lws[2] = { 16, 16 };
size_t pack_src_gws[2] = {
CEIL_DIV(static_cast<size_t>(N), pack_src_lws[0])*pack_src_lws[0],
CEIL_DIV(static_cast<size_t>(kpack), pack_src_lws[1])*pack_src_lws[1]
};
backend_ctx->enqueue_ndrange_kernel(pack_src, 2, pack_src_gws, pack_src_lws, dst);
cl_kernel gemm = backend_ctx->kernel_gemm_xmem_f16_f32_os8;
CL_CHECK(clSetKernelArg(gemm, 0, sizeof(cl_mem), &weights));
CL_CHECK(clSetKernelArg(gemm, 1, sizeof(cl_mem), &backend_ctx->prealloc_adreno_xmem_const.buffer));
CL_CHECK(clSetKernelArg(gemm, 2, sizeof(cl_mem), &src_img));
CL_CHECK(clSetKernelArg(gemm, 3, sizeof(cl_mem), &dst_img));
CL_CHECK(clSetKernelArg(gemm, 4, sizeof(int), &N));
CL_CHECK(clSetKernelArg(gemm, 5, sizeof(int), &npack));
CL_CHECK(clSetKernelArg(gemm, 6, sizeof(int), &kpack));
const size_t z_values = CEIL_DIV(static_cast<size_t>(npack), static_cast<size_t>(os));
size_t gemm_lws[3] = { 64, 1, 1 };
size_t gemm_gws[3] = {
z_values*gemm_lws[0],
CEIL_DIV(static_cast<size_t>(N), gemm_lws[0]),
1
};
backend_ctx->enqueue_ndrange_kernel(gemm, 3, gemm_gws, gemm_lws, dst);
cl_kernel store_dst = backend_ctx->kernel_adreno_xmem_store_dst_f32;
CL_CHECK(clSetKernelArg(store_dst, 0, sizeof(cl_mem), &dst_img));
CL_CHECK(clSetKernelArg(store_dst, 1, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(store_dst, 2, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(store_dst, 3, sizeof(int), &M));
CL_CHECK(clSetKernelArg(store_dst, 4, sizeof(int), &N));
size_t store_lws[2] = { 16, 16 };
size_t store_gws[2] = {
CEIL_DIV(static_cast<size_t>(N), store_lws[0])*store_lws[0],
CEIL_DIV(static_cast<size_t>(npack), store_lws[1])*store_lws[1]
};
backend_ctx->enqueue_ndrange_kernel(store_dst, 2, store_gws, store_lws, dst);
CL_CHECK(clReleaseMemObject(weights));
CL_CHECK(clReleaseMemObject(dst_img));
CL_CHECK(clReleaseMemObject(src_img));
}
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_TENSOR_BINARY_OP_LOCALS;
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@ -11681,6 +11895,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
return;
}
case GGML_TYPE_F16: {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (ggml_cl_can_use_adreno_xmem_gemm_f16_f32(backend_ctx, src0, src1, dst)) {
ggml_cl_mul_mat_f16_f32_adreno_xmem(backend, src0, src1, dst);
return;
}
#endif
kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
nth0 = 128; // calculated as (BM*BN)/(TM*TN)

View File

@ -0,0 +1,233 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void adreno_xmem_pack_src_f32(
__global const void * src_void,
ulong offset,
__write_only image2d_t src_img,
int K,
int N) {
const int x = get_global_id(0);
const int y = get_global_id(1);
const int kpack = K / 4;
if (x >= N || y >= kpack) {
return;
}
__global const float * src = (__global const float *)((__global const char *)src_void + offset);
const int base = x*K + y*4;
const half4 v = (half4)((half)src[base + 0], (half)src[base + 1], (half)src[base + 2], (half)src[base + 3]);
write_imageh(src_img, (int2)(x, y), v);
}
__kernel void adreno_xmem_prepack_weight_f16(
__global half4 * dst,
__global const void * src_void,
ulong offset,
int K,
int M,
int kpack,
int npack,
int os) {
const int linear = get_global_id(0);
const int total = kpack*npack;
if (linear >= total) {
return;
}
__global const half * src = (__global const half *)((__global const char *)src_void + offset);
const int dst_ogroup = linear % os;
const int dst_o_sp_i = linear / os;
const int dst_i = dst_o_sp_i % kpack;
const int dst_o = dst_o_sp_i / kpack;
const int o_slice = dst_o*os + dst_ogroup;
const int k_base = dst_i*4;
half4 w0 = (half4)(0.0h);
half4 w1 = (half4)(0.0h);
half4 w2 = (half4)(0.0h);
half4 w3 = (half4)(0.0h);
const int o0 = o_slice*4 + 0;
const int o1 = o_slice*4 + 1;
const int o2 = o_slice*4 + 2;
const int o3 = o_slice*4 + 3;
if (k_base + 0 < K) {
if (o0 < M) w0.s0 = src[o0*K + k_base + 0];
if (o1 < M) w0.s1 = src[o1*K + k_base + 0];
if (o2 < M) w0.s2 = src[o2*K + k_base + 0];
if (o3 < M) w0.s3 = src[o3*K + k_base + 0];
}
if (k_base + 1 < K) {
if (o0 < M) w1.s0 = src[o0*K + k_base + 1];
if (o1 < M) w1.s1 = src[o1*K + k_base + 1];
if (o2 < M) w1.s2 = src[o2*K + k_base + 1];
if (o3 < M) w1.s3 = src[o3*K + k_base + 1];
}
if (k_base + 2 < K) {
if (o0 < M) w2.s0 = src[o0*K + k_base + 2];
if (o1 < M) w2.s1 = src[o1*K + k_base + 2];
if (o2 < M) w2.s2 = src[o2*K + k_base + 2];
if (o3 < M) w2.s3 = src[o3*K + k_base + 2];
}
if (k_base + 3 < K) {
if (o0 < M) w3.s0 = src[o0*K + k_base + 3];
if (o1 < M) w3.s1 = src[o1*K + k_base + 3];
if (o2 < M) w3.s2 = src[o2*K + k_base + 3];
if (o3 < M) w3.s3 = src[o3*K + k_base + 3];
}
dst[linear*4 + 0] = w0;
dst[linear*4 + 1] = w1;
dst[linear*4 + 2] = w2;
dst[linear*4 + 3] = w3;
}
__attribute__((qcom_max_concurrent_subgroups(12)))
__kernel void kernel_gemm_xmem_f16_f32_os8(
__constant half8 * weights_buffer __attribute__((sub_group_uniform)),
__constant half8 * xmem_buffer __attribute__((max_constant_size((6144)))),
__read_only image2d_t src_img,
__write_only image2d_t dst_img,
int N,
int npack,
int kpack) {
const int X = get_group_id(1)*get_local_size(0) + get_local_id(0);
const int Z = get_group_id(0)*get_local_size(2) + get_local_id(2);
if (X >= N || Z*8 >= npack) {
return;
}
half4 r0 = (half4)(0.0h);
half4 r1 = (half4)(0.0h);
half4 r2 = (half4)(0.0h);
half4 r3 = (half4)(0.0h);
half4 r4 = (half4)(0.0h);
half4 r5 = (half4)(0.0h);
half4 r6 = (half4)(0.0h);
half4 r7 = (half4)(0.0h);
int f_offset = Z*kpack*32;
int subgroup_id = (int)(0x1F & qcom_get_physical_sub_group_id());
subgroup_id = subgroup_id % 12;
const int c_offset = subgroup_id*32;
__constant half16 * weights_cache = (__constant half16 *)&xmem_buffer[c_offset];
int coord_s = 0;
do {
const half4 src0 = read_imageh(src_img, smp_zero, (int2)(X, coord_s));
coord_s++;
const half4 src1 = read_imageh(src_img, smp_zero, (int2)(X, coord_s));
coord_s++;
qcom_sub_group_constant_load8(xmem_buffer, weights_buffer, c_offset, f_offset >> 1, 32);
f_offset += 64;
qcom_sub_group_sync(QCOM_CLK_CONST_LOAD_SYNC);
r0 += src0.x * weights_cache[0].s0123;
r0 += src0.y * weights_cache[0].s4567;
r0 += src0.z * weights_cache[0].s89ab;
r0 += src0.w * weights_cache[0].scdef;
r1 += src0.x * weights_cache[1].s0123;
r1 += src0.y * weights_cache[1].s4567;
r1 += src0.z * weights_cache[1].s89ab;
r1 += src0.w * weights_cache[1].scdef;
r2 += src0.x * weights_cache[2].s0123;
r2 += src0.y * weights_cache[2].s4567;
r2 += src0.z * weights_cache[2].s89ab;
r2 += src0.w * weights_cache[2].scdef;
r3 += src0.x * weights_cache[3].s0123;
r3 += src0.y * weights_cache[3].s4567;
r3 += src0.z * weights_cache[3].s89ab;
r3 += src0.w * weights_cache[3].scdef;
r4 += src0.x * weights_cache[4].s0123;
r4 += src0.y * weights_cache[4].s4567;
r4 += src0.z * weights_cache[4].s89ab;
r4 += src0.w * weights_cache[4].scdef;
r5 += src0.x * weights_cache[5].s0123;
r5 += src0.y * weights_cache[5].s4567;
r5 += src0.z * weights_cache[5].s89ab;
r5 += src0.w * weights_cache[5].scdef;
r6 += src0.x * weights_cache[6].s0123;
r6 += src0.y * weights_cache[6].s4567;
r6 += src0.z * weights_cache[6].s89ab;
r6 += src0.w * weights_cache[6].scdef;
r7 += src0.x * weights_cache[7].s0123;
r7 += src0.y * weights_cache[7].s4567;
r7 += src0.z * weights_cache[7].s89ab;
r7 += src0.w * weights_cache[7].scdef;
r0 += src1.x * weights_cache[8].s0123;
r0 += src1.y * weights_cache[8].s4567;
r0 += src1.z * weights_cache[8].s89ab;
r0 += src1.w * weights_cache[8].scdef;
r1 += src1.x * weights_cache[9].s0123;
r1 += src1.y * weights_cache[9].s4567;
r1 += src1.z * weights_cache[9].s89ab;
r1 += src1.w * weights_cache[9].scdef;
r2 += src1.x * weights_cache[10].s0123;
r2 += src1.y * weights_cache[10].s4567;
r2 += src1.z * weights_cache[10].s89ab;
r2 += src1.w * weights_cache[10].scdef;
r3 += src1.x * weights_cache[11].s0123;
r3 += src1.y * weights_cache[11].s4567;
r3 += src1.z * weights_cache[11].s89ab;
r3 += src1.w * weights_cache[11].scdef;
r4 += src1.x * weights_cache[12].s0123;
r4 += src1.y * weights_cache[12].s4567;
r4 += src1.z * weights_cache[12].s89ab;
r4 += src1.w * weights_cache[12].scdef;
r5 += src1.x * weights_cache[13].s0123;
r5 += src1.y * weights_cache[13].s4567;
r5 += src1.z * weights_cache[13].s89ab;
r5 += src1.w * weights_cache[13].scdef;
r6 += src1.x * weights_cache[14].s0123;
r6 += src1.y * weights_cache[14].s4567;
r6 += src1.z * weights_cache[14].s89ab;
r6 += src1.w * weights_cache[14].scdef;
r7 += src1.x * weights_cache[15].s0123;
r7 += src1.y * weights_cache[15].s4567;
r7 += src1.z * weights_cache[15].s89ab;
r7 += src1.w * weights_cache[15].scdef;
} while (coord_s < kpack);
int coord_s_out = Z*8;
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r0); coord_s_out++; }
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r1); coord_s_out++; }
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r2); coord_s_out++; }
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r3); coord_s_out++; }
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r4); coord_s_out++; }
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r5); coord_s_out++; }
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r6); coord_s_out++; }
if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r7); }
}
__kernel void adreno_xmem_store_dst_f32(
__read_only image2d_t dst_img,
__global void * dst_void,
ulong offset,
int M,
int N) {
const int x = get_global_id(0);
const int y = get_global_id(1);
const int npack = (M + 3) / 4;
if (x >= N || y >= npack) {
return;
}
__global float * dst = (__global float *)((__global char *)dst_void + offset);
const half4 hv = read_imageh(dst_img, smp_zero, (int2)(x, y));
const int m = y*4;
if (m + 0 < M) dst[x*M + m + 0] = (float)hv.s0;
if (m + 1 < M) dst[x*M + m + 1] = (float)hv.s1;
if (m + 2 < M) dst[x*M + m + 2] = (float)hv.s2;
if (m + 3 < M) dst[x*M + m + 3] = (float)hv.s3;
}