diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 7edb3eb4e..0b39c0113 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -176,6 +176,10 @@ set(GGML_OPENCL_KERNELS flash_attn_f32 ) +if (GGML_OPENCL_USE_ADRENO_KERNELS) + list(APPEND GGML_OPENCL_KERNELS gemm_xmem_f16_f32_os8) +endif () + foreach (K ${GGML_OPENCL_KERNELS}) ggml_opencl_add_kernel(${K}) endforeach() diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 73a58f74a..61bdc62cd 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -407,6 +407,8 @@ struct ggml_backend_opencl_context { cl_bool non_uniform_workgroups; size_t image_max_buffer_size; + size_t image2d_max_width; + size_t image2d_max_height; cl_context context; cl_command_queue queue; @@ -420,6 +422,11 @@ struct ggml_backend_opencl_context { ggml_cl_buffer prealloc_src0; ggml_cl_buffer prealloc_src1; +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + ggml_cl_buffer prealloc_adreno_xmem_const; + bool adreno_xmem_gemm_enabled = false; +#endif + // prealloc buffers for MoE router table preprocess bool toggle_reorder = false; ggml_cl_buffer prealloc_post_router; @@ -538,6 +545,10 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mat_f16_f32; cl_kernel kernel_mul_mat_f16_f32_l4; cl_kernel kernel_mul_mat_f16_f32_tiled; + cl_kernel kernel_adreno_xmem_pack_src_f32; + cl_kernel kernel_adreno_xmem_prepack_weight_f16; + cl_kernel kernel_gemm_xmem_f16_f32_os8; + cl_kernel kernel_adreno_xmem_store_dst_f32; cl_kernel kernel_mul_mm_f16_f32_kqv; cl_kernel kernel_mul_mm_f16_f32_kq; cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v; @@ -1554,6 +1565,32 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + // gemm_xmem_f16_f32_os8 + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_xmem_f16_f32_os8.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_xmem_f16_f32_os8.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_adreno_xmem_pack_src_f32 = + clCreateKernel(prog, "adreno_xmem_pack_src_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_adreno_xmem_prepack_weight_f16 = + clCreateKernel(prog, "adreno_xmem_prepack_weight_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_gemm_xmem_f16_f32_os8 = + clCreateKernel(prog, "kernel_gemm_xmem_f16_f32_os8", &err), err)); + CL_CHECK((backend_ctx->kernel_adreno_xmem_store_dst_f32 = + clCreateKernel(prog, "adreno_xmem_store_dst_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + // mul_mm_f32_f32_l4_lm { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -3473,6 +3510,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL); GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size); + clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL); + GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", backend_ctx->image2d_max_width, backend_ctx->image2d_max_height); + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL); GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size); @@ -3511,6 +3552,16 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); #endif // GGML_OPENCL_USE_ADRENO_KERNELS +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + backend_ctx->adreno_xmem_gemm_enabled = getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr && + backend_ctx->gpu_family == GPU_FAMILY::ADRENO; + if (getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr) { + GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM %s\n", + backend_ctx->adreno_xmem_gemm_enabled ? + "enabled (temporary weight prepack)" : "requested but unsupported by this driver"); + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + // determine whether to use large buffer for Adreno backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr && backend_ctx->gpu_family == GPU_FAMILY::ADRENO; @@ -9920,6 +9971,169 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst); } +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS +static bool ggml_cl_can_use_adreno_xmem_gemm_f16_f32( + const ggml_backend_opencl_context * backend_ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + const ggml_tensor * dst) { + if (!backend_ctx->adreno_xmem_gemm_enabled) { + return false; + } + if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) { + return false; + } + if (src0->type != GGML_TYPE_F16 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { + return false; + } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { + return false; + } + if (src0->ne[2] != 1 || src0->ne[3] != 1 || + src1->ne[2] != 1 || src1->ne[3] != 1 || + dst->ne[2] != 1 || dst->ne[3] != 1) { + return false; + } + const int K = src0->ne[0]; + const int M = src0->ne[1]; + const int N = src1->ne[1]; + if (src1->ne[0] != K || dst->ne[0] != M || dst->ne[1] != N) { + return false; + } + if (N <= 1 || M < 64 || N < 16 || K < 64) { + return false; + } + if ((K % 8) != 0) { + return false; + } + const int kpack = K / 4; + const int npack = CEIL_DIV(M, 4); + if (static_cast(N) > backend_ctx->image2d_max_width || + static_cast(kpack) > backend_ctx->image2d_max_height) { + return false; + } + if (static_cast(N) > backend_ctx->image2d_max_width || + static_cast(npack) > backend_ctx->image2d_max_height) { + return false; + } + return true; +} + +static void ggml_cl_mul_mat_f16_f32_adreno_xmem( + ggml_backend_t backend, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + const cl_ulong offset0 = extra0->offset + src0->view_offs; + const cl_ulong offset1 = extra1->offset + src1->view_offs; + const cl_ulong offsetd = extrad->offset + dst->view_offs; + + const int K = src0->ne[0]; + const int M = src0->ne[1]; + const int N = src1->ne[1]; + const int kpack = K / 4; + const int npack = CEIL_DIV(M, 4); + const int os = 8; + + const size_t xmem_bytes = 6144; + const size_t weight_bytes = static_cast(kpack) * static_cast(npack) * 4u * sizeof(cl_half4); + + backend_ctx->prealloc_adreno_xmem_const.allocate(backend_ctx->context, xmem_bytes); + + cl_int err = CL_SUCCESS; + cl_image_format fmt = {}; + fmt.image_channel_order = CL_RGBA; + fmt.image_channel_data_type = CL_HALF_FLOAT; + + cl_image_desc desc_src = {}; + desc_src.image_type = CL_MEM_OBJECT_IMAGE2D; + desc_src.image_width = static_cast(N); + desc_src.image_height = static_cast(kpack); + cl_mem src_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_src, nullptr, &err); + CL_CHECK(err); + + cl_image_desc desc_dst = {}; + desc_dst.image_type = CL_MEM_OBJECT_IMAGE2D; + desc_dst.image_width = static_cast(N); + desc_dst.image_height = static_cast(npack); + cl_mem dst_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_dst, nullptr, &err); + CL_CHECK(err); + + cl_mem weights = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, weight_bytes, nullptr, &err); + CL_CHECK(err); + + cl_kernel prepack = backend_ctx->kernel_adreno_xmem_prepack_weight_f16; + CL_CHECK(clSetKernelArg(prepack, 0, sizeof(cl_mem), &weights)); + CL_CHECK(clSetKernelArg(prepack, 1, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(prepack, 2, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(prepack, 3, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(prepack, 4, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(prepack, 5, sizeof(int), &kpack)); + CL_CHECK(clSetKernelArg(prepack, 6, sizeof(int), &npack)); + CL_CHECK(clSetKernelArg(prepack, 7, sizeof(int), &os)); + size_t lws = 256; + size_t max_wg = backend_ctx->get_kernel_workgroup_size(prepack); + if (lws > max_wg) { + lws = max_wg; + } + size_t gws = CEIL_DIV(static_cast(kpack) * static_cast(npack), lws) * lws; + backend_ctx->enqueue_ndrange_kernel(prepack, 1, &gws, &lws, dst); + + cl_kernel pack_src = backend_ctx->kernel_adreno_xmem_pack_src_f32; + CL_CHECK(clSetKernelArg(pack_src, 0, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(pack_src, 1, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(pack_src, 2, sizeof(cl_mem), &src_img)); + CL_CHECK(clSetKernelArg(pack_src, 3, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(pack_src, 4, sizeof(int), &N)); + size_t pack_src_lws[2] = { 16, 16 }; + size_t pack_src_gws[2] = { + CEIL_DIV(static_cast(N), pack_src_lws[0])*pack_src_lws[0], + CEIL_DIV(static_cast(kpack), pack_src_lws[1])*pack_src_lws[1] + }; + backend_ctx->enqueue_ndrange_kernel(pack_src, 2, pack_src_gws, pack_src_lws, dst); + + cl_kernel gemm = backend_ctx->kernel_gemm_xmem_f16_f32_os8; + CL_CHECK(clSetKernelArg(gemm, 0, sizeof(cl_mem), &weights)); + CL_CHECK(clSetKernelArg(gemm, 1, sizeof(cl_mem), &backend_ctx->prealloc_adreno_xmem_const.buffer)); + CL_CHECK(clSetKernelArg(gemm, 2, sizeof(cl_mem), &src_img)); + CL_CHECK(clSetKernelArg(gemm, 3, sizeof(cl_mem), &dst_img)); + CL_CHECK(clSetKernelArg(gemm, 4, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(gemm, 5, sizeof(int), &npack)); + CL_CHECK(clSetKernelArg(gemm, 6, sizeof(int), &kpack)); + const size_t z_values = CEIL_DIV(static_cast(npack), static_cast(os)); + size_t gemm_lws[3] = { 64, 1, 1 }; + size_t gemm_gws[3] = { + z_values*gemm_lws[0], + CEIL_DIV(static_cast(N), gemm_lws[0]), + 1 + }; + backend_ctx->enqueue_ndrange_kernel(gemm, 3, gemm_gws, gemm_lws, dst); + + cl_kernel store_dst = backend_ctx->kernel_adreno_xmem_store_dst_f32; + CL_CHECK(clSetKernelArg(store_dst, 0, sizeof(cl_mem), &dst_img)); + CL_CHECK(clSetKernelArg(store_dst, 1, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(store_dst, 2, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(store_dst, 3, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(store_dst, 4, sizeof(int), &N)); + size_t store_lws[2] = { 16, 16 }; + size_t store_gws[2] = { + CEIL_DIV(static_cast(N), store_lws[0])*store_lws[0], + CEIL_DIV(static_cast(npack), store_lws[1])*store_lws[1] + }; + backend_ctx->enqueue_ndrange_kernel(store_dst, 2, store_gws, store_lws, dst); + + CL_CHECK(clReleaseMemObject(weights)); + CL_CHECK(clReleaseMemObject(dst_img)); + CL_CHECK(clReleaseMemObject(src_img)); +} +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -11681,6 +11895,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co return; } case GGML_TYPE_F16: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (ggml_cl_can_use_adreno_xmem_gemm_f16_f32(backend_ctx, src0, src1, dst)) { + ggml_cl_mul_mat_f16_f32_adreno_xmem(backend, src0, src1, dst); + return; + } +#endif kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm; nth0 = 128; // calculated as (BM*BN)/(TM*TN) diff --git a/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl b/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl new file mode 100644 index 000000000..df9d9aed0 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl @@ -0,0 +1,233 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load : enable + +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +__kernel void adreno_xmem_pack_src_f32( + __global const void * src_void, + ulong offset, + __write_only image2d_t src_img, + int K, + int N) { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int kpack = K / 4; + + if (x >= N || y >= kpack) { + return; + } + + __global const float * src = (__global const float *)((__global const char *)src_void + offset); + const int base = x*K + y*4; + const half4 v = (half4)((half)src[base + 0], (half)src[base + 1], (half)src[base + 2], (half)src[base + 3]); + write_imageh(src_img, (int2)(x, y), v); +} + +__kernel void adreno_xmem_prepack_weight_f16( + __global half4 * dst, + __global const void * src_void, + ulong offset, + int K, + int M, + int kpack, + int npack, + int os) { + const int linear = get_global_id(0); + const int total = kpack*npack; + if (linear >= total) { + return; + } + + __global const half * src = (__global const half *)((__global const char *)src_void + offset); + + const int dst_ogroup = linear % os; + const int dst_o_sp_i = linear / os; + const int dst_i = dst_o_sp_i % kpack; + const int dst_o = dst_o_sp_i / kpack; + const int o_slice = dst_o*os + dst_ogroup; + const int k_base = dst_i*4; + + half4 w0 = (half4)(0.0h); + half4 w1 = (half4)(0.0h); + half4 w2 = (half4)(0.0h); + half4 w3 = (half4)(0.0h); + + const int o0 = o_slice*4 + 0; + const int o1 = o_slice*4 + 1; + const int o2 = o_slice*4 + 2; + const int o3 = o_slice*4 + 3; + + if (k_base + 0 < K) { + if (o0 < M) w0.s0 = src[o0*K + k_base + 0]; + if (o1 < M) w0.s1 = src[o1*K + k_base + 0]; + if (o2 < M) w0.s2 = src[o2*K + k_base + 0]; + if (o3 < M) w0.s3 = src[o3*K + k_base + 0]; + } + if (k_base + 1 < K) { + if (o0 < M) w1.s0 = src[o0*K + k_base + 1]; + if (o1 < M) w1.s1 = src[o1*K + k_base + 1]; + if (o2 < M) w1.s2 = src[o2*K + k_base + 1]; + if (o3 < M) w1.s3 = src[o3*K + k_base + 1]; + } + if (k_base + 2 < K) { + if (o0 < M) w2.s0 = src[o0*K + k_base + 2]; + if (o1 < M) w2.s1 = src[o1*K + k_base + 2]; + if (o2 < M) w2.s2 = src[o2*K + k_base + 2]; + if (o3 < M) w2.s3 = src[o3*K + k_base + 2]; + } + if (k_base + 3 < K) { + if (o0 < M) w3.s0 = src[o0*K + k_base + 3]; + if (o1 < M) w3.s1 = src[o1*K + k_base + 3]; + if (o2 < M) w3.s2 = src[o2*K + k_base + 3]; + if (o3 < M) w3.s3 = src[o3*K + k_base + 3]; + } + + dst[linear*4 + 0] = w0; + dst[linear*4 + 1] = w1; + dst[linear*4 + 2] = w2; + dst[linear*4 + 3] = w3; +} + +__attribute__((qcom_max_concurrent_subgroups(12))) +__kernel void kernel_gemm_xmem_f16_f32_os8( + __constant half8 * weights_buffer __attribute__((sub_group_uniform)), + __constant half8 * xmem_buffer __attribute__((max_constant_size((6144)))), + __read_only image2d_t src_img, + __write_only image2d_t dst_img, + int N, + int npack, + int kpack) { + const int X = get_group_id(1)*get_local_size(0) + get_local_id(0); + const int Z = get_group_id(0)*get_local_size(2) + get_local_id(2); + + if (X >= N || Z*8 >= npack) { + return; + } + + half4 r0 = (half4)(0.0h); + half4 r1 = (half4)(0.0h); + half4 r2 = (half4)(0.0h); + half4 r3 = (half4)(0.0h); + half4 r4 = (half4)(0.0h); + half4 r5 = (half4)(0.0h); + half4 r6 = (half4)(0.0h); + half4 r7 = (half4)(0.0h); + + int f_offset = Z*kpack*32; + int subgroup_id = (int)(0x1F & qcom_get_physical_sub_group_id()); + subgroup_id = subgroup_id % 12; + const int c_offset = subgroup_id*32; + __constant half16 * weights_cache = (__constant half16 *)&xmem_buffer[c_offset]; + + int coord_s = 0; + do { + const half4 src0 = read_imageh(src_img, smp_zero, (int2)(X, coord_s)); + coord_s++; + const half4 src1 = read_imageh(src_img, smp_zero, (int2)(X, coord_s)); + coord_s++; + + qcom_sub_group_constant_load8(xmem_buffer, weights_buffer, c_offset, f_offset >> 1, 32); + f_offset += 64; + qcom_sub_group_sync(QCOM_CLK_CONST_LOAD_SYNC); + + r0 += src0.x * weights_cache[0].s0123; + r0 += src0.y * weights_cache[0].s4567; + r0 += src0.z * weights_cache[0].s89ab; + r0 += src0.w * weights_cache[0].scdef; + r1 += src0.x * weights_cache[1].s0123; + r1 += src0.y * weights_cache[1].s4567; + r1 += src0.z * weights_cache[1].s89ab; + r1 += src0.w * weights_cache[1].scdef; + r2 += src0.x * weights_cache[2].s0123; + r2 += src0.y * weights_cache[2].s4567; + r2 += src0.z * weights_cache[2].s89ab; + r2 += src0.w * weights_cache[2].scdef; + r3 += src0.x * weights_cache[3].s0123; + r3 += src0.y * weights_cache[3].s4567; + r3 += src0.z * weights_cache[3].s89ab; + r3 += src0.w * weights_cache[3].scdef; + r4 += src0.x * weights_cache[4].s0123; + r4 += src0.y * weights_cache[4].s4567; + r4 += src0.z * weights_cache[4].s89ab; + r4 += src0.w * weights_cache[4].scdef; + r5 += src0.x * weights_cache[5].s0123; + r5 += src0.y * weights_cache[5].s4567; + r5 += src0.z * weights_cache[5].s89ab; + r5 += src0.w * weights_cache[5].scdef; + r6 += src0.x * weights_cache[6].s0123; + r6 += src0.y * weights_cache[6].s4567; + r6 += src0.z * weights_cache[6].s89ab; + r6 += src0.w * weights_cache[6].scdef; + r7 += src0.x * weights_cache[7].s0123; + r7 += src0.y * weights_cache[7].s4567; + r7 += src0.z * weights_cache[7].s89ab; + r7 += src0.w * weights_cache[7].scdef; + + r0 += src1.x * weights_cache[8].s0123; + r0 += src1.y * weights_cache[8].s4567; + r0 += src1.z * weights_cache[8].s89ab; + r0 += src1.w * weights_cache[8].scdef; + r1 += src1.x * weights_cache[9].s0123; + r1 += src1.y * weights_cache[9].s4567; + r1 += src1.z * weights_cache[9].s89ab; + r1 += src1.w * weights_cache[9].scdef; + r2 += src1.x * weights_cache[10].s0123; + r2 += src1.y * weights_cache[10].s4567; + r2 += src1.z * weights_cache[10].s89ab; + r2 += src1.w * weights_cache[10].scdef; + r3 += src1.x * weights_cache[11].s0123; + r3 += src1.y * weights_cache[11].s4567; + r3 += src1.z * weights_cache[11].s89ab; + r3 += src1.w * weights_cache[11].scdef; + r4 += src1.x * weights_cache[12].s0123; + r4 += src1.y * weights_cache[12].s4567; + r4 += src1.z * weights_cache[12].s89ab; + r4 += src1.w * weights_cache[12].scdef; + r5 += src1.x * weights_cache[13].s0123; + r5 += src1.y * weights_cache[13].s4567; + r5 += src1.z * weights_cache[13].s89ab; + r5 += src1.w * weights_cache[13].scdef; + r6 += src1.x * weights_cache[14].s0123; + r6 += src1.y * weights_cache[14].s4567; + r6 += src1.z * weights_cache[14].s89ab; + r6 += src1.w * weights_cache[14].scdef; + r7 += src1.x * weights_cache[15].s0123; + r7 += src1.y * weights_cache[15].s4567; + r7 += src1.z * weights_cache[15].s89ab; + r7 += src1.w * weights_cache[15].scdef; + } while (coord_s < kpack); + + int coord_s_out = Z*8; + if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r0); coord_s_out++; } + if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r1); coord_s_out++; } + if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r2); coord_s_out++; } + if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r3); coord_s_out++; } + if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r4); coord_s_out++; } + if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r5); coord_s_out++; } + if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r6); coord_s_out++; } + if (coord_s_out < npack) { write_imageh(dst_img, (int2)(X, coord_s_out), r7); } +} + +__kernel void adreno_xmem_store_dst_f32( + __read_only image2d_t dst_img, + __global void * dst_void, + ulong offset, + int M, + int N) { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int npack = (M + 3) / 4; + + if (x >= N || y >= npack) { + return; + } + + __global float * dst = (__global float *)((__global char *)dst_void + offset); + const half4 hv = read_imageh(dst_img, smp_zero, (int2)(x, y)); + const int m = y*4; + if (m + 0 < M) dst[x*M + m + 0] = (float)hv.s0; + if (m + 1 < M) dst[x*M + m + 1] = (float)hv.s1; + if (m + 2 < M) dst[x*M + m + 2] = (float)hv.s2; + if (m + 3 < M) dst[x*M + m + 3] = (float)hv.s3; +}