From 1216e0957bc060bff1012b91acab590d7d2b37d9 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 13 Jun 2026 08:44:15 -0500 Subject: [PATCH] vulkan: support non-contig unary/glu ops (llama/24215) * vulkan: support non-contig unary/glu ops Change unary/glu ops to pass in all strides and use fastdiv for the index calculation. Put all unary ops in one file, similar to glu, to share the code. codex went ahead and added expm1 without me asking, but I had to make it do a real precision analysis rather than just making stuff up. unary.comp initially couldn't use generic_unary_head because there wasn't space for xielu's additional constants. Fixing this required packing the fastdiv 'L' values. * attempt to work around compiler bug * resolve conflict from #23991 * use expm1 --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 163 ++++++++++++------ ggml/src/ggml-vulkan/vulkan-shaders/abs.comp | 21 --- ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp | 22 --- ggml/src/ggml-vulkan/vulkan-shaders/diag.comp | 6 +- ggml/src/ggml-vulkan/vulkan-shaders/elu.comp | 27 --- ggml/src/ggml-vulkan/vulkan-shaders/exp.comp | 20 --- .../src/ggml-vulkan/vulkan-shaders/floor.comp | 22 --- ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp | 25 --- .../ggml-vulkan/vulkan-shaders/gelu_erf.comp | 39 ----- .../vulkan-shaders/gelu_quick.comp | 23 --- .../vulkan-shaders/generic_unary_head.glsl | 40 +++-- .../ggml-vulkan/vulkan-shaders/glu_head.glsl | 27 ++- .../ggml-vulkan/vulkan-shaders/glu_main.glsl | 32 ++-- .../vulkan-shaders/hardsigmoid.comp | 22 --- .../ggml-vulkan/vulkan-shaders/hardswish.comp | 22 --- ggml/src/ggml-vulkan/vulkan-shaders/neg.comp | 20 --- ggml/src/ggml-vulkan/vulkan-shaders/relu.comp | 21 --- .../vulkan-shaders/repeat_back.comp | 6 +- ggml/src/ggml-vulkan/vulkan-shaders/roll.comp | 6 +- .../src/ggml-vulkan/vulkan-shaders/round.comp | 29 ---- ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp | 21 --- .../ggml-vulkan/vulkan-shaders/sigmoid.comp | 20 --- ggml/src/ggml-vulkan/vulkan-shaders/silu.comp 
| 22 --- .../ggml-vulkan/vulkan-shaders/softplus.comp | 23 --- ggml/src/ggml-vulkan/vulkan-shaders/step.comp | 22 --- ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp | 20 --- ggml/src/ggml-vulkan/vulkan-shaders/tri.comp | 6 +- .../src/ggml-vulkan/vulkan-shaders/trunc.comp | 22 --- .../src/ggml-vulkan/vulkan-shaders/unary.comp | 144 ++++++++++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 86 ++++----- .../src/ggml-vulkan/vulkan-shaders/xielu.comp | 35 ---- 31 files changed, 365 insertions(+), 649 deletions(-) delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/abs.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/elu.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/exp.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/floor.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/neg.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/relu.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/round.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/silu.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/step.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/unary.comp delete mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp diff --git 
a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index b0ad81123..5ab19a7d2 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -833,6 +833,7 @@ struct vk_device_struct { // [src/dst 0=fp32,1=fp16] vk_pipeline pipeline_exp[2]; + vk_pipeline pipeline_expm1[2]; vk_pipeline pipeline_elu[2]; vk_pipeline pipeline_gelu[2]; vk_pipeline pipeline_gelu_erf[2]; @@ -1202,30 +1203,35 @@ struct vk_op_glu_push_constants { uint32_t mode; // 0: default, 1: swapped, 2: split float alpha; // for swiglu_oai float limit; + uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; - uint32_t ne01; - uint32_t ne02; + uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; - uint32_t ne11; - uint32_t ne12; + uint32_t nb20; + uint32_t nb21; + uint32_t nb22; + uint32_t nb23; + uint32_t ne21; + uint32_t ne22; + uint32_t misalign_offsets; + uint32_t ne2_012mp; uint32_t ne2_012L; + uint32_t ne2_01mp; uint32_t ne2_01L; + uint32_t ne2_0mp; uint32_t ne2_0L; }; +static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128"); struct vk_op_unary_push_constants { uint32_t ne; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; - float param1; float param2; - uint32_t ne0_012mp; uint32_t ne0_012L; - uint32_t ne0_01mp; uint32_t ne0_01L; - uint32_t ne0_0mp; uint32_t ne0_0L; - uint32_t ne1_012mp; uint32_t ne1_012L; - uint32_t ne1_01mp; uint32_t ne1_01L; - uint32_t ne1_0mp; uint32_t ne1_0L; + float param1; float param2; float param3; float param4; + uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls; + uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls; }; static_assert(sizeof(vk_op_unary_push_constants) <= 128, 
"sizeof(vk_op_unary_push_constants) must be <= 128"); @@ -1330,6 +1336,10 @@ static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); } +static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) { + return L0 | (L1 << 8) | (L2 << 16); +} + template void init_pushconst_fastdiv(T &p) { GGML_UNUSED(p); static_assert(!std::is_const::value, "unexpected type"); @@ -1337,12 +1347,29 @@ template void init_pushconst_fastdiv(T &p) { template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { // Compute magic values to divide by these six numbers. - init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L); - init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L); - init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L); - init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L); - init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L); - init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L); + uint32_t ne0_012L; + uint32_t ne0_01L; + uint32_t ne0_0L; + uint32_t ne1_012L; + uint32_t ne1_01L; + uint32_t ne1_0L; + + init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, ne0_012L); + init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, ne0_01L); + init_fastdiv_values(p.ne00, p.ne0_0mp, ne0_0L); + init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, ne1_012L); + init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, ne1_01L); + init_fastdiv_values(p.ne10, p.ne1_0mp, ne1_0L); + + p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L); + p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L); +} + +template <> void init_pushconst_fastdiv(vk_op_glu_push_constants &p) { + // GLU linearizes over dst, then uses dst coordinates for src0/src1. 
+ init_fastdiv_values(p.ne22*p.ne21*p.ne20, p.ne2_012mp, p.ne2_012L); + init_fastdiv_values(p.ne21*p.ne20, p.ne2_01mp, p.ne2_01L); + init_fastdiv_values(p.ne20, p.ne2_0mp, p.ne2_0L); } struct vk_op_binary_push_constants { @@ -5006,8 +5033,8 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); #define CREATE_UNARY(name) \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); CREATE_UNARY(elu) CREATE_UNARY(gelu) @@ -5030,6 +5057,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_UNARY(trunc) CREATE_UNARY(sgn) CREATE_UNARY(exp) + CREATE_UNARY(expm1) #undef CREATE_UNARY ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); @@ -8192,7 +8220,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) { 
VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); - const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); std::array elements; @@ -8205,14 +8232,11 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& elements = { ne, 1, 1 }; } - vk_op_unary_push_constants pc = { - (uint32_t)ne, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; + vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); + pc.nb10 = 1; + pc.nb11 = (uint32_t)tensor->ne[0]; + pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]); + pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]); init_pushconst_fastdiv(pc); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); @@ -8226,7 +8250,6 @@ static void ggml_vk_cpy_to_strided( uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13) { VK_LOG_DEBUG("ggml_vk_cpy_to_strided((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << 
tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "dst_nb=(" << nb10 << ", " << nb11 << ", " << nb12 << ", " << nb13 << "), buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); - const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); std::array elements; @@ -8239,14 +8262,11 @@ static void ggml_vk_cpy_to_strided( elements = { ne, 1, 1 }; } - vk_op_unary_push_constants pc = { - (uint32_t)ne, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], nb10, nb11, nb12, nb13, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; + vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); + pc.nb10 = nb10; + pc.nb11 = nb11; + pc.nb12 = nb12; + pc.nb13 = nb13; init_pushconst_fastdiv(pc); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); @@ -10451,6 +10471,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_EXP: return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_EXPM1: + return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_ELU: return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_SILU: @@ -10849,6 +10871,21 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk GGML_UNUSED(src3); } +template <> void 
init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset; + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_ASSERT(a_offset < (1u << 8)); + GGML_ASSERT(b_offset < (1u << 8)); + GGML_ASSERT(d_offset < (1u << 8)); + + p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset; + + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); @@ -12198,17 +12235,17 @@ static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst)); } static void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, - { - (uint32_t)ggml_nelements(src0), 0, - op_params[1], op_params[2], op_params[3], op_params[4] - } 
- ); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = op_params[1]; + p.param2 = op_params[2]; + p.param3 = op_params[3]; + p.param4 = op_params[4]; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p)); } static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -12228,6 +12265,9 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const } const uint32_t mode = split ? 2 : (swapped ? 1 : 0); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size; + const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU, { @@ -12237,16 +12277,22 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const mode, alpha, limit, - (uint32_t)(src0->nb[1] / src0->nb[0]), - (uint32_t)(src0->nb[2] / src0->nb[0]), - (uint32_t)(src0->nb[3] / src0->nb[0]), - (uint32_t)src0->ne[1], - (uint32_t)src0->ne[2], - (uint32_t)(dst->nb[1] / dst->nb[0]), - (uint32_t)(dst->nb[2] / dst->nb[0]), - (uint32_t)(dst->nb[3] / dst->nb[0]), + (uint32_t)(src0->nb[0] / src0_type_size), + (uint32_t)(src0->nb[1] / src0_type_size), + (uint32_t)(src0->nb[2] / src0_type_size), + (uint32_t)(src0->nb[3] / src0_type_size), + (uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size), + (uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size), + (uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size), + (uint32_t)((split ? 
src1->nb[3] : src0->nb[3]) / src1_type_size), + (uint32_t)(dst->nb[0] / dst_type_size), + (uint32_t)(dst->nb[1] / dst_type_size), + (uint32_t)(dst->nb[2] / dst_type_size), + (uint32_t)(dst->nb[3] / dst_type_size), (uint32_t)dst->ne[1], - (uint32_t)dst->ne[2] + (uint32_t)dst->ne[2], + 0, + 0, 0, 0, 0, 0, 0, }); } @@ -14249,6 +14295,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -16638,6 +16685,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -16658,8 +16706,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_UNARY_OP_FLOOR: case GGML_UNARY_OP_TRUNC: case GGML_UNARY_OP_SGN: - return ggml_is_contiguous(op->src[0]) && - (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (op->src[0]->type == op->type); default: @@ -16675,7 +16722,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_GLU_OP_GEGLU_QUICK: return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && - (op->src[0]->type == op->type); + (op->src[0]->type == op->type) && + (!op->src[1] || op->src[1]->type == op->src[0]->type); default: return false; } @@ -17805,6 +17853,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_UNARY_OP_EXP: tensor_clone = ggml_exp(ggml_ctx, src_clone[0]); break; 
+ case GGML_UNARY_OP_EXPM1: + tensor_clone = ggml_expm1(ggml_ctx, src_clone[0]); + break; case GGML_UNARY_OP_ELU: tensor_clone = ggml_elu(ggml_ctx, src_clone[0]); break; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp deleted file mode 100644 index 07bd1c18d..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(abs(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp b/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp deleted file mode 100644 index 0028d3721..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(ceil(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp index 79761324f..249e6b16e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp @@ -12,11 +12,11 @@ void 
main() { return; } - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; if (i10 == i11) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp deleted file mode 100644 index 84dcbd8c8..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +++ /dev/null @@ -1,27 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - float x = float(data_a[i]); - - if (x < 0.0f) { - x = exp(x) - 1; - } - - data_d[i] = D_TYPE(x); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp b/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp deleted file mode 100644 index c7cf5ec68..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer 
D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(exp(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp b/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp deleted file mode 100644 index 20017eb18..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(floor(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp deleted file mode 100644 index a95c2525c..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +++ /dev/null @@ -1,25 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const float GELU_COEF_A = 0.044715f; - const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float xi = float(data_a[i]); - const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi); - data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f 
/ (exp(2 * val) + 1))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp deleted file mode 100644 index 58375aba0..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +++ /dev/null @@ -1,39 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation - // ref: https://www.johndcook.com/blog/python_erf/ - const float p_erf = 0.3275911f; - const float a1_erf = 0.254829592f; - const float a2_erf = -0.284496736f; - const float a3_erf = 1.421413741f; - const float a4_erf = -1.453152027f; - const float a5_erf = 1.061405429f; - - const float SQRT_2_INV = 0.70710678118654752440084436210484f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float a = float(data_a[i]); - const float a_div_sqr2 = a * SQRT_2_INV; - const float sign_x = sign(a_div_sqr2); - const float x = abs(a_div_sqr2); - const float t = 1.0f / (1.0f + p_erf * x); - const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); - const float erf_approx = sign_x * y; - - data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp deleted file mode 100644 index bfdfe2182..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - 
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const float GELU_QUICK_COEF = -1.702f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl index cc181fda8..9d4176f3f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl @@ -7,14 +7,12 @@ layout (push_constant) uniform parameter uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint misalign_offsets; - float param1; float param2; + float param1; float param2; float param3; float param4; - uint ne0_012mp; uint ne0_012L; - uint ne0_01mp; uint ne0_01L; - uint ne0_0mp; uint ne0_0L; - uint ne1_012mp; uint ne1_012L; - uint ne1_01mp; uint ne1_01L; - uint ne1_0mp; uint ne1_0L; + // The three L values are packed as bytes to keep this layout under the 128B + // push constant limit while still leaving room for four float parameters. 
+ uint ne0_012mp; uint ne0_01mp; uint ne0_0mp; uint ne0_Ls; + uint ne1_012mp; uint ne1_01mp; uint ne1_0mp; uint ne1_Ls; } p; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; @@ -42,42 +40,46 @@ uint fastdiv(uint n, uint mp, uint L) { return (msbs + n) >> L; } +uint fastdiv_L(uint packed, uint slot) { + return (packed >> (slot * 8)) & 0x3Fu; +} + uint src0_idx(uint idx) { - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00; } uint dst_idx(uint idx) { - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; } uint src0_idx_quant(uint idx, uint qk) { - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx 
- i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00; } uint dst_idx_quant(uint idx, uint qk) { - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl index d8fdd8f7b..c3cae736f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl @@ -15,14 +15,33 @@ layout (push_constant) uniform parameter uint mode; float alpha; float limit; + uint nb00; uint nb01; uint nb02; uint nb03; - uint ne01; - uint ne02; + uint nb10; uint nb11; uint nb12; uint nb13; - uint ne11; - uint ne12; + uint nb20; + uint nb21; + uint nb22; + uint nb23; + uint ne21; + uint ne22; + uint misalign_offsets; + uint ne2_012mp; uint ne2_012L; + uint ne2_01mp; uint ne2_01L; + uint ne2_0mp; uint ne2_0L; } p; + +uint get_aoffset() { return p.misalign_offsets >> 16; } +uint get_boffset() { return (p.misalign_offsets >> 8) & 
0xFF; } +uint get_doffset() { return p.misalign_offsets & 0xFF; } + +// see init_fastdiv_values in ggml-vulkan.cpp +uint fastdiv(uint n, uint mp, uint L) { + uint msbs, lsbs; + umulExtended(n, mp, msbs, lsbs); + return (msbs + n) >> L; +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl index 359461306..14c5e7a54 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl @@ -5,35 +5,31 @@ void main() { return; } - const uint row = i / p.ne20; - const uint col = i - row * p.ne20; + const uint i23 = fastdiv(i, p.ne2_012mp, p.ne2_012L); + const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20; + const uint i22 = fastdiv(i - i23_offset, p.ne2_01mp, p.ne2_01L); + const uint i22_offset = i22*p.ne21*p.ne20; + const uint i21 = fastdiv(i - i23_offset - i22_offset, p.ne2_0mp, p.ne2_0L); + const uint i20 = i - i23_offset - i22_offset - i21*p.ne20; - const uint i3 = row / (p.ne01 * p.ne02); - const uint i2 = (row % (p.ne01 * p.ne02)) / p.ne01; - const uint i1 = row % p.ne01; - const uint src_idx = i3 * p.nb03 + i2 * p.nb02 + i1 * p.nb01 + col; - - const uint dst_i3 = row / (p.ne11 * p.ne12); - const uint dst_i2 = (row % (p.ne11 * p.ne12)) / p.ne11; - const uint dst_i1 = row % p.ne11; - const uint dst_idx = dst_i3 * p.nb13 + dst_i2 * p.nb12 + dst_i1 * p.nb11 + col; + const uint src_idx_a = get_aoffset() + i23 * p.nb03 + i22 * p.nb02 + i21 * p.nb01 + i20 * p.nb00; + const uint src_idx_b = get_boffset() + i23 * p.nb13 + i22 * p.nb12 + i21 * p.nb11 + i20 * p.nb10; + const uint dst_idx = get_doffset() + i23 * p.nb23 + i22 * p.nb22 + i21 * p.nb21 + i20 * p.nb20; if (p.mode == 0) { // Default - const uint offset = p.ne00 / 2; - const uint idx = src_idx; + const uint offset = (p.ne00 / 2) * p.nb00; + const uint idx = src_idx_a; data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset]))); } else if (p.mode == 1) { // Swapped - const uint offset = 
p.ne00 / 2; - const uint idx = src_idx; + const uint offset = (p.ne00 / 2) * p.nb00; + const uint idx = src_idx_a; data_d[dst_idx] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx]))); } else { // Split - const uint idx = src_idx; - - data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx]))); + data_d[dst_idx] = D_TYPE(op(float(data_a[src_idx_a]), float(data_b[src_idx_b]))); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp b/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp deleted file mode 100644 index b4dbdf314..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(min(1.0f, max(0.0f, (x + 3.0f) / 6.0f))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp b/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp deleted file mode 100644 index 1ec315915..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = 
float(data_a[i]); - data_d[i] = D_TYPE(x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp b/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp deleted file mode 100644 index 7f9b1bce9..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(-float(data_a[i])); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp deleted file mode 100644 index 5725cef23..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(max(float(data_a[i]), 0)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp b/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp index 87df78294..10f334d42 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp @@ -13,11 +13,11 @@ void main() { } // Destination multi-index (inlined dst_idx) - const uint i13 = 
fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; const uint d_idx = i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp index 68fbd0c7b..dae811ad9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp @@ -20,11 +20,11 @@ void main() { return; } - const uint i3 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i3 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; - const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, p.ne1_01L); + const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i2_offset = i2*p.ne11*p.ne10; - const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, p.ne1_0L); + const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; const uint p1 = floatBitsToUint(p.param1); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/round.comp b/ggml/src/ggml-vulkan/vulkan-shaders/round.comp deleted file mode 100644 index e6155dcbf..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +++ /dev/null @@ -1,29 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, 
local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - float result; - // Round halfway cases away from zero as roundf does. - if (x >= 0.0) { - result = floor(x + 0.5); - } else { - result = ceil(x - 0.5); - } - data_d[i] = D_TYPE(result); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp deleted file mode 100644 index a9c147bf9..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(sign(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp deleted file mode 100644 index 32298d43c..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - 
if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(1. / (1 + exp(-1. * float(data_a[i])))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp deleted file mode 100644 index 7d1cc6f45..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float xi = float(data_a[i]); - data_d[i] = D_TYPE(xi / (1.0f + exp(-xi))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp b/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp deleted file mode 100644 index 323e3cdea..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - const float result = (x > 20.0f) ? 
x : log(1.0f + exp(x)); - data_d[i] = D_TYPE(result); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/step.comp b/ggml/src/ggml-vulkan/vulkan-shaders/step.comp deleted file mode 100644 index 654a2124e..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(x >= 0.0f ? 1.0f : 0.0f); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp deleted file mode 100644 index 7b5eb413b..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(1. - 2. 
/ (exp(2.*float(data_a[i])) + 1.)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp index f9b78f960..9def5dbc9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp @@ -17,11 +17,11 @@ void main() { return; } - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; int param = floatBitsToInt(p.param1); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp deleted file mode 100644 index cf1b76d3b..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(trunc(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp b/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp new file mode 100644 index 000000000..47a457399 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp @@ -0,0 
+1,144 @@ +#version 450 + +#include "types.glsl" +#include "generic_unary_head.glsl" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +float op_abs(float x) { + return abs(x); +} + +float op_sgn(float x) { + return sign(x); +} + +float op_neg(float x) { + return -x; +} + +float op_step(float x) { + return x >= 0.0f ? 1.0f : 0.0f; +} + +float op_tanh(float x) { + return 1.0f - 2.0f / (exp(2.0f*x) + 1.0f); +} + +float op_elu(float x) { + return x < 0.0f ? exp(x) - 1.0f : x; +} + +float op_relu(float x) { + return max(x, 0.0f); +} + +float op_sigmoid(float x) { + return 1.0f / (1.0f + exp(-x)); +} + +float op_gelu(float x) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const float val = SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x); + return 0.5f*x*(2.0f - 2.0f / (exp(2.0f * val) + 1.0f)); +} + +float op_gelu_quick(float x) { + const float GELU_QUICK_COEF = -1.702f; + return x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))); +} + +float op_silu(float x) { + return x / (1.0f + exp(-x)); +} + +float op_hardswish(float x) { + return x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)); +} + +float op_hardsigmoid(float x) { + return min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)); +} + +float op_exp(float x) { + return exp(x); +} + +float op_expm1(float x) { + // exp(x) - 1 loses many ulps to cancellation near zero. Use a degree-6 + // Taylor expansion for |x| <= 1/4: the omitted x^7/5040 term is < 1.3e-8, + // about 0.5 ulp at expm1(0.25), and a host-side f32 model stays within + // 2 ulps over the interval. The first native exp(x)-1 values outside the + // cutoff are about 1 ulp for +0.25 and 2 ulps for -0.25. + if (abs(x) <= 0.25f) { + return x * (1.0f + x * (0.5f + x * ((1.0f/6.0f) + x * ((1.0f/24.0f) + x * ((1.0f/120.0f) + x * (1.0f/720.0f)))))); + } + return exp(x) - 1.0f; +} + +float op_softplus(float x) { + return (x > 20.0f) ? 
x : log(1.0f + exp(x)); +} + +float op_gelu_erf(float a) { + // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation + const float p_erf = 0.3275911f; + const float a1_erf = 0.254829592f; + const float a2_erf = -0.284496736f; + const float a3_erf = 1.421413741f; + const float a4_erf = -1.453152027f; + const float a5_erf = 1.061405429f; + + const float SQRT_2_INV = 0.70710678118654752440084436210484f; + const float a_div_sqr2 = a * SQRT_2_INV; + const float sign_x = sign(a_div_sqr2); + const float x = abs(a_div_sqr2); + const float t = 1.0f / (1.0f + p_erf * x); + const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + return 0.5f * a * (1.0f + sign_x * y); +} + +float op_xielu(float x) { + const float alpha_n = p.param1; + const float alpha_p = p.param2; + const float beta = p.param3; + const float eps = p.param4; + + if (x > 0.0f) { + return alpha_p * x * x + beta * x; + } + + const float min_x_eps = min(x, eps); + return (op_expm1(min_x_eps) - x) * alpha_n + beta * x; +} + +float op_floor(float x) { + return floor(x); +} + +float op_ceil(float x) { + return ceil(x); +} + +float op_round(float x) { + // Round halfway cases away from zero as roundf does. + return x >= 0.0f ? 
floor(x + 0.5f) : ceil(x - 0.5f); +} + +float op_trunc(float x) { + return trunc(x); +} + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + const uint a_idx = get_aoffset() + src0_idx(idx); + const uint d_idx = get_doffset() + dst_idx(idx); + + data_d[d_idx] = D_TYPE(OP(float(data_a[a_idx]))); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 7bcb14608..dbbd0b193 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -868,47 +868,49 @@ void process_shaders() { string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("exp_f16", "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("exp_f32", "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("exp_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_exp"}}); + string_to_spv("exp_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_exp"}}); + string_to_spv("expm1_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_expm1"}}); + string_to_spv("expm1_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_expm1"}}); string_to_spv("log_f16", "log.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("log_f32", "log.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("gelu_erf_f32", "gelu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("gelu_quick_f16", 
"gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("silu_f16", "silu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("relu_f16", "relu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("neg_f16", "neg.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("neg_f32", "neg.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("tanh_f16", "tanh.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("hardsigmoid_f16","hardsigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("hardsigmoid_f32","hardsigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("hardswish_f16", "hardswish.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("hardswish_f32", "hardswish.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("abs_f16", "abs.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("abs_f32", "abs.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("elu_f16", "elu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("elu_f32", "elu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("xielu_f16", "xielu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("xielu_f32", "xielu.comp", {{"A_TYPE", "float"}, 
{"D_TYPE", "float"}}); - string_to_spv("sgn_f16", "sgn.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("sgn_f32", "sgn.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("gelu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_gelu"}}); + string_to_spv("gelu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_gelu"}}); + string_to_spv("gelu_erf_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_gelu_erf"}}); + string_to_spv("gelu_erf_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_gelu_erf"}}); + string_to_spv("gelu_quick_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_gelu_quick"}}); + string_to_spv("gelu_quick_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_gelu_quick"}}); + string_to_spv("silu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_silu"}}); + string_to_spv("silu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_silu"}}); + string_to_spv("relu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_relu"}}); + string_to_spv("relu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_relu"}}); + string_to_spv("neg_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_neg"}}); + string_to_spv("neg_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_neg"}}); + string_to_spv("tanh_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_tanh"}}); + string_to_spv("tanh_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_tanh"}}); + string_to_spv("sigmoid_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sigmoid"}}); + string_to_spv("sigmoid_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", 
"op_sigmoid"}}); + string_to_spv("hardsigmoid_f16","unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_hardsigmoid"}}); + string_to_spv("hardsigmoid_f32","unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_hardsigmoid"}}); + string_to_spv("hardswish_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_hardswish"}}); + string_to_spv("hardswish_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_hardswish"}}); + string_to_spv("abs_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_abs"}}); + string_to_spv("abs_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_abs"}}); + string_to_spv("elu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_elu"}}); + string_to_spv("elu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_elu"}}); + string_to_spv("xielu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_xielu"}}); + string_to_spv("xielu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_xielu"}}); + string_to_spv("sgn_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sgn"}}); + string_to_spv("sgn_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sgn"}}); string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("softplus_f16", "unary.comp", {{"A_TYPE", "float16_t"}, 
{"D_TYPE", "float16_t"}, {"OP", "op_softplus"}}); + string_to_spv("softplus_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_softplus"}}); string_to_spv("add1_f16_f16", "add1.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); string_to_spv("add1_f16_f32", "add1.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); @@ -916,16 +918,16 @@ void process_shaders() { string_to_spv("arange_f32", "arange.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("fill_f32", "fill.comp", {{"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("fill_f16", "fill.comp", {{"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); - string_to_spv("step_f16", "step.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("step_f32", "step.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("round_f16", "round.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("round_f32", "round.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("ceil_f16", "ceil.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("ceil_f32", "ceil.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("floor_f16", "floor.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("floor_f32", "floor.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("trunc_f16", "trunc.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("trunc_f32", "trunc.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("step_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_step"}}); + string_to_spv("step_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_step"}}); + string_to_spv("round_f16", "unary.comp", {{"A_TYPE", "float16_t"}, 
{"D_TYPE", "float16_t"}, {"OP", "op_round"}}); + string_to_spv("round_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_round"}}); + string_to_spv("ceil_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_ceil"}}); + string_to_spv("ceil_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_ceil"}}); + string_to_spv("floor_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_floor"}}); + string_to_spv("floor_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_floor"}}); + string_to_spv("trunc_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_trunc"}}); + string_to_spv("trunc_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_trunc"}}); string_to_spv("geglu_f16", "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("geglu_f32", "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp deleted file mode 100644 index 35d463bfe..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +++ /dev/null @@ -1,35 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - float x = float(data_a[i]); - - float alpha_n = p.param1; - float alpha_p = p.param2; - float beta = p.param3; - float eps = p.param4; - - if (x > 0.0f) { - x = alpha_p * x * x + beta * x; - } else { - const float min_x_eps = min(x, eps); - x = (exp(min_x_eps) - 1 - x) 
* alpha_n + beta * x; - } - - data_d[i] = D_TYPE(x); -}