vulkan: support non-contig unary/glu ops (llama/24215)

* vulkan: support non-contig unary/glu ops

  Change unary/glu ops to pass in all strides and use fastdiv for the index calculation. Put all unary ops in one file, similar to glu, to share the code.

  codex went ahead and added expm1 without me asking, but I had to make it do a real precision analysis rather than just making stuff up.

  unary.comp initially couldn't use generic_unary_head because there wasn't space for xielu's additional constants. Fixing this required packing the fastdiv 'L' values.

* attempt to work around compiler bug
* resolve conflict from #23991
* use expm1
2026-06-13 08:44:15 -05:00 · 2026-06-13 08:44:15 -05:00 · 1216e0957b
parent dc195118ef
commit 1216e0957b
31 changed files with 365 additions and 649 deletions
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -833,6 +833,7 @@ struct vk_device_struct {

    // [src/dst 0=fp32,1=fp16]
    vk_pipeline pipeline_exp[2];
+    vk_pipeline pipeline_expm1[2];
    vk_pipeline pipeline_elu[2];
    vk_pipeline pipeline_gelu[2];
    vk_pipeline pipeline_gelu_erf[2];
@@ -1202,30 +1203,35 @@ struct vk_op_glu_push_constants {
    uint32_t mode;  // 0: default, 1: swapped, 2: split
    float alpha; // for swiglu_oai
    float limit;
+    uint32_t nb00;
    uint32_t nb01;
    uint32_t nb02;
    uint32_t nb03;
-    uint32_t ne01;
-    uint32_t ne02;
+    uint32_t nb10;
    uint32_t nb11;
    uint32_t nb12;
    uint32_t nb13;
-    uint32_t ne11;
-    uint32_t ne12;
+    uint32_t nb20;
+    uint32_t nb21;
+    uint32_t nb22;
+    uint32_t nb23;
+    uint32_t ne21;
+    uint32_t ne22;
+    uint32_t misalign_offsets;
+    uint32_t ne2_012mp; uint32_t ne2_012L;
+    uint32_t ne2_01mp;  uint32_t ne2_01L;
+    uint32_t ne2_0mp;   uint32_t ne2_0L;
 };
+static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128");

 struct vk_op_unary_push_constants {
    uint32_t ne;
    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
    uint32_t misalign_offsets;
-    float param1; float param2;
-    uint32_t ne0_012mp; uint32_t ne0_012L;
-    uint32_t ne0_01mp;  uint32_t ne0_01L;
-    uint32_t ne0_0mp;   uint32_t ne0_0L;
-    uint32_t ne1_012mp; uint32_t ne1_012L;
-    uint32_t ne1_01mp;  uint32_t ne1_01L;
-    uint32_t ne1_0mp;   uint32_t ne1_0L;
+    float param1; float param2; float param3; float param4;
+    uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls;
+    uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls;
 };
 static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");

@@ -1330,6 +1336,10 @@ static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L)
    mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
 }

+static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) {
+    return L0 | (L1 << 8) | (L2 << 16);
+}
+
 template <typename T> void init_pushconst_fastdiv(T &p) {
    GGML_UNUSED(p);
    static_assert(!std::is_const<T>::value, "unexpected type");
@@ -1337,12 +1347,29 @@ template <typename T> void init_pushconst_fastdiv(T &p) {

 template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) {
    // Compute magic values to divide by these six numbers.
-    init_fastdiv_values(p.ne02*p.ne01*p.ne00,  p.ne0_012mp,    p.ne0_012L);
-    init_fastdiv_values(p.ne01*p.ne00,         p.ne0_01mp,     p.ne0_01L);
-    init_fastdiv_values(p.ne00,                p.ne0_0mp,      p.ne0_0L);
-    init_fastdiv_values(p.ne12*p.ne11*p.ne10,  p.ne1_012mp,    p.ne1_012L);
-    init_fastdiv_values(p.ne11*p.ne10,         p.ne1_01mp,     p.ne1_01L);
-    init_fastdiv_values(p.ne10,                p.ne1_0mp,      p.ne1_0L);
+    uint32_t ne0_012L;
+    uint32_t ne0_01L;
+    uint32_t ne0_0L;
+    uint32_t ne1_012L;
+    uint32_t ne1_01L;
+    uint32_t ne1_0L;
+
+    init_fastdiv_values(p.ne02*p.ne01*p.ne00,  p.ne0_012mp,    ne0_012L);
+    init_fastdiv_values(p.ne01*p.ne00,         p.ne0_01mp,     ne0_01L);
+    init_fastdiv_values(p.ne00,                p.ne0_0mp,      ne0_0L);
+    init_fastdiv_values(p.ne12*p.ne11*p.ne10,  p.ne1_012mp,    ne1_012L);
+    init_fastdiv_values(p.ne11*p.ne10,         p.ne1_01mp,     ne1_01L);
+    init_fastdiv_values(p.ne10,                p.ne1_0mp,      ne1_0L);
+
+    p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L);
+    p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L);
+}
+
+template <> void init_pushconst_fastdiv(vk_op_glu_push_constants &p) {
+    // GLU linearizes over dst, then uses dst coordinates for src0/src1.
+    init_fastdiv_values(p.ne22*p.ne21*p.ne20,  p.ne2_012mp,    p.ne2_012L);
+    init_fastdiv_values(p.ne21*p.ne20,         p.ne2_01mp,     p.ne2_01L);
+    init_fastdiv_values(p.ne20,                p.ne2_0mp,      p.ne2_0L);
 }

 struct vk_op_binary_push_constants {
@@ -5006,8 +5033,8 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
    ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

 #define CREATE_UNARY(name)  \
-    ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);  \
-    ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);  \
+    ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

    CREATE_UNARY(elu)
    CREATE_UNARY(gelu)
@@ -5030,6 +5057,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
    CREATE_UNARY(trunc)
    CREATE_UNARY(sgn)
    CREATE_UNARY(exp)
+    CREATE_UNARY(expm1)
 #undef CREATE_UNARY

    ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
@@ -8192,7 +8220,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) {
    VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
-    const int tensor_type_size = ggml_type_size(tensor->type);

    const uint32_t ne = ggml_nelements(tensor);
    std::array<uint32_t, 3> elements;
@@ -8205,14 +8232,11 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
        elements = { ne, 1, 1 };
    }

-    vk_op_unary_push_constants pc = {
-        (uint32_t)ne,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3],                       1                   , (uint32_t)tensor->ne[0]                   , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    };
+    vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne);
+    pc.nb10 = 1;
+    pc.nb11 = (uint32_t)tensor->ne[0];
+    pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]);
+    pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]);
    init_pushconst_fastdiv(pc);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
    ggml_vk_sync_buffers(ctx, subctx);
@@ -8226,7 +8250,6 @@ static void ggml_vk_cpy_to_strided(
        uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13) {
    VK_LOG_DEBUG("ggml_vk_cpy_to_strided((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
    std::cerr << "dst_nb=(" << nb10 << ", " << nb11 << ", " << nb12 << ", " << nb13 << "), buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
-    const int tensor_type_size = ggml_type_size(tensor->type);

    const uint32_t ne = ggml_nelements(tensor);
    std::array<uint32_t, 3> elements;
@@ -8239,14 +8262,11 @@ static void ggml_vk_cpy_to_strided(
        elements = { ne, 1, 1 };
    }

-    vk_op_unary_push_constants pc = {
-        (uint32_t)ne,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], nb10, nb11, nb12, nb13,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    };
+    vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne);
+    pc.nb10 = nb10;
+    pc.nb11 = nb11;
+    pc.nb12 = nb12;
+    pc.nb13 = nb13;
    init_pushconst_fastdiv(pc);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
    ggml_vk_sync_buffers(ctx, subctx);
@@ -10451,6 +10471,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
        switch (ggml_get_unary_op(dst)) {
            case GGML_UNARY_OP_EXP:
                return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16];
+            case GGML_UNARY_OP_EXPM1:
+                return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16];
            case GGML_UNARY_OP_ELU:
                return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16];
            case GGML_UNARY_OP_SILU:
@@ -10849,6 +10871,21 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
    GGML_UNUSED(src3);
 }

+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset;
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
+
+    GGML_ASSERT(a_offset < (1u << 8));
+    GGML_ASSERT(b_offset < (1u << 8));
+    GGML_ASSERT(d_offset < (1u << 8));
+
+    p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
+
+    GGML_UNUSED(src2);
+    GGML_UNUSED(src3);
+}
+
 template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
@@ -12198,17 +12235,17 @@ static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, c
 }

 static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f });
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst));
 }

 static void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY,
-        {
-            (uint32_t)ggml_nelements(src0), 0,
-            op_params[1], op_params[2], op_params[3], op_params[4]
-        }
-    );
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
+    p.param1 = op_params[1];
+    p.param2 = op_params[2];
+    p.param3 = op_params[3];
+    p.param4 = op_params[4];
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p));
 }

 static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -12228,6 +12265,9 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
    }

    const uint32_t mode = split ? 2 : (swapped ? 1 : 0);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size;
+    const uint32_t dst_type_size  = ggml_type_size(dst->type);

    ggml_vk_op_f32<vk_op_glu_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU,
        {
@@ -12237,16 +12277,22 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
            mode,
            alpha,
            limit,
-            (uint32_t)(src0->nb[1] / src0->nb[0]),
-            (uint32_t)(src0->nb[2] / src0->nb[0]),
-            (uint32_t)(src0->nb[3] / src0->nb[0]),
-            (uint32_t)src0->ne[1],
-            (uint32_t)src0->ne[2],
-            (uint32_t)(dst->nb[1] / dst->nb[0]),
-            (uint32_t)(dst->nb[2] / dst->nb[0]),
-            (uint32_t)(dst->nb[3] / dst->nb[0]),
+            (uint32_t)(src0->nb[0] / src0_type_size),
+            (uint32_t)(src0->nb[1] / src0_type_size),
+            (uint32_t)(src0->nb[2] / src0_type_size),
+            (uint32_t)(src0->nb[3] / src0_type_size),
+            (uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size),
+            (uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size),
+            (uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size),
+            (uint32_t)((split ? src1->nb[3] : src0->nb[3]) / src1_type_size),
+            (uint32_t)(dst->nb[0] / dst_type_size),
+            (uint32_t)(dst->nb[1] / dst_type_size),
+            (uint32_t)(dst->nb[2] / dst_type_size),
+            (uint32_t)(dst->nb[3] / dst_type_size),
            (uint32_t)dst->ne[1],
-            (uint32_t)dst->ne[2]
+            (uint32_t)dst->ne[2],
+            0,
+            0, 0, 0, 0, 0, 0,
        });
 }

@@ -14249,6 +14295,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
        switch (ggml_get_unary_op(node)) {
        case GGML_UNARY_OP_ELU:
        case GGML_UNARY_OP_EXP:
+        case GGML_UNARY_OP_EXPM1:
        case GGML_UNARY_OP_SILU:
        case GGML_UNARY_OP_GELU:
        case GGML_UNARY_OP_GELU_ERF:
@@ -16638,6 +16685,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_EXPM1:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_ERF:
@@ -16658,8 +16706,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                case GGML_UNARY_OP_FLOOR:
                case GGML_UNARY_OP_TRUNC:
                case GGML_UNARY_OP_SGN:
-                    return ggml_is_contiguous(op->src[0]) &&
-                           (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
+                    return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                           (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
                           (op->src[0]->type == op->type);
                default:
@@ -16675,7 +16722,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                case GGML_GLU_OP_GEGLU_QUICK:
                    return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                           (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
-                           (op->src[0]->type == op->type);
+                           (op->src[0]->type == op->type) &&
+                           (!op->src[1] || op->src[1]->type == op->src[0]->type);
                default:
                    return false;
            }
@@ -17805,6 +17853,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
            case GGML_UNARY_OP_EXP:
                tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
                break;
+            case GGML_UNARY_OP_EXPM1:
+                tensor_clone = ggml_expm1(ggml_ctx, src_clone[0]);
+                break;
            case GGML_UNARY_OP_ELU:
                tensor_clone = ggml_elu(ggml_ctx, src_clone[0]);
                break;
--- a/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp
@@ -1,21 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    data_d[i] = D_TYPE(abs(float(data_a[i])));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(ceil(x));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp
@@ -12,11 +12,11 @@ void main() {
        return;
    }

-    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;

    if (i10 == i11) {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp
@@ -1,27 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    float x = float(data_a[i]);
-
-    if (x < 0.0f) {
-        x = exp(x) - 1;
-    }
-
-    data_d[i] = D_TYPE(x);
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp
@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-    data_d[i] = D_TYPE(exp(float(data_a[i])));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp
@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(floor(x));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp
@ -1,25 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float xi = float(data_a[i]);
-    const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
-    data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp
@ -1,39 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
-    // ref: https://www.johndcook.com/blog/python_erf/
-    const float p_erf  = 0.3275911f;
-    const float a1_erf = 0.254829592f;
-    const float a2_erf = -0.284496736f;
-    const float a3_erf = 1.421413741f;
-    const float a4_erf = -1.453152027f;
-    const float a5_erf = 1.061405429f;
-
-    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float a = float(data_a[i]);
-    const float a_div_sqr2 = a * SQRT_2_INV;
-    const float sign_x = sign(a_div_sqr2);
-    const float x = abs(a_div_sqr2);
-    const float t = 1.0f / (1.0f + p_erf * x);
-    const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
-    const float erf_approx = sign_x * y;
-
-    data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp
@ -1,23 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const float GELU_QUICK_COEF = -1.702f;
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl
@@ -7,14 +7,12 @@ layout (push_constant) uniform parameter
    uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
    uint misalign_offsets;
-    float param1; float param2;
+    float param1; float param2; float param3; float param4;

-    uint ne0_012mp; uint ne0_012L;
-    uint ne0_01mp;  uint ne0_01L;
-    uint ne0_0mp;   uint ne0_0L;
-    uint ne1_012mp; uint ne1_012L;
-    uint ne1_01mp;  uint ne1_01L;
-    uint ne1_0mp;   uint ne1_0L;
+    // The three L values are packed as bytes to keep this layout under the 128B
+    // push constant limit while still leaving room for four float parameters.
+    uint ne0_012mp; uint ne0_01mp;  uint ne0_0mp;  uint ne0_Ls;
+    uint ne1_012mp; uint ne1_01mp;  uint ne1_0mp;  uint ne1_Ls;
 } p;

 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -42,42 +40,46 @@ uint fastdiv(uint n, uint mp, uint L) {
    return (msbs + n) >> L;
 }

+uint fastdiv_L(uint packed, uint slot) {
+    return (packed >> (slot * 8)) & 0x3Fu;
+}
+
 uint src0_idx(uint idx) {
-    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
+    const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0));
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1));
    const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2));
    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
 }

 uint dst_idx(uint idx) {
-    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
 }

 uint src0_idx_quant(uint idx, uint qk) {
-    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
+    const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0));
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1));
    const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2));
    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00;
 }

 uint dst_idx_quant(uint idx, uint qk) {
-    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10;
 }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl
@@ -15,14 +15,33 @@ layout (push_constant) uniform parameter
    uint mode;
    float alpha;
    float limit;
+    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
-    uint ne01;
-    uint ne02;
+    uint nb10;
    uint nb11;
    uint nb12;
    uint nb13;
-    uint ne11;
-    uint ne12;
+    uint nb20;
+    uint nb21;
+    uint nb22;
+    uint nb23;
+    uint ne21;
+    uint ne22;
+    uint misalign_offsets;
+    uint ne2_012mp; uint ne2_012L;
+    uint ne2_01mp;  uint ne2_01L;
+    uint ne2_0mp;   uint ne2_0L;
 } p;
+
+uint get_aoffset() { return p.misalign_offsets >> 16; }
+uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
+uint get_doffset() { return p.misalign_offsets & 0xFF; }
+
+// see init_fastdiv_values in ggml-vulkan.cpp
+uint fastdiv(uint n, uint mp, uint L) {
+    uint msbs, lsbs;
+    umulExtended(n, mp, msbs, lsbs);
+    return (msbs + n) >> L;
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl
@ -5,35 +5,31 @@ void main() {
        return;
    }

-    const uint row = i / p.ne20;
-    const uint col = i - row * p.ne20;
+    const uint i23 = fastdiv(i, p.ne2_012mp, p.ne2_012L);
+    const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20;
+    const uint i22 = fastdiv(i - i23_offset, p.ne2_01mp, p.ne2_01L);
+    const uint i22_offset = i22*p.ne21*p.ne20;
+    const uint i21 = fastdiv(i - i23_offset - i22_offset, p.ne2_0mp, p.ne2_0L);
+    const uint i20 = i - i23_offset - i22_offset - i21*p.ne20;

-    const uint i3 = row / (p.ne01 * p.ne02);
-    const uint i2 = (row % (p.ne01 * p.ne02)) / p.ne01;
-    const uint i1 = row % p.ne01;
-    const uint src_idx = i3 * p.nb03 + i2 * p.nb02 + i1 * p.nb01 + col;
-
-    const uint dst_i3 = row / (p.ne11 * p.ne12);
-    const uint dst_i2 = (row % (p.ne11 * p.ne12)) / p.ne11;
-    const uint dst_i1 = row % p.ne11;
-    const uint dst_idx = dst_i3 * p.nb13 + dst_i2 * p.nb12 + dst_i1 * p.nb11 + col;
+    const uint src_idx_a = get_aoffset() + i23 * p.nb03 + i22 * p.nb02 + i21 * p.nb01 + i20 * p.nb00;
+    const uint src_idx_b = get_boffset() + i23 * p.nb13 + i22 * p.nb12 + i21 * p.nb11 + i20 * p.nb10;
+    const uint dst_idx = get_doffset() + i23 * p.nb23 + i22 * p.nb22 + i21 * p.nb21 + i20 * p.nb20;

    if (p.mode == 0) {
        // Default
-        const uint offset = p.ne00 / 2;
-        const uint idx = src_idx;
+        const uint offset = (p.ne00 / 2) * p.nb00;
+        const uint idx = src_idx_a;

        data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset])));
    } else if (p.mode == 1) {
        // Swapped
-        const uint offset = p.ne00 / 2;
-        const uint idx = src_idx;
+        const uint offset = (p.ne00 / 2) * p.nb00;
+        const uint idx = src_idx_a;

        data_d[dst_idx] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx])));
    } else {
        // Split
-        const uint idx = src_idx;
-
-        data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx])));
+        data_d[dst_idx] = D_TYPE(op(float(data_a[src_idx_a]), float(data_b[src_idx_b])));
    }
 }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp
@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp
@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp
@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-    data_d[i] = D_TYPE(-float(data_a[i]));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp
@ -1,21 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    data_d[i] = D_TYPE(max(float(data_a[i]), 0));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp
@ -13,11 +13,11 @@ void main() {
    }

    // Destination multi-index (inlined dst_idx)
-    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
    const uint d_idx = i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;

--- a/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp
@ -20,11 +20,11 @@ void main() {
        return;
    }

-    const uint i3 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i3 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
-    const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i2_offset = i2*p.ne11*p.ne10;
-    const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;

    const uint p1 = floatBitsToUint(p.param1);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/round.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/round.comp
@ -1,29 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    float result;
-    // Round halfway cases away from zero as roundf does.
-    if (x >= 0.0) {
-        result = floor(x + 0.5);
-    } else {
-        result = ceil(x - 0.5);
-    }
-    data_d[i] = D_TYPE(result);
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp
@ -1,21 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    data_d[i] = D_TYPE(sign(float(data_a[i])));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp
@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-    data_d[i] = D_TYPE(1. / (1 + exp(-1. * float(data_a[i]))));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp
@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float xi = float(data_a[i]);
-    data_d[i] = D_TYPE(xi / (1.0f + exp(-xi)));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp
@ -1,23 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    const float result = (x > 20.0f) ? x : log(1.0f + exp(x));
-    data_d[i] = D_TYPE(result);
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/step.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/step.comp
@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(x >= 0.0f ? 1.0f : 0.0f);
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp
@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-    data_d[i] = D_TYPE(1. - 2. / (exp(2.*float(data_a[i])) + 1.));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp
@ -17,11 +17,11 @@ void main() {
        return;
    }

-    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
+    const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0));
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1));
    const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2));
    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;

    int param = floatBitsToInt(p.param1);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp
@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(trunc(x));
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp
@ -0,0 +1,144 @@
+#version 450
+
+#include "types.glsl"
+#include "generic_unary_head.glsl"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+float op_abs(float x) {
+    return abs(x);
+}
+
+float op_sgn(float x) {
+    return sign(x);
+}
+
+float op_neg(float x) {
+    return -x;
+}
+
+float op_step(float x) {
+    return x >= 0.0f ? 1.0f : 0.0f;
+}
+
+float op_tanh(float x) {
+    return 1.0f - 2.0f / (exp(2.0f*x) + 1.0f);
+}
+
+float op_elu(float x) {
+    return x < 0.0f ? exp(x) - 1.0f : x;
+}
+
+float op_relu(float x) {
+    return max(x, 0.0f);
+}
+
+float op_sigmoid(float x) {
+    return 1.0f / (1.0f + exp(-x));
+}
+
+float op_gelu(float x) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const float val = SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x);
+    return 0.5f*x*(2.0f - 2.0f / (exp(2.0f * val) + 1.0f));
+}
+
+float op_gelu_quick(float x) {
+    const float GELU_QUICK_COEF = -1.702f;
+    return x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)));
+}
+
+float op_silu(float x) {
+    return x / (1.0f + exp(-x));
+}
+
+float op_hardswish(float x) {
+    return x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f));
+}
+
+float op_hardsigmoid(float x) {
+    return min(1.0f, max(0.0f, (x + 3.0f) / 6.0f));
+}
+
+float op_exp(float x) {
+    return exp(x);
+}
+
+float op_expm1(float x) {
+    // exp(x) - 1 loses many ulps to cancellation near zero.  Use a degree-6
+    // Taylor expansion for |x| <= 1/4: the omitted x^7/5040 term is < 1.3e-8,
+    // about 0.5 ulp at expm1(0.25), and a host-side f32 model stays within
+    // 2 ulps over the interval.  The first native exp(x)-1 values outside the
+    // cutoff are about 1 ulp for +0.25 and 2 ulps for -0.25.
+    if (abs(x) <= 0.25f) {
+        return x * (1.0f + x * (0.5f + x * ((1.0f/6.0f) + x * ((1.0f/24.0f) + x * ((1.0f/120.0f) + x * (1.0f/720.0f))))));
+    }
+    return exp(x) - 1.0f;
+}
+
+float op_softplus(float x) {
+    return (x > 20.0f) ? x : log(1.0f + exp(x));
+}
+
+float op_gelu_erf(float a) {
+    // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
+    const float p_erf  = 0.3275911f;
+    const float a1_erf = 0.254829592f;
+    const float a2_erf = -0.284496736f;
+    const float a3_erf = 1.421413741f;
+    const float a4_erf = -1.453152027f;
+    const float a5_erf = 1.061405429f;
+
+    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
+    const float a_div_sqr2 = a * SQRT_2_INV;
+    const float sign_x = sign(a_div_sqr2);
+    const float x = abs(a_div_sqr2);
+    const float t = 1.0f / (1.0f + p_erf * x);
+    const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
+    return 0.5f * a * (1.0f + sign_x * y);
+}
+
+float op_xielu(float x) {
+    const float alpha_n = p.param1;
+    const float alpha_p = p.param2;
+    const float beta = p.param3;
+    const float eps = p.param4;
+
+    if (x > 0.0f) {
+        return alpha_p * x * x + beta * x;
+    }
+
+    const float min_x_eps = min(x, eps);
+    return (op_expm1(min_x_eps) - x) * alpha_n + beta * x;
+}
+
+float op_floor(float x) {
+    return floor(x);
+}
+
+float op_ceil(float x) {
+    return ceil(x);
+}
+
+float op_round(float x) {
+    // Round halfway cases away from zero as roundf does.
+    return x >= 0.0f ? floor(x + 0.5f) : ceil(x - 0.5f);
+}
+
+float op_trunc(float x) {
+    return trunc(x);
+}
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint a_idx = get_aoffset() + src0_idx(idx);
+    const uint d_idx = get_doffset() + dst_idx(idx);
+
+    data_d[d_idx] = D_TYPE(OP(float(data_a[a_idx])));
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@ -868,47 +868,49 @@ void process_shaders() {

    string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});

-    string_to_spv("exp_f16",        "exp.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("exp_f32",        "exp.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("exp_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_exp"}});
+    string_to_spv("exp_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_exp"}});
+    string_to_spv("expm1_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_expm1"}});
+    string_to_spv("expm1_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_expm1"}});

    string_to_spv("log_f16",        "log.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("log_f32",        "log.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("gelu_f16",       "gelu.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("gelu_f32",       "gelu.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("gelu_erf_f16",   "gelu_erf.comp",    {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("gelu_erf_f32",   "gelu_erf.comp",    {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("gelu_quick_f16", "gelu_quick.comp",  {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("gelu_quick_f32", "gelu_quick.comp",  {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("silu_f16",       "silu.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("silu_f32",       "silu.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("relu_f16",       "relu.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("relu_f32",       "relu.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("neg_f16",        "neg.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("neg_f32",        "neg.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("tanh_f16",       "tanh.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("tanh_f32",       "tanh.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("sigmoid_f16",    "sigmoid.comp",     {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("sigmoid_f32",    "sigmoid.comp",     {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("hardsigmoid_f16","hardsigmoid.comp", {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("hardsigmoid_f32","hardsigmoid.comp", {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("hardswish_f16",  "hardswish.comp",   {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("hardswish_f32",  "hardswish.comp",   {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("abs_f16",        "abs.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("abs_f32",        "abs.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("elu_f16",        "elu.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("elu_f32",        "elu.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("xielu_f16",      "xielu.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("xielu_f32",      "xielu.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("sgn_f16",        "sgn.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("sgn_f32",        "sgn.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("gelu_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_gelu"}});
+    string_to_spv("gelu_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_gelu"}});
+    string_to_spv("gelu_erf_f16",   "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_gelu_erf"}});
+    string_to_spv("gelu_erf_f32",   "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_gelu_erf"}});
+    string_to_spv("gelu_quick_f16", "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_gelu_quick"}});
+    string_to_spv("gelu_quick_f32", "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_gelu_quick"}});
+    string_to_spv("silu_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_silu"}});
+    string_to_spv("silu_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_silu"}});
+    string_to_spv("relu_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_relu"}});
+    string_to_spv("relu_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_relu"}});
+    string_to_spv("neg_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_neg"}});
+    string_to_spv("neg_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_neg"}});
+    string_to_spv("tanh_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_tanh"}});
+    string_to_spv("tanh_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_tanh"}});
+    string_to_spv("sigmoid_f16",    "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_sigmoid"}});
+    string_to_spv("sigmoid_f32",    "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_sigmoid"}});
+    string_to_spv("hardsigmoid_f16","unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_hardsigmoid"}});
+    string_to_spv("hardsigmoid_f32","unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_hardsigmoid"}});
+    string_to_spv("hardswish_f16",  "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_hardswish"}});
+    string_to_spv("hardswish_f32",  "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_hardswish"}});
+    string_to_spv("abs_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_abs"}});
+    string_to_spv("abs_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_abs"}});
+    string_to_spv("elu_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_elu"}});
+    string_to_spv("elu_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_elu"}});
+    string_to_spv("xielu_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_xielu"}});
+    string_to_spv("xielu_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_xielu"}});
+    string_to_spv("sgn_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_sgn"}});
+    string_to_spv("sgn_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_sgn"}});

    string_to_spv("tri_f16",        "tri.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("tri_f32",        "tri.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
    string_to_spv("diag_f16",       "diag.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("diag_f32",       "diag.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});

-    string_to_spv("softplus_f16",   "softplus.comp",    {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("softplus_f32",   "softplus.comp",    {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("softplus_f16",   "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_softplus"}});
+    string_to_spv("softplus_f32",   "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_softplus"}});

    string_to_spv("add1_f16_f16",   "add1.comp",        {{"A_TYPE", "float16_t"},   {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
    string_to_spv("add1_f16_f32",   "add1.comp",        {{"A_TYPE", "float16_t"},   {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
@ -916,16 +918,16 @@ void process_shaders() {
    string_to_spv("arange_f32",     "arange.comp",      {{"A_TYPE", "float"},       {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    string_to_spv("fill_f32",       "fill.comp",        {{"D_TYPE", "float"},       {"FLOAT_TYPE", "float"}});
    string_to_spv("fill_f16",       "fill.comp",        {{"D_TYPE", "float16_t"},   {"FLOAT_TYPE", "float"}});
-    string_to_spv("step_f16",       "step.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("step_f32",       "step.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("round_f16",      "round.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("round_f32",      "round.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("ceil_f16",       "ceil.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("ceil_f32",       "ceil.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("floor_f16",      "floor.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("floor_f32",      "floor.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("trunc_f16",      "trunc.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("trunc_f32",      "trunc.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("step_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_step"}});
+    string_to_spv("step_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_step"}});
+    string_to_spv("round_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_round"}});
+    string_to_spv("round_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_round"}});
+    string_to_spv("ceil_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_ceil"}});
+    string_to_spv("ceil_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_ceil"}});
+    string_to_spv("floor_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_floor"}});
+    string_to_spv("floor_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_floor"}});
+    string_to_spv("trunc_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_trunc"}});
+    string_to_spv("trunc_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_trunc"}});

    string_to_spv("geglu_f16",      "geglu.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("geglu_f32",      "geglu.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
--- a/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp
@ -1,35 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    float x = float(data_a[i]);
-
-    float alpha_n = p.param1;
-    float alpha_p = p.param2;
-    float beta = p.param3;
-    float eps = p.param4;
-
-    if (x > 0.0f) {
-        x = alpha_p * x * x + beta * x;
-    } else {
-        const float min_x_eps = min(x, eps);
-        x = (exp(min_x_eps) - 1 - x) * alpha_n + beta * x;
-    }
-
-    data_d[i] = D_TYPE(x);
-}