diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index b0ad81123..5ab19a7d2 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -833,6 +833,7 @@ struct vk_device_struct { // [src/dst 0=fp32,1=fp16] vk_pipeline pipeline_exp[2]; + vk_pipeline pipeline_expm1[2]; vk_pipeline pipeline_elu[2]; vk_pipeline pipeline_gelu[2]; vk_pipeline pipeline_gelu_erf[2]; @@ -1202,30 +1203,35 @@ struct vk_op_glu_push_constants { uint32_t mode; // 0: default, 1: swapped, 2: split float alpha; // for swiglu_oai float limit; + uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; - uint32_t ne01; - uint32_t ne02; + uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; - uint32_t ne11; - uint32_t ne12; + uint32_t nb20; + uint32_t nb21; + uint32_t nb22; + uint32_t nb23; + uint32_t ne21; + uint32_t ne22; + uint32_t misalign_offsets; + uint32_t ne2_012mp; uint32_t ne2_012L; + uint32_t ne2_01mp; uint32_t ne2_01L; + uint32_t ne2_0mp; uint32_t ne2_0L; }; +static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128"); struct vk_op_unary_push_constants { uint32_t ne; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; - float param1; float param2; - uint32_t ne0_012mp; uint32_t ne0_012L; - uint32_t ne0_01mp; uint32_t ne0_01L; - uint32_t ne0_0mp; uint32_t ne0_0L; - uint32_t ne1_012mp; uint32_t ne1_012L; - uint32_t ne1_01mp; uint32_t ne1_01L; - uint32_t ne1_0mp; uint32_t ne1_0L; + float param1; float param2; float param3; float param4; + uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls; + uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls; }; static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); @@ -1330,6 +1336,10 @@ static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); } +static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) { + return L0 | (L1 << 8) | (L2 << 16); +} + template void init_pushconst_fastdiv(T &p) { GGML_UNUSED(p); static_assert(!std::is_const::value, "unexpected type"); @@ -1337,12 +1347,29 @@ template void init_pushconst_fastdiv(T &p) { template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { // Compute magic values to divide by these six numbers. - init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L); - init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L); - init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L); - init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L); - init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L); - init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L); + uint32_t ne0_012L; + uint32_t ne0_01L; + uint32_t ne0_0L; + uint32_t ne1_012L; + uint32_t ne1_01L; + uint32_t ne1_0L; + + init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, ne0_012L); + init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, ne0_01L); + init_fastdiv_values(p.ne00, p.ne0_0mp, ne0_0L); + init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, ne1_012L); + init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, ne1_01L); + init_fastdiv_values(p.ne10, p.ne1_0mp, ne1_0L); + + p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L); + p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L); +} + +template <> void init_pushconst_fastdiv(vk_op_glu_push_constants &p) { + // GLU linearizes over dst, then uses dst coordinates for src0/src1. + init_fastdiv_values(p.ne22*p.ne21*p.ne20, p.ne2_012mp, p.ne2_012L); + init_fastdiv_values(p.ne21*p.ne20, p.ne2_01mp, p.ne2_01L); + init_fastdiv_values(p.ne20, p.ne2_0mp, p.ne2_0L); } struct vk_op_binary_push_constants { @@ -5006,8 +5033,8 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); #define CREATE_UNARY(name) \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); CREATE_UNARY(elu) CREATE_UNARY(gelu) @@ -5030,6 +5057,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_UNARY(trunc) CREATE_UNARY(sgn) CREATE_UNARY(exp) + CREATE_UNARY(expm1) #undef CREATE_UNARY ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); @@ -8192,7 +8220,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) { VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); - const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); std::array elements; @@ -8205,14 +8232,11 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& elements = { ne, 1, 1 }; } - vk_op_unary_push_constants pc = { - (uint32_t)ne, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; + vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); + pc.nb10 = 1; + pc.nb11 = (uint32_t)tensor->ne[0]; + pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]); + pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]); init_pushconst_fastdiv(pc); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); @@ -8226,7 +8250,6 @@ static void ggml_vk_cpy_to_strided( uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13) { VK_LOG_DEBUG("ggml_vk_cpy_to_strided((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "dst_nb=(" << nb10 << ", " << nb11 << ", " << nb12 << ", " << nb13 << "), buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); - const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); std::array elements; @@ -8239,14 +8262,11 @@ static void ggml_vk_cpy_to_strided( elements = { ne, 1, 1 }; } - vk_op_unary_push_constants pc = { - (uint32_t)ne, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], nb10, nb11, nb12, nb13, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; + vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); + pc.nb10 = nb10; + pc.nb11 = nb11; + pc.nb12 = nb12; + pc.nb13 = nb13; init_pushconst_fastdiv(pc); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); @@ -10451,6 +10471,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_EXP: return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_EXPM1: + return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_ELU: return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_SILU: @@ -10849,6 +10871,21 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk GGML_UNUSED(src3); } +template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset; + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_ASSERT(a_offset < (1u << 8)); + GGML_ASSERT(b_offset < (1u << 8)); + GGML_ASSERT(d_offset < (1u << 8)); + + p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset; + + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); @@ -12198,17 +12235,17 @@ static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst)); } static void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, - { - (uint32_t)ggml_nelements(src0), 0, - op_params[1], op_params[2], op_params[3], op_params[4] - } - ); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = op_params[1]; + p.param2 = op_params[2]; + p.param3 = op_params[3]; + p.param4 = op_params[4]; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p)); } static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -12228,6 +12265,9 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const } const uint32_t mode = split ? 2 : (swapped ? 1 : 0); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size; + const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU, { @@ -12237,16 +12277,22 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const mode, alpha, limit, - (uint32_t)(src0->nb[1] / src0->nb[0]), - (uint32_t)(src0->nb[2] / src0->nb[0]), - (uint32_t)(src0->nb[3] / src0->nb[0]), - (uint32_t)src0->ne[1], - (uint32_t)src0->ne[2], - (uint32_t)(dst->nb[1] / dst->nb[0]), - (uint32_t)(dst->nb[2] / dst->nb[0]), - (uint32_t)(dst->nb[3] / dst->nb[0]), + (uint32_t)(src0->nb[0] / src0_type_size), + (uint32_t)(src0->nb[1] / src0_type_size), + (uint32_t)(src0->nb[2] / src0_type_size), + (uint32_t)(src0->nb[3] / src0_type_size), + (uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size), + (uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size), + (uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size), + (uint32_t)((split ? src1->nb[3] : src0->nb[3]) / src1_type_size), + (uint32_t)(dst->nb[0] / dst_type_size), + (uint32_t)(dst->nb[1] / dst_type_size), + (uint32_t)(dst->nb[2] / dst_type_size), + (uint32_t)(dst->nb[3] / dst_type_size), (uint32_t)dst->ne[1], - (uint32_t)dst->ne[2] + (uint32_t)dst->ne[2], + 0, + 0, 0, 0, 0, 0, 0, }); } @@ -14249,6 +14295,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -16638,6 +16685,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -16658,8 +16706,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_UNARY_OP_FLOOR: case GGML_UNARY_OP_TRUNC: case GGML_UNARY_OP_SGN: - return ggml_is_contiguous(op->src[0]) && - (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (op->src[0]->type == op->type); default: @@ -16675,7 +16722,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_GLU_OP_GEGLU_QUICK: return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && - (op->src[0]->type == op->type); + (op->src[0]->type == op->type) && + (!op->src[1] || op->src[1]->type == op->src[0]->type); default: return false; } @@ -17805,6 +17853,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_UNARY_OP_EXP: tensor_clone = ggml_exp(ggml_ctx, src_clone[0]); break; + case GGML_UNARY_OP_EXPM1: + tensor_clone = ggml_expm1(ggml_ctx, src_clone[0]); + break; case GGML_UNARY_OP_ELU: tensor_clone = ggml_elu(ggml_ctx, src_clone[0]); break; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp deleted file mode 100644 index 07bd1c18d..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(abs(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp b/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp deleted file mode 100644 index 0028d3721..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(ceil(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp index 79761324f..249e6b16e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp @@ -12,11 +12,11 @@ void main() { return; } - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; if (i10 == i11) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp deleted file mode 100644 index 84dcbd8c8..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +++ /dev/null @@ -1,27 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - float x = float(data_a[i]); - - if (x < 0.0f) { - x = exp(x) - 1; - } - - data_d[i] = D_TYPE(x); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp b/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp deleted file mode 100644 index c7cf5ec68..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(exp(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp b/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp deleted file mode 100644 index 20017eb18..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(floor(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp deleted file mode 100644 index a95c2525c..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +++ /dev/null @@ -1,25 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const float GELU_COEF_A = 0.044715f; - const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float xi = float(data_a[i]); - const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi); - data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp deleted file mode 100644 index 58375aba0..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +++ /dev/null @@ -1,39 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation - // ref: https://www.johndcook.com/blog/python_erf/ - const float p_erf = 0.3275911f; - const float a1_erf = 0.254829592f; - const float a2_erf = -0.284496736f; - const float a3_erf = 1.421413741f; - const float a4_erf = -1.453152027f; - const float a5_erf = 1.061405429f; - - const float SQRT_2_INV = 0.70710678118654752440084436210484f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float a = float(data_a[i]); - const float a_div_sqr2 = a * SQRT_2_INV; - const float sign_x = sign(a_div_sqr2); - const float x = abs(a_div_sqr2); - const float t = 1.0f / (1.0f + p_erf * x); - const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); - const float erf_approx = sign_x * y; - - data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp deleted file mode 100644 index bfdfe2182..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const float GELU_QUICK_COEF = -1.702f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl index cc181fda8..9d4176f3f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl @@ -7,14 +7,12 @@ layout (push_constant) uniform parameter uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint misalign_offsets; - float param1; float param2; + float param1; float param2; float param3; float param4; - uint ne0_012mp; uint ne0_012L; - uint ne0_01mp; uint ne0_01L; - uint ne0_0mp; uint ne0_0L; - uint ne1_012mp; uint ne1_012L; - uint ne1_01mp; uint ne1_01L; - uint ne1_0mp; uint ne1_0L; + // The three L values are packed as bytes to keep this layout under the 128B + // push constant limit while still leaving room for four float parameters. + uint ne0_012mp; uint ne0_01mp; uint ne0_0mp; uint ne0_Ls; + uint ne1_012mp; uint ne1_01mp; uint ne1_0mp; uint ne1_Ls; } p; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; @@ -42,42 +40,46 @@ uint fastdiv(uint n, uint mp, uint L) { return (msbs + n) >> L; } +uint fastdiv_L(uint packed, uint slot) { + return (packed >> (slot * 8)) & 0x3Fu; +} + uint src0_idx(uint idx) { - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00; } uint dst_idx(uint idx) { - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; } uint src0_idx_quant(uint idx, uint qk) { - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00; } uint dst_idx_quant(uint idx, uint qk) { - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl index d8fdd8f7b..c3cae736f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl @@ -15,14 +15,33 @@ layout (push_constant) uniform parameter uint mode; float alpha; float limit; + uint nb00; uint nb01; uint nb02; uint nb03; - uint ne01; - uint ne02; + uint nb10; uint nb11; uint nb12; uint nb13; - uint ne11; - uint ne12; + uint nb20; + uint nb21; + uint nb22; + uint nb23; + uint ne21; + uint ne22; + uint misalign_offsets; + uint ne2_012mp; uint ne2_012L; + uint ne2_01mp; uint ne2_01L; + uint ne2_0mp; uint ne2_0L; } p; + +uint get_aoffset() { return p.misalign_offsets >> 16; } +uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; } +uint get_doffset() { return p.misalign_offsets & 0xFF; } + +// see init_fastdiv_values in ggml-vulkan.cpp +uint fastdiv(uint n, uint mp, uint L) { + uint msbs, lsbs; + umulExtended(n, mp, msbs, lsbs); + return (msbs + n) >> L; +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl index 359461306..14c5e7a54 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl @@ -5,35 +5,31 @@ void main() { return; } - const uint row = i / p.ne20; - const uint col = i - row * p.ne20; + const uint i23 = fastdiv(i, p.ne2_012mp, p.ne2_012L); + const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20; + const uint i22 = fastdiv(i - i23_offset, p.ne2_01mp, p.ne2_01L); + const uint i22_offset = i22*p.ne21*p.ne20; + const uint i21 = fastdiv(i - i23_offset - i22_offset, p.ne2_0mp, p.ne2_0L); + const uint i20 = i - i23_offset - i22_offset - i21*p.ne20; - const uint i3 = row / (p.ne01 * p.ne02); - const uint i2 = (row % (p.ne01 * p.ne02)) / p.ne01; - const uint i1 = row % p.ne01; - const uint src_idx = i3 * p.nb03 + i2 * p.nb02 + i1 * p.nb01 + col; - - const uint dst_i3 = row / (p.ne11 * p.ne12); - const uint dst_i2 = (row % (p.ne11 * p.ne12)) / p.ne11; - const uint dst_i1 = row % p.ne11; - const uint dst_idx = dst_i3 * p.nb13 + dst_i2 * p.nb12 + dst_i1 * p.nb11 + col; + const uint src_idx_a = get_aoffset() + i23 * p.nb03 + i22 * p.nb02 + i21 * p.nb01 + i20 * p.nb00; + const uint src_idx_b = get_boffset() + i23 * p.nb13 + i22 * p.nb12 + i21 * p.nb11 + i20 * p.nb10; + const uint dst_idx = get_doffset() + i23 * p.nb23 + i22 * p.nb22 + i21 * p.nb21 + i20 * p.nb20; if (p.mode == 0) { // Default - const uint offset = p.ne00 / 2; - const uint idx = src_idx; + const uint offset = (p.ne00 / 2) * p.nb00; + const uint idx = src_idx_a; data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset]))); } else if (p.mode == 1) { // Swapped - const uint offset = p.ne00 / 2; - const uint idx = src_idx; + const uint offset = (p.ne00 / 2) * p.nb00; + const uint idx = src_idx_a; data_d[dst_idx] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx]))); } else { // Split - const uint idx = src_idx; - - data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx]))); + data_d[dst_idx] = D_TYPE(op(float(data_a[src_idx_a]), float(data_b[src_idx_b]))); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp b/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp deleted file mode 100644 index b4dbdf314..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(min(1.0f, max(0.0f, (x + 3.0f) / 6.0f))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp b/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp deleted file mode 100644 index 1ec315915..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp b/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp deleted file mode 100644 index 7f9b1bce9..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(-float(data_a[i])); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp deleted file mode 100644 index 5725cef23..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(max(float(data_a[i]), 0)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp b/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp index 87df78294..10f334d42 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp @@ -13,11 +13,11 @@ void main() { } // Destination multi-index (inlined dst_idx) - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; const uint d_idx = i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp index 68fbd0c7b..dae811ad9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp @@ -20,11 +20,11 @@ void main() { return; } - const uint i3 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i3 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; - const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, p.ne1_01L); + const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i2_offset = i2*p.ne11*p.ne10; - const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, p.ne1_0L); + const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; const uint p1 = floatBitsToUint(p.param1); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/round.comp b/ggml/src/ggml-vulkan/vulkan-shaders/round.comp deleted file mode 100644 index e6155dcbf..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +++ /dev/null @@ -1,29 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - float result; - // Round halfway cases away from zero as roundf does. - if (x >= 0.0) { - result = floor(x + 0.5); - } else { - result = ceil(x - 0.5); - } - data_d[i] = D_TYPE(result); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp deleted file mode 100644 index a9c147bf9..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(sign(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp deleted file mode 100644 index 32298d43c..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(1. / (1 + exp(-1. * float(data_a[i])))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp deleted file mode 100644 index 7d1cc6f45..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float xi = float(data_a[i]); - data_d[i] = D_TYPE(xi / (1.0f + exp(-xi))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp b/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp deleted file mode 100644 index 323e3cdea..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - const float result = (x > 20.0f) ? x : log(1.0f + exp(x)); - data_d[i] = D_TYPE(result); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/step.comp b/ggml/src/ggml-vulkan/vulkan-shaders/step.comp deleted file mode 100644 index 654a2124e..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(x >= 0.0f ? 1.0f : 0.0f); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp deleted file mode 100644 index 7b5eb413b..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(1. - 2. / (exp(2.*float(data_a[i])) + 1.)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp index f9b78f960..9def5dbc9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp @@ -17,11 +17,11 @@ void main() { return; } - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; int param = floatBitsToInt(p.param1); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp deleted file mode 100644 index cf1b76d3b..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(trunc(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp b/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp new file mode 100644 index 000000000..47a457399 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp @@ -0,0 +1,144 @@ +#version 450 + +#include "types.glsl" +#include "generic_unary_head.glsl" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +float op_abs(float x) { + return abs(x); +} + +float op_sgn(float x) { + return sign(x); +} + +float op_neg(float x) { + return -x; +} + +float op_step(float x) { + return x >= 0.0f ? 1.0f : 0.0f; +} + +float op_tanh(float x) { + return 1.0f - 2.0f / (exp(2.0f*x) + 1.0f); +} + +float op_elu(float x) { + return x < 0.0f ? exp(x) - 1.0f : x; +} + +float op_relu(float x) { + return max(x, 0.0f); +} + +float op_sigmoid(float x) { + return 1.0f / (1.0f + exp(-x)); +} + +float op_gelu(float x) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const float val = SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x); + return 0.5f*x*(2.0f - 2.0f / (exp(2.0f * val) + 1.0f)); +} + +float op_gelu_quick(float x) { + const float GELU_QUICK_COEF = -1.702f; + return x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))); +} + +float op_silu(float x) { + return x / (1.0f + exp(-x)); +} + +float op_hardswish(float x) { + return x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)); +} + +float op_hardsigmoid(float x) { + return min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)); +} + +float op_exp(float x) { + return exp(x); +} + +float op_expm1(float x) { + // exp(x) - 1 loses many ulps to cancellation near zero. Use a degree-6 + // Taylor expansion for |x| <= 1/4: the omitted x^7/5040 term is < 1.3e-8, + // about 0.5 ulp at expm1(0.25), and a host-side f32 model stays within + // 2 ulps over the interval. The first native exp(x)-1 values outside the + // cutoff are about 1 ulp for +0.25 and 2 ulps for -0.25. + if (abs(x) <= 0.25f) { + return x * (1.0f + x * (0.5f + x * ((1.0f/6.0f) + x * ((1.0f/24.0f) + x * ((1.0f/120.0f) + x * (1.0f/720.0f)))))); + } + return exp(x) - 1.0f; +} + +float op_softplus(float x) { + return (x > 20.0f) ? x : log(1.0f + exp(x)); +} + +float op_gelu_erf(float a) { + // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation + const float p_erf = 0.3275911f; + const float a1_erf = 0.254829592f; + const float a2_erf = -0.284496736f; + const float a3_erf = 1.421413741f; + const float a4_erf = -1.453152027f; + const float a5_erf = 1.061405429f; + + const float SQRT_2_INV = 0.70710678118654752440084436210484f; + const float a_div_sqr2 = a * SQRT_2_INV; + const float sign_x = sign(a_div_sqr2); + const float x = abs(a_div_sqr2); + const float t = 1.0f / (1.0f + p_erf * x); + const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + return 0.5f * a * (1.0f + sign_x * y); +} + +float op_xielu(float x) { + const float alpha_n = p.param1; + const float alpha_p = p.param2; + const float beta = p.param3; + const float eps = p.param4; + + if (x > 0.0f) { + return alpha_p * x * x + beta * x; + } + + const float min_x_eps = min(x, eps); + return (op_expm1(min_x_eps) - x) * alpha_n + beta * x; +} + +float op_floor(float x) { + return floor(x); +} + +float op_ceil(float x) { + return ceil(x); +} + +float op_round(float x) { + // Round halfway cases away from zero as roundf does. + return x >= 0.0f ? floor(x + 0.5f) : ceil(x - 0.5f); +} + +float op_trunc(float x) { + return trunc(x); +} + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + const uint a_idx = get_aoffset() + src0_idx(idx); + const uint d_idx = get_doffset() + dst_idx(idx); + + data_d[d_idx] = D_TYPE(OP(float(data_a[a_idx]))); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 7bcb14608..dbbd0b193 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -868,47 +868,49 @@ void process_shaders() { string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("exp_f16", "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("exp_f32", "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("exp_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_exp"}}); + string_to_spv("exp_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_exp"}}); + string_to_spv("expm1_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_expm1"}}); + string_to_spv("expm1_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_expm1"}}); string_to_spv("log_f16", "log.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("log_f32", "log.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("gelu_erf_f32", "gelu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("gelu_quick_f16", "gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("silu_f16", "silu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("relu_f16", "relu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("neg_f16", "neg.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("neg_f32", "neg.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("tanh_f16", "tanh.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("hardsigmoid_f16","hardsigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("hardsigmoid_f32","hardsigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("hardswish_f16", "hardswish.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("hardswish_f32", "hardswish.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("abs_f16", "abs.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("abs_f32", "abs.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("elu_f16", "elu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("elu_f32", "elu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("xielu_f16", "xielu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("xielu_f32", "xielu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("sgn_f16", "sgn.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("sgn_f32", "sgn.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("gelu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_gelu"}}); + string_to_spv("gelu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_gelu"}}); + string_to_spv("gelu_erf_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_gelu_erf"}}); + string_to_spv("gelu_erf_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_gelu_erf"}}); + string_to_spv("gelu_quick_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_gelu_quick"}}); + string_to_spv("gelu_quick_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_gelu_quick"}}); + string_to_spv("silu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_silu"}}); + string_to_spv("silu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_silu"}}); + string_to_spv("relu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_relu"}}); + string_to_spv("relu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_relu"}}); + string_to_spv("neg_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_neg"}}); + string_to_spv("neg_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_neg"}}); + string_to_spv("tanh_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_tanh"}}); + string_to_spv("tanh_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_tanh"}}); + string_to_spv("sigmoid_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sigmoid"}}); + string_to_spv("sigmoid_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sigmoid"}}); + string_to_spv("hardsigmoid_f16","unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_hardsigmoid"}}); + string_to_spv("hardsigmoid_f32","unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_hardsigmoid"}}); + string_to_spv("hardswish_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_hardswish"}}); + string_to_spv("hardswish_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_hardswish"}}); + string_to_spv("abs_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_abs"}}); + string_to_spv("abs_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_abs"}}); + string_to_spv("elu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_elu"}}); + string_to_spv("elu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_elu"}}); + string_to_spv("xielu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_xielu"}}); + string_to_spv("xielu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_xielu"}}); + string_to_spv("sgn_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sgn"}}); + string_to_spv("sgn_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sgn"}}); string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("softplus_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_softplus"}}); + string_to_spv("softplus_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_softplus"}}); string_to_spv("add1_f16_f16", "add1.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); string_to_spv("add1_f16_f32", "add1.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); @@ -916,16 +918,16 @@ void process_shaders() { string_to_spv("arange_f32", "arange.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("fill_f32", "fill.comp", {{"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("fill_f16", "fill.comp", {{"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); - string_to_spv("step_f16", "step.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("step_f32", "step.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("round_f16", "round.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("round_f32", "round.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("ceil_f16", "ceil.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("ceil_f32", "ceil.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("floor_f16", "floor.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("floor_f32", "floor.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("trunc_f16", "trunc.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("trunc_f32", "trunc.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("step_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_step"}}); + string_to_spv("step_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_step"}}); + string_to_spv("round_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_round"}}); + string_to_spv("round_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_round"}}); + string_to_spv("ceil_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_ceil"}}); + string_to_spv("ceil_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_ceil"}}); + string_to_spv("floor_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_floor"}}); + string_to_spv("floor_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_floor"}}); + string_to_spv("trunc_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_trunc"}}); + string_to_spv("trunc_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_trunc"}}); string_to_spv("geglu_f16", "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("geglu_f32", "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp deleted file mode 100644 index 35d463bfe..000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +++ /dev/null @@ -1,35 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - float x = float(data_a[i]); - - float alpha_n = p.param1; - float alpha_p = p.param2; - float beta = p.param3; - float eps = p.param4; - - if (x > 0.0f) { - x = alpha_p * x * x + beta * x; - } else { - const float min_x_eps = min(x, eps); - x = (exp(min_x_eps) - 1 - x) * alpha_n + beta * x; - } - - data_d[i] = D_TYPE(x); -}