From 42aea65eda1b4100bedb42ea1cc7344da9f1054a Mon Sep 17 00:00:00 2001 From: Yanzhao Wang Date: Fri, 8 May 2026 17:12:04 -0700 Subject: [PATCH] hexagon: add HTP kernel for GGML_OP_GATED_DELTA_NET (llama/22837) Implement the Gated Delta Net recurrence on HVX with: - 4-row fused kernels for PP (prompt processing) path - 8-row fused kernels for TG (token generation) path, reducing K/Q/gate vector reload overhead by 2x - Separate PP/TG thread functions for I-cache isolation - VTCM state scratchpad with DMA in/out for TG single-cycle access - Vectorized gate exp via hvx_exp_f32 --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 111 +- ggml/src/ggml-hexagon/htp/CMakeLists.txt | 1 + .../ggml-hexagon/htp/gated-delta-net-ops.c | 955 ++++++++++++++++++ ggml/src/ggml-hexagon/htp/htp-ctx.h | 1 + ggml/src/ggml-hexagon/htp/htp-ops.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 3 + 6 files changed, 1045 insertions(+), 27 deletions(-) create mode 100644 ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 8ddd1915c..d3c125dbc 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2261,6 +2261,58 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess return true; } +static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * q = op->src[0]; + const struct ggml_tensor * k = op->src[1]; + const struct ggml_tensor * v = op->src[2]; + const struct ggml_tensor * g = op->src[3]; + const struct ggml_tensor * beta = op->src[4]; + const struct ggml_tensor * state = op->src[5]; + const struct ggml_tensor * dst = op; + + if (!q || !k || !v || !g || !beta || !state) { + return false; + } + + if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 || + g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 || + dst->type != GGML_TYPE_F32) { + return false; + } + + if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) || + !ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) || + !ggml_is_contiguous(dst)) { + return false; + } + + const int64_t S_v = v->ne[0]; + const int64_t H = v->ne[1]; + const int64_t n_tokens = v->ne[2]; + const int64_t n_seqs = v->ne[3]; + + if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) { + return false; + } + if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 || + q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 || + (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) { + return false; + } + if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) { + return false; + } + if (ggml_nelements(state) != S_v * S_v * H * n_seqs) { + return false; + } + if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) { + return false; + } + + GGML_UNUSED(sess); + return true; +} + static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -2777,33 +2829,34 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { static htp_op_code op_remap_to_htp(const ggml_tensor * t) { switch (t->op) { - case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT; - case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT; - case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID; - case GGML_OP_MUL: return HTP_OP_MUL; - case GGML_OP_ADD: return HTP_OP_ADD; - case GGML_OP_ADD_ID: return HTP_OP_ADD_ID; - case GGML_OP_SUB: return HTP_OP_SUB; - case GGML_OP_DIV: return HTP_OP_DIV; - case GGML_OP_CPY: return HTP_OP_CPY; - case GGML_OP_CONT: return HTP_OP_CPY; - case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS; - case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS; - case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS; - case GGML_OP_ARGSORT: return HTP_OP_ARGSORT; - case GGML_OP_L2_NORM: return HTP_OP_L2_NORM; - case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM; - case GGML_OP_SCALE: return HTP_OP_SCALE; - case GGML_OP_SQR: return HTP_OP_SQR; - case GGML_OP_SQRT: return HTP_OP_SQRT; - case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX; - case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV; - case GGML_OP_ROPE: return HTP_OP_ROPE; - case GGML_OP_REPEAT: return HTP_OP_REPEAT; - case GGML_OP_CUMSUM: return HTP_OP_CUMSUM; - case GGML_OP_FILL: return HTP_OP_FILL; - case GGML_OP_DIAG: return HTP_OP_DIAG; - case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; + case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT; + case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT; + case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID; + case GGML_OP_MUL: return HTP_OP_MUL; + case GGML_OP_ADD: return HTP_OP_ADD; + case GGML_OP_ADD_ID: return HTP_OP_ADD_ID; + case GGML_OP_SUB: return HTP_OP_SUB; + case GGML_OP_DIV: return HTP_OP_DIV; + case GGML_OP_CPY: return HTP_OP_CPY; + case GGML_OP_CONT: return HTP_OP_CPY; + case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS; + case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS; + case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS; + case GGML_OP_ARGSORT: return HTP_OP_ARGSORT; + case GGML_OP_L2_NORM: return HTP_OP_L2_NORM; + case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM; + case GGML_OP_SCALE: return HTP_OP_SCALE; + case GGML_OP_SQR: return HTP_OP_SQR; + case GGML_OP_SQRT: return HTP_OP_SQRT; + case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX; + case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV; + case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET; + case GGML_OP_ROPE: return HTP_OP_ROPE; + case GGML_OP_REPEAT: return HTP_OP_REPEAT; + case GGML_OP_CUMSUM: return HTP_OP_CUMSUM; + case GGML_OP_FILL: return HTP_OP_FILL; + case GGML_OP_DIAG: return HTP_OP_DIAG; + case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; case GGML_OP_UNARY: switch (ggml_get_unary_op(t)) { case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU; @@ -3341,6 +3394,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_ssm_conv(sess, op); break; + case GGML_OP_GATED_DELTA_NET: + supp = ggml_hexagon_supported_gated_delta_net(sess, op); + break; + case GGML_OP_CUMSUM: supp = ggml_hexagon_supported_cumsum(sess, op); break; diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index 7c9e4cda5..bcadac11f 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -37,6 +37,7 @@ add_library(${HTP_LIB} SHARED fill-ops.c diag-ops.c solve-tri-ops.c + gated-delta-net-ops.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c b/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c new file mode 100644 index 000000000..2e84badc9 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c @@ -0,0 +1,955 @@ +#include +#include +#include + +#include "hvx-utils.h" + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" + +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif + +#define HTP_GDN_MAX_SV 128 + +struct htp_gdn_context { + struct htp_ops_context * octx; + uint32_t rows_per_thread; + size_t state_bytes; + bool use_vtcm; + uint8_t * vtcm_state_base; + size_t vtcm_state_per_thread; +}; + +static inline float gdn_mul_dot_f32(float * restrict dst, const float * restrict mul, + const float * restrict dot, uint32_t n) { + HVX_Vector acc = Q6_V_vzero(); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vd = hvx_vmemu(dst + i * epv); + HVX_Vector vm = hvx_vmem(mul + i * epv); + HVX_Vector vdot = hvx_vmem(dot + i * epv); + HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm); + hvx_vmemu(dst + i * epv) = out; + acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vd = hvx_vmemu(dst + off); + HVX_Vector vm = hvx_vmem(mul + off); + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_Vector out = hvx_vec_mul_f32_f32(vd, vm); + hvx_vec_store_u(dst + off, tail * sizeof(float), out); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot); + acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero())); + } + + return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc)); +} + +static inline float gdn_mul_scalar_dot_f32(float * restrict dst, float mul, + const float * restrict dot, uint32_t n) { + HVX_Vector acc = Q6_V_vzero(); + const HVX_Vector vmul = hvx_vec_splat_f32(mul); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vd = hvx_vmemu(dst + i * epv); + HVX_Vector vdot = hvx_vmem(dot + i * epv); + HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul); + hvx_vmemu(dst + i * epv) = out; + acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vd = hvx_vmemu(dst + off); + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_Vector out = hvx_vec_mul_f32_f32(vd, vmul); + hvx_vec_store_u(dst + off, tail * sizeof(float), out); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot); + acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero())); + } + + return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc)); +} + +static inline float gdn_add_scaled_dot_f32(float * restrict dst, const float * restrict src, + float scale, const float * restrict dot, uint32_t n) { + HVX_Vector acc = Q6_V_vzero(); + const HVX_Vector vscale = hvx_vec_splat_f32(scale); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vd = hvx_vmemu(dst + i * epv); + HVX_Vector vs = hvx_vmem(src + i * epv); + HVX_Vector vdot = hvx_vmem(dot + i * epv); + HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale)); + hvx_vmemu(dst + i * epv) = out; + acc = hvx_vec_add_f32_f32(acc, hvx_vec_mul_f32_f32(out, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vd = hvx_vmemu(dst + off); + HVX_Vector vs = hvx_vmem(src + off); + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_Vector out = hvx_vec_add_f32_f32(vd, hvx_vec_mul_f32_f32(vs, vscale)); + hvx_vec_store_u(dst + off, tail * sizeof(float), out); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector prod = hvx_vec_mul_f32_f32(out, vdot); + acc = hvx_vec_add_f32_f32(acc, Q6_V_vmux_QVV(mask, prod, Q6_V_vzero())); + } + + return hvx_vec_get_f32(hvx_vec_reduce_sum_f32(acc)); +} + +static inline void gdn_mul_dot4_f32(float * restrict dst0, float * restrict dst1, + float * restrict dst2, float * restrict dst3, const float * restrict mul, + const float * restrict dot, uint32_t n, float * restrict sums) { + HVX_Vector acc0 = Q6_V_vzero(); + HVX_Vector acc1 = Q6_V_vzero(); + HVX_Vector acc2 = Q6_V_vzero(); + HVX_Vector acc3 = Q6_V_vzero(); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vm = hvx_vmem(mul + i * epv); + HVX_Vector vdot = hvx_vmem(dot + i * epv); + + HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm); + HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm); + HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm); + HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm); + + hvx_vmemu(dst0 + i * epv) = out0; + hvx_vmemu(dst1 + i * epv) = out1; + hvx_vmemu(dst2 + i * epv) = out2; + hvx_vmemu(dst3 + i * epv) = out3; + + acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot)); + acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot)); + acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot)); + acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vm = hvx_vmem(mul + off); + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm); + HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm); + HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm); + HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm); + + hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0); + hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1); + hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2); + hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3); + + acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero)); + acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero)); + acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero)); + acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero)); + } + + HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } }; + hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc)); +} + +static inline void gdn_mul_scalar_dot4_f32(float * restrict dst0, float * restrict dst1, + float * restrict dst2, float * restrict dst3, float mul, + const float * restrict dot, uint32_t n, float * restrict sums) { + HVX_Vector acc0 = Q6_V_vzero(); + HVX_Vector acc1 = Q6_V_vzero(); + HVX_Vector acc2 = Q6_V_vzero(); + HVX_Vector acc3 = Q6_V_vzero(); + const HVX_Vector vmul = hvx_vec_splat_f32(mul); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vdot = hvx_vmem(dot + i * epv); + + HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul); + HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul); + HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul); + HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul); + + hvx_vmemu(dst0 + i * epv) = out0; + hvx_vmemu(dst1 + i * epv) = out1; + hvx_vmemu(dst2 + i * epv) = out2; + hvx_vmemu(dst3 + i * epv) = out3; + + acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot)); + acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot)); + acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot)); + acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul); + HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul); + HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul); + HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul); + + hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0); + hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1); + hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2); + hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3); + + acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero)); + acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero)); + acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero)); + acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero)); + } + + HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } }; + hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc)); +} + +static inline void gdn_add_scaled_dot4_f32(float * restrict dst0, float * restrict dst1, + float * restrict dst2, float * restrict dst3, const float * restrict src, + const float * restrict scale, const float * restrict dot, uint32_t n, + float * restrict sums) { + HVX_Vector acc0 = Q6_V_vzero(); + HVX_Vector acc1 = Q6_V_vzero(); + HVX_Vector acc2 = Q6_V_vzero(); + HVX_Vector acc3 = Q6_V_vzero(); + const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]); + const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]); + const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]); + const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vs = hvx_vmem(src + i * epv); + HVX_Vector vdot = hvx_vmem(dot + i * epv); + + HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0)); + HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1)); + HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2)); + HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3)); + + hvx_vmemu(dst0 + i * epv) = out0; + hvx_vmemu(dst1 + i * epv) = out1; + hvx_vmemu(dst2 + i * epv) = out2; + hvx_vmemu(dst3 + i * epv) = out3; + + acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot)); + acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot)); + acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot)); + acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vs = hvx_vmem(src + off); + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0)); + HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1)); + HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2)); + HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3)); + + hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0); + hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1); + hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2); + hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3); + + acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero)); + acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero)); + acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero)); + acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero)); + } + + HVX_Vector_x4 acc = { .v = { acc0, acc1, acc2, acc3 } }; + hvx_vec_store_u(sums, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(acc)); +} + +static inline void gdn_mul_dot8_f32(float * restrict dst0, float * restrict dst1, + float * restrict dst2, float * restrict dst3, float * restrict dst4, + float * restrict dst5, float * restrict dst6, float * restrict dst7, + const float * restrict mul, const float * restrict dot, uint32_t n, + float * restrict sums) { + HVX_Vector acc0 = Q6_V_vzero(); + HVX_Vector acc1 = Q6_V_vzero(); + HVX_Vector acc2 = Q6_V_vzero(); + HVX_Vector acc3 = Q6_V_vzero(); + HVX_Vector acc4 = Q6_V_vzero(); + HVX_Vector acc5 = Q6_V_vzero(); + HVX_Vector acc6 = Q6_V_vzero(); + HVX_Vector acc7 = Q6_V_vzero(); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vm = hvx_vmem(mul + i * epv); + HVX_Vector vdot = hvx_vmem(dot + i * epv); + + HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vm); + HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vm); + HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vm); + HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vm); + HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vm); + HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vm); + HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vm); + HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vm); + + hvx_vmemu(dst0 + i * epv) = out0; + hvx_vmemu(dst1 + i * epv) = out1; + hvx_vmemu(dst2 + i * epv) = out2; + hvx_vmemu(dst3 + i * epv) = out3; + hvx_vmemu(dst4 + i * epv) = out4; + hvx_vmemu(dst5 + i * epv) = out5; + hvx_vmemu(dst6 + i * epv) = out6; + hvx_vmemu(dst7 + i * epv) = out7; + + acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot)); + acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot)); + acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot)); + acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot)); + acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot)); + acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot)); + acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot)); + acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vm = hvx_vmem(mul + off); + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vm); + HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vm); + HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vm); + HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vm); + HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vm); + HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vm); + HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vm); + HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vm); + + hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0); + hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1); + hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2); + hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3); + hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4); + hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5); + hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6); + hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7); + + acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero)); + acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero)); + acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero)); + acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero)); + acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero)); + acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero)); + acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero)); + acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero)); + } + + HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } }; + HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } }; + hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA)); + hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB)); +} + +static inline void gdn_mul_scalar_dot8_f32(float * restrict dst0, float * restrict dst1, + float * restrict dst2, float * restrict dst3, float * restrict dst4, + float * restrict dst5, float * restrict dst6, float * restrict dst7, + float mul, const float * restrict dot, uint32_t n, float * restrict sums) { + HVX_Vector acc0 = Q6_V_vzero(); + HVX_Vector acc1 = Q6_V_vzero(); + HVX_Vector acc2 = Q6_V_vzero(); + HVX_Vector acc3 = Q6_V_vzero(); + HVX_Vector acc4 = Q6_V_vzero(); + HVX_Vector acc5 = Q6_V_vzero(); + HVX_Vector acc6 = Q6_V_vzero(); + HVX_Vector acc7 = Q6_V_vzero(); + const HVX_Vector vmul = hvx_vec_splat_f32(mul); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vdot = hvx_vmem(dot + i * epv); + + HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + i * epv), vmul); + HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + i * epv), vmul); + HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + i * epv), vmul); + HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + i * epv), vmul); + HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + i * epv), vmul); + HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + i * epv), vmul); + HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + i * epv), vmul); + HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + i * epv), vmul); + + hvx_vmemu(dst0 + i * epv) = out0; + hvx_vmemu(dst1 + i * epv) = out1; + hvx_vmemu(dst2 + i * epv) = out2; + hvx_vmemu(dst3 + i * epv) = out3; + hvx_vmemu(dst4 + i * epv) = out4; + hvx_vmemu(dst5 + i * epv) = out5; + hvx_vmemu(dst6 + i * epv) = out6; + hvx_vmemu(dst7 + i * epv) = out7; + + acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot)); + acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot)); + acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot)); + acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot)); + acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot)); + acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot)); + acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot)); + acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector out0 = hvx_vec_mul_f32_f32(hvx_vmemu(dst0 + off), vmul); + HVX_Vector out1 = hvx_vec_mul_f32_f32(hvx_vmemu(dst1 + off), vmul); + HVX_Vector out2 = hvx_vec_mul_f32_f32(hvx_vmemu(dst2 + off), vmul); + HVX_Vector out3 = hvx_vec_mul_f32_f32(hvx_vmemu(dst3 + off), vmul); + HVX_Vector out4 = hvx_vec_mul_f32_f32(hvx_vmemu(dst4 + off), vmul); + HVX_Vector out5 = hvx_vec_mul_f32_f32(hvx_vmemu(dst5 + off), vmul); + HVX_Vector out6 = hvx_vec_mul_f32_f32(hvx_vmemu(dst6 + off), vmul); + HVX_Vector out7 = hvx_vec_mul_f32_f32(hvx_vmemu(dst7 + off), vmul); + + hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0); + hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1); + hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2); + hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3); + hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4); + hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5); + hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6); + hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7); + + acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero)); + acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero)); + acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero)); + acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero)); + acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero)); + acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero)); + acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero)); + acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero)); + } + + HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } }; + HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } }; + hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA)); + hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB)); +} + +static inline void gdn_add_scaled_dot8_f32(float * restrict dst0, float * restrict dst1, + float * restrict dst2, float * restrict dst3, float * restrict dst4, + float * restrict dst5, float * restrict dst6, float * restrict dst7, + const float * restrict src, const float * restrict scale, + const float * restrict dot, uint32_t n, float * restrict sums) { + HVX_Vector acc0 = Q6_V_vzero(); + HVX_Vector acc1 = Q6_V_vzero(); + HVX_Vector acc2 = Q6_V_vzero(); + HVX_Vector acc3 = Q6_V_vzero(); + HVX_Vector acc4 = Q6_V_vzero(); + HVX_Vector acc5 = Q6_V_vzero(); + HVX_Vector acc6 = Q6_V_vzero(); + HVX_Vector acc7 = Q6_V_vzero(); + const HVX_Vector scale0 = hvx_vec_splat_f32(scale[0]); + const HVX_Vector scale1 = hvx_vec_splat_f32(scale[1]); + const HVX_Vector scale2 = hvx_vec_splat_f32(scale[2]); + const HVX_Vector scale3 = hvx_vec_splat_f32(scale[3]); + const HVX_Vector scale4 = hvx_vec_splat_f32(scale[4]); + const HVX_Vector scale5 = hvx_vec_splat_f32(scale[5]); + const HVX_Vector scale6 = hvx_vec_splat_f32(scale[6]); + const HVX_Vector scale7 = hvx_vec_splat_f32(scale[7]); + + const uint32_t epv = 128 / sizeof(float); + const uint32_t nvec = n / epv; + const uint32_t tail = n % epv; + for (uint32_t i = 0; i < nvec; ++i) { + HVX_Vector vs = hvx_vmem(src + i * epv); + HVX_Vector vdot = hvx_vmem(dot + i * epv); + + HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + i * epv), hvx_vec_mul_f32_f32(vs, scale0)); + HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + i * epv), hvx_vec_mul_f32_f32(vs, scale1)); + HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + i * epv), hvx_vec_mul_f32_f32(vs, scale2)); + HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + i * epv), hvx_vec_mul_f32_f32(vs, scale3)); + HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + i * epv), hvx_vec_mul_f32_f32(vs, scale4)); + HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + i * epv), hvx_vec_mul_f32_f32(vs, scale5)); + HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + i * epv), hvx_vec_mul_f32_f32(vs, scale6)); + HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + i * epv), hvx_vec_mul_f32_f32(vs, scale7)); + + hvx_vmemu(dst0 + i * epv) = out0; + hvx_vmemu(dst1 + i * epv) = out1; + hvx_vmemu(dst2 + i * epv) = out2; + hvx_vmemu(dst3 + i * epv) = out3; + hvx_vmemu(dst4 + i * epv) = out4; + hvx_vmemu(dst5 + i * epv) = out5; + hvx_vmemu(dst6 + i * epv) = out6; + hvx_vmemu(dst7 + i * epv) = out7; + + acc0 = hvx_vec_add_f32_f32(acc0, hvx_vec_mul_f32_f32(out0, vdot)); + acc1 = hvx_vec_add_f32_f32(acc1, hvx_vec_mul_f32_f32(out1, vdot)); + acc2 = hvx_vec_add_f32_f32(acc2, hvx_vec_mul_f32_f32(out2, vdot)); + acc3 = hvx_vec_add_f32_f32(acc3, hvx_vec_mul_f32_f32(out3, vdot)); + acc4 = hvx_vec_add_f32_f32(acc4, hvx_vec_mul_f32_f32(out4, vdot)); + acc5 = hvx_vec_add_f32_f32(acc5, hvx_vec_mul_f32_f32(out5, vdot)); + acc6 = hvx_vec_add_f32_f32(acc6, hvx_vec_mul_f32_f32(out6, vdot)); + acc7 = hvx_vec_add_f32_f32(acc7, hvx_vec_mul_f32_f32(out7, vdot)); + } + + if (tail) { + const uint32_t off = nvec * epv; + HVX_Vector vs = hvx_vmem(src + off); + HVX_Vector vdot = hvx_vmem(dot + off); + HVX_VectorPred mask = Q6_Q_vsetq2_R(tail * sizeof(float)); + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector out0 = hvx_vec_add_f32_f32(hvx_vmemu(dst0 + off), hvx_vec_mul_f32_f32(vs, scale0)); + HVX_Vector out1 = hvx_vec_add_f32_f32(hvx_vmemu(dst1 + off), hvx_vec_mul_f32_f32(vs, scale1)); + HVX_Vector out2 = hvx_vec_add_f32_f32(hvx_vmemu(dst2 + off), hvx_vec_mul_f32_f32(vs, scale2)); + HVX_Vector out3 = hvx_vec_add_f32_f32(hvx_vmemu(dst3 + off), hvx_vec_mul_f32_f32(vs, scale3)); + HVX_Vector out4 = hvx_vec_add_f32_f32(hvx_vmemu(dst4 + off), hvx_vec_mul_f32_f32(vs, scale4)); + HVX_Vector out5 = hvx_vec_add_f32_f32(hvx_vmemu(dst5 + off), hvx_vec_mul_f32_f32(vs, scale5)); + HVX_Vector out6 = hvx_vec_add_f32_f32(hvx_vmemu(dst6 + off), hvx_vec_mul_f32_f32(vs, scale6)); + HVX_Vector out7 = hvx_vec_add_f32_f32(hvx_vmemu(dst7 + off), hvx_vec_mul_f32_f32(vs, scale7)); + + hvx_vec_store_u(dst0 + off, tail * sizeof(float), out0); + hvx_vec_store_u(dst1 + off, tail * sizeof(float), out1); + hvx_vec_store_u(dst2 + off, tail * sizeof(float), out2); + hvx_vec_store_u(dst3 + off, tail * sizeof(float), out3); + hvx_vec_store_u(dst4 + off, tail * sizeof(float), out4); + hvx_vec_store_u(dst5 + off, tail * sizeof(float), out5); + hvx_vec_store_u(dst6 + off, tail * sizeof(float), out6); + hvx_vec_store_u(dst7 + off, tail * sizeof(float), out7); + + acc0 = hvx_vec_add_f32_f32(acc0, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out0, vdot), zero)); + acc1 = hvx_vec_add_f32_f32(acc1, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out1, vdot), zero)); + acc2 = hvx_vec_add_f32_f32(acc2, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out2, vdot), zero)); + acc3 = hvx_vec_add_f32_f32(acc3, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out3, vdot), zero)); + acc4 = hvx_vec_add_f32_f32(acc4, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out4, vdot), zero)); + acc5 = hvx_vec_add_f32_f32(acc5, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out5, vdot), zero)); + acc6 = hvx_vec_add_f32_f32(acc6, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out6, vdot), zero)); + acc7 = hvx_vec_add_f32_f32(acc7, Q6_V_vmux_QVV(mask, hvx_vec_mul_f32_f32(out7, vdot), zero)); + } + + HVX_Vector_x4 accA = { .v = { acc0, acc1, acc2, acc3 } }; + HVX_Vector_x4 accB = { .v = { acc4, acc5, acc6, acc7 } }; + hvx_vec_store_u(sums + 0, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accA)); + hvx_vec_store_u(sums + 4, 4 * sizeof(float), hvx_vec_reduce_sum_f32x4(accB)); +} + +static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_gdn_context * gctx = (struct htp_gdn_context *) data; + struct htp_ops_context * octx = gctx->octx; + + const struct htp_tensor * q = octx->src[0]; + const struct htp_tensor * k = octx->src[1]; + const struct htp_tensor * v = octx->src[2]; + const struct htp_tensor * g = octx->src[3]; + const struct htp_tensor * beta = octx->src[4]; + const struct htp_tensor * state = octx->src[5]; + const struct htp_tensor * dst = octx->dst; + + const uint32_t S_v = v->ne[0]; + const uint32_t H = v->ne[1]; + const uint32_t n_tokens = v->ne[2]; + const uint32_t n_seqs = v->ne[3]; + + const uint32_t total_rows = H * n_seqs; + if (ith >= total_rows) { + return; + } + + const uint32_t rq3 = n_seqs / q->ne[3]; + const uint32_t rk3 = n_seqs / k->ne[3]; + const float scale = 1.0f / sqrtf((float) S_v); + + float * dst_base = (float *) (uintptr_t) dst->data; + float * state_out_base = dst_base + (uint64_t) S_v * H * n_tokens * n_seqs; + const float * state_in_base = (const float *) (uintptr_t) state->data; + + const bool kda = (g->ne[0] == S_v); + float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128))); + float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128))); + float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128))); + float local_sums[4] __attribute__((aligned(128))); + + for (uint32_t ir = ith; ir < total_rows; ir += nth) { + const uint32_t iv1 = ir % H; + const uint32_t iv3 = ir / H; + + const uint32_t iq1 = iv1 % q->ne[1]; + const uint32_t ik1 = iv1 % k->ne[1]; + const uint32_t iq3 = iv3 / rq3; + const uint32_t ik3 = iv3 / rk3; + + float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + + memcpy(s_out, s_in, gctx->state_bytes); + float * s_work = s_out; + + float * attn_data = dst_base + ((uint64_t) iv3 * n_tokens * H + iv1) * S_v; + + for (uint32_t t = 0; t < n_tokens; ++t) { + const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data + + (uint64_t) iq3 * q->nb[3] + (uint64_t) t * q->nb[2] + (uint64_t) iq1 * q->nb[1]); + const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data + + (uint64_t) ik3 * k->nb[3] + (uint64_t) t * k->nb[2] + (uint64_t) ik1 * k->nb[1]); + const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data + + (uint64_t) iv3 * v->nb[3] + (uint64_t) t * v->nb[2] + (uint64_t) iv1 * v->nb[1]); + const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data + + (uint64_t) iv3 * g->nb[3] + (uint64_t) t * g->nb[2] + (uint64_t) iv1 * g->nb[1]); + const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data + + (uint64_t) iv3 * beta->nb[3] + (uint64_t) t * beta->nb[2] + (uint64_t) iv1 * beta->nb[1]); + + memcpy(local_q, q_t, (size_t) S_v * sizeof(float)); + memcpy(local_k, k_t, (size_t) S_v * sizeof(float)); + + if (kda) { + hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false); + + uint32_t j = 0; + for (; j + 4 <= S_v; j += 4) { + float * row0 = s_work + (uint64_t) (j + 0) * S_v; + float * row1 = s_work + (uint64_t) (j + 1) * S_v; + float * row2 = s_work + (uint64_t) (j + 2) * S_v; + float * row3 = s_work + (uint64_t) (j + 3) * S_v; + gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums); + float local_delta_b[4] __attribute__((aligned(128))); + for (uint32_t r = 0; r < 4; ++r) { + local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val; + } + gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums); + for (uint32_t r = 0; r < 4; ++r) { + attn_data[j + r] = local_sums[r] * scale; + } + } + for (; j < S_v; ++j) { + float * row = s_work + (uint64_t) j * S_v; + const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v); + const float dj = (v_t[j] - sum) * beta_val; + attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale; + } + } else { + const float gate = expf(g_t[0]); + uint32_t j = 0; + for (; j + 4 <= S_v; j += 4) { + float * row0 = s_work + (uint64_t) (j + 0) * S_v; + float * row1 = s_work + (uint64_t) (j + 1) * S_v; + float * row2 = s_work + (uint64_t) (j + 2) * S_v; + float * row3 = s_work + (uint64_t) (j + 3) * S_v; + gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums); + float local_delta_b[4] __attribute__((aligned(128))); + for (uint32_t r = 0; r < 4; ++r) { + local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val; + } + gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums); + for (uint32_t r = 0; r < 4; ++r) { + attn_data[j + r] = local_sums[r] * scale; + } + } + for (; j < S_v; ++j) { + float * row = s_work + (uint64_t) j * S_v; + const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v); + const float dj = (v_t[j] - sum) * beta_val; + attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale; + } + } + + attn_data += (uint64_t) S_v * H; + } + } +} + +static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_gdn_context * gctx = (struct htp_gdn_context *) data; + struct htp_ops_context * octx = gctx->octx; + + const struct htp_tensor * q = octx->src[0]; + const struct htp_tensor * k = octx->src[1]; + const struct htp_tensor * v = octx->src[2]; + const struct htp_tensor * g = octx->src[3]; + const struct htp_tensor * beta = octx->src[4]; + const struct htp_tensor * state = octx->src[5]; + const struct htp_tensor * dst = octx->dst; + + const uint32_t S_v = v->ne[0]; + const uint32_t H = v->ne[1]; + const uint32_t n_seqs = v->ne[3]; + + const uint32_t total_rows = H * n_seqs; + if (ith >= total_rows) { + return; + } + + const uint32_t rq3 = n_seqs / q->ne[3]; + const uint32_t rk3 = n_seqs / k->ne[3]; + const float scale = 1.0f / sqrtf((float) S_v); + + float * dst_base = (float *) (uintptr_t) dst->data; + float * state_out_base = dst_base + (uint64_t) S_v * H * n_seqs; + const float * state_in_base = (const float *) (uintptr_t) state->data; + + const bool kda = (g->ne[0] == S_v); + float local_gate[HTP_GDN_MAX_SV] __attribute__((aligned(128))); + float local_q[HTP_GDN_MAX_SV] __attribute__((aligned(128))); + float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128))); + float local_sums[8] __attribute__((aligned(128))); + + dma_queue * dma = octx->ctx->dma[ith]; + + uint8_t * spad = NULL; + if (gctx->use_vtcm) { + spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith; + } + + for (uint32_t ir = ith; ir < total_rows; ir += nth) { + const uint32_t iv1 = ir % H; + const uint32_t iv3 = ir / H; + + const uint32_t iq1 = iv1 % q->ne[1]; + const uint32_t ik1 = iv1 % k->ne[1]; + const uint32_t iq3 = iv3 / rq3; + const uint32_t ik3 = iv3 / rk3; + + float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + float * s_work; + + if (spad) { + dma_queue_push(dma, dma_make_ptr(spad, s_in), + S_v * sizeof(float), S_v * sizeof(float), + S_v * sizeof(float), S_v); + dma_queue_pop(dma); + s_work = (float *) spad; + } else { + s_work = s_out; + memcpy(s_work, s_in, gctx->state_bytes); + } + + float * attn_data = dst_base + ((uint64_t) iv3 * H + iv1) * S_v; + + const float * q_t = (const float *) ((const uint8_t *) (uintptr_t) q->data + + (uint64_t) iq3 * q->nb[3] + (uint64_t) iq1 * q->nb[1]); + const float * k_t = (const float *) ((const uint8_t *) (uintptr_t) k->data + + (uint64_t) ik3 * k->nb[3] + (uint64_t) ik1 * k->nb[1]); + const float * v_t = (const float *) ((const uint8_t *) (uintptr_t) v->data + + (uint64_t) iv3 * v->nb[3] + (uint64_t) iv1 * v->nb[1]); + const float * g_t = (const float *) ((const uint8_t *) (uintptr_t) g->data + + (uint64_t) iv3 * g->nb[3] + (uint64_t) iv1 * g->nb[1]); + const float beta_val = *(const float *) ((const uint8_t *) (uintptr_t) beta->data + + (uint64_t) iv3 * beta->nb[3] + (uint64_t) iv1 * beta->nb[1]); + + memcpy(local_q, q_t, (size_t) S_v * sizeof(float)); + memcpy(local_k, k_t, (size_t) S_v * sizeof(float)); + + if (kda) { + hvx_exp_f32((uint8_t *) local_gate, (const uint8_t *) g_t, S_v, false); + + uint32_t j = 0; + for (; j + 8 <= S_v; j += 8) { + float * row0 = s_work + (uint64_t) (j + 0) * S_v; + float * row1 = s_work + (uint64_t) (j + 1) * S_v; + float * row2 = s_work + (uint64_t) (j + 2) * S_v; + float * row3 = s_work + (uint64_t) (j + 3) * S_v; + float * row4 = s_work + (uint64_t) (j + 4) * S_v; + float * row5 = s_work + (uint64_t) (j + 5) * S_v; + float * row6 = s_work + (uint64_t) (j + 6) * S_v; + float * row7 = s_work + (uint64_t) (j + 7) * S_v; + gdn_mul_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7, + local_gate, local_k, S_v, local_sums); + float local_delta_b[8] __attribute__((aligned(128))); + for (uint32_t r = 0; r < 8; ++r) { + local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val; + } + gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7, + local_k, local_delta_b, local_q, S_v, local_sums); + for (uint32_t r = 0; r < 8; ++r) { + attn_data[j + r] = local_sums[r] * scale; + } + } + for (; j + 4 <= S_v; j += 4) { + float * row0 = s_work + (uint64_t) (j + 0) * S_v; + float * row1 = s_work + (uint64_t) (j + 1) * S_v; + float * row2 = s_work + (uint64_t) (j + 2) * S_v; + float * row3 = s_work + (uint64_t) (j + 3) * S_v; + gdn_mul_dot4_f32(row0, row1, row2, row3, local_gate, local_k, S_v, local_sums); + float local_delta_b[4] __attribute__((aligned(128))); + for (uint32_t r = 0; r < 4; ++r) { + local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val; + } + gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums); + for (uint32_t r = 0; r < 4; ++r) { + attn_data[j + r] = local_sums[r] * scale; + } + } + for (; j < S_v; ++j) { + float * row = s_work + (uint64_t) j * S_v; + const float sum = gdn_mul_dot_f32(row, local_gate, local_k, S_v); + const float dj = (v_t[j] - sum) * beta_val; + attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale; + } + } else { + const float gate = expf(g_t[0]); + uint32_t j = 0; + for (; j + 8 <= S_v; j += 8) { + float * row0 = s_work + (uint64_t) (j + 0) * S_v; + float * row1 = s_work + (uint64_t) (j + 1) * S_v; + float * row2 = s_work + (uint64_t) (j + 2) * S_v; + float * row3 = s_work + (uint64_t) (j + 3) * S_v; + float * row4 = s_work + (uint64_t) (j + 4) * S_v; + float * row5 = s_work + (uint64_t) (j + 5) * S_v; + float * row6 = s_work + (uint64_t) (j + 6) * S_v; + float * row7 = s_work + (uint64_t) (j + 7) * S_v; + gdn_mul_scalar_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7, + gate, local_k, S_v, local_sums); + float local_delta_b[8] __attribute__((aligned(128))); + for (uint32_t r = 0; r < 8; ++r) { + local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val; + } + gdn_add_scaled_dot8_f32(row0, row1, row2, row3, row4, row5, row6, row7, + local_k, local_delta_b, local_q, S_v, local_sums); + for (uint32_t r = 0; r < 8; ++r) { + attn_data[j + r] = local_sums[r] * scale; + } + } + for (; j + 4 <= S_v; j += 4) { + float * row0 = s_work + (uint64_t) (j + 0) * S_v; + float * row1 = s_work + (uint64_t) (j + 1) * S_v; + float * row2 = s_work + (uint64_t) (j + 2) * S_v; + float * row3 = s_work + (uint64_t) (j + 3) * S_v; + gdn_mul_scalar_dot4_f32(row0, row1, row2, row3, gate, local_k, S_v, local_sums); + float local_delta_b[4] __attribute__((aligned(128))); + for (uint32_t r = 0; r < 4; ++r) { + local_delta_b[r] = (v_t[j + r] - local_sums[r]) * beta_val; + } + gdn_add_scaled_dot4_f32(row0, row1, row2, row3, local_k, local_delta_b, local_q, S_v, local_sums); + for (uint32_t r = 0; r < 4; ++r) { + attn_data[j + r] = local_sums[r] * scale; + } + } + for (; j < S_v; ++j) { + float * row = s_work + (uint64_t) j * S_v; + const float sum = gdn_mul_scalar_dot_f32(row, gate, local_k, S_v); + const float dj = (v_t[j] - sum) * beta_val; + attn_data[j] = gdn_add_scaled_dot_f32(row, local_k, dj, local_q, S_v) * scale; + } + } + + if (spad) { + dma_queue_push(dma, dma_make_ptr(s_out, spad), + S_v * sizeof(float), S_v * sizeof(float), + S_v * sizeof(float), S_v); + dma_queue_pop(dma); + } + } +} + +int op_gated_delta_net(struct htp_ops_context * octx) { + const struct htp_tensor * q = octx->src[0]; + const struct htp_tensor * k = octx->src[1]; + const struct htp_tensor * v = octx->src[2]; + const struct htp_tensor * g = octx->src[3]; + const struct htp_tensor * beta = octx->src[4]; + const struct htp_tensor * state = octx->src[5]; + const struct htp_tensor * dst = octx->dst; + + if (!q || !k || !v || !g || !beta || !state || !dst) { + return HTP_STATUS_INVAL_PARAMS; + } + + if (q->type != HTP_TYPE_F32 || k->type != HTP_TYPE_F32 || v->type != HTP_TYPE_F32 || + g->type != HTP_TYPE_F32 || beta->type != HTP_TYPE_F32 || state->type != HTP_TYPE_F32 || + dst->type != HTP_TYPE_F32) { + return HTP_STATUS_NO_SUPPORT; + } + + const uint32_t S_v = v->ne[0]; + const uint32_t H = v->ne[1]; + const uint32_t n_tokens = v->ne[2]; + const uint32_t n_seqs = v->ne[3]; + + if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) { + return HTP_STATUS_NO_SUPPORT; + } + if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) { + return HTP_STATUS_NO_SUPPORT; + } + if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] == 0 || k->ne[1] == 0 || + q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] == 0 || k->ne[3] == 0 || + (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) { + return HTP_STATUS_NO_SUPPORT; + } + if (state->ne[0] * state->ne[1] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) { + return HTP_STATUS_NO_SUPPORT; + } + if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) { + return HTP_STATUS_NO_SUPPORT; + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + + struct htp_gdn_context gctx; + gctx.octx = octx; + gctx.rows_per_thread = (H * n_seqs + octx->n_threads - 1) / octx->n_threads; + gctx.state_bytes = (size_t) S_v * S_v * sizeof(float); + + size_t state_aligned = (size_t) S_v * S_v * sizeof(float); + state_aligned = (state_aligned + 127) & ~(size_t)127; + + gctx.use_vtcm = false; + gctx.vtcm_state_base = NULL; + gctx.vtcm_state_per_thread = 0; + + if (n_tokens == 1 && octx->ctx->vtcm_base) { + size_t vtcm_total = state_aligned * octx->n_threads; + if (octx->ctx->vtcm_size >= vtcm_total) { + gctx.use_vtcm = true; + gctx.vtcm_state_base = octx->ctx->vtcm_base; + gctx.vtcm_state_per_thread = state_aligned; + } + } + + if (n_tokens == 1) { + worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_tg_thread, &gctx, octx->n_threads); + } else { + worker_pool_run_func(octx->ctx->worker_pool, gated_delta_net_f32_pp_thread, &gctx, octx->n_threads); + } + + return HTP_STATUS_OK; +} diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index e9c563ca8..92f02eac6 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -106,5 +106,6 @@ int op_cumsum(struct htp_ops_context * octx); int op_fill(struct htp_ops_context * octx); int op_diag(struct htp_ops_context * octx); int op_solve_tri(struct htp_ops_context * octx); +int op_gated_delta_net(struct htp_ops_context * octx); #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index ef96ad382..6203e3848 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -84,6 +84,7 @@ enum htp_op_code { HTP_OP_DIAG, HTP_OP_SOLVE_TRI, HTP_OP_L2_NORM, + HTP_OP_GATED_DELTA_NET, HTP_OP_INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index e18f1a0e6..fa1e0698f 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -594,6 +594,9 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_SOLVE_TRI: return op_solve_tri(octx); + case HTP_OP_GATED_DELTA_NET: + return op_gated_delta_net(octx); + case HTP_OP_INVALID: break;