hexagon: enable non-contiguous row tensor support for unary ops (llama/22574)

This commit is contained in:
Aparna M P 2026-05-01 22:39:23 +05:30 committed by Georgi Gerganov
parent 9623c1203b
commit f2ce24fa5c
3 changed files with 85 additions and 33 deletions

View File

@ -2421,8 +2421,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
return false;
}
// TODO: add support for non-contigiuos tensors
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
// TODO: add support for non-contiguous elements within a row
if (!ggml_is_contiguous_rows(src0) || !ggml_is_contiguous_rows(dst)) {
return false;
}

View File

@ -17,7 +17,7 @@
#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805
#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408
#define EXP_ONE (0x3f800000) // 1.0
#define EXP_RANGE_R (0x42B16666) // 88.7
#define EXP_RANGE_R (0x42B17218) // ln(FLT_MAX) approx = 88.7228
#define EXP_RANGE_L (0xC2B00000) // -88.0 (approx log(FLT_MIN))
static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
@ -163,7 +163,7 @@ static inline void hvx_exp_f32(uint8_t * restrict dst, const uint8_t * restrict
HVX_Vector vec_out = Q6_V_vzero();
static const float kInf = INFINITY;
static const float kMaxExp = 88.7f;
static const float kMaxExp = 88.7228f;
const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp);
const HVX_Vector inf = hvx_vec_splat_f32(kInf);

View File

@ -26,8 +26,8 @@ struct htp_unary_context {
const uint8_t * data_src0;
uint8_t * data_dst;
size_t src0_row_size;
size_t dst_row_size;
size_t src0_data_row_size; // actual data bytes per row
size_t dst_data_row_size; // actual data bytes per row
size_t src0_row_size_aligned;
size_t dst_row_size_aligned;
@ -41,6 +41,40 @@ struct htp_unary_context {
uint32_t nc;
};
// Convert flat row index to DDR byte offset using the tensor's actual strides.
// ir = i1 + ne1*(i2 + ne2*i3) => offset = i1*nb1 + i2*nb2 + i3*nb3
// Map a flat row index `ir` to its DDR byte offset using the tensor's
// actual strides. The flat index encodes ir = i1 + ne1*(i2 + ne2*i3),
// so the byte offset is i1*nb1 + i2*nb2 + i3*nb3.
static inline size_t unary_row_offset(uint32_t ir,
                                      uint32_t ne1, uint32_t ne2,
                                      size_t nb1, size_t nb2, size_t nb3) {
    const uint32_t rows_per_i3 = ne1 * ne2;          // rows in one dim-3 slab
    const uint32_t i3  = ir / rows_per_i3;
    const uint32_t rem = ir - i3 * rows_per_i3;      // index within the slab
    const uint32_t i2  = rem / ne1;
    const uint32_t i1  = rem - i2 * ne1;
    return i1 * nb1 + i2 * nb2 + i3 * nb3;
}
// Safe DMA block size from row `ir`: clamp to the tighter dim-1 slice
// boundary of src and dst so the nb1 stride stays valid for all rows.
// Largest safe DMA block starting at row `ir`: never run past `end_row`, and
// for a non-contiguous tensor never cross a dim-1 slice boundary, since the
// nb1 stride is only uniform within one slice.
static inline uint32_t unary_block_size(uint32_t ir,
                                        uint32_t end_row,
                                        uint32_t block,
                                        bool src_contig,
                                        bool dst_contig,
                                        uint32_t src_ne1,
                                        uint32_t dst_ne1) {
    const uint32_t remaining = end_row - ir;
    uint32_t limit = (block < remaining) ? block : remaining;
    if (!src_contig) {
        // rows left before the next dim-1 slice boundary of src
        const uint32_t to_src_end = src_ne1 - (ir % src_ne1);
        if (to_src_end < limit) {
            limit = to_src_end;
        }
    }
    if (!dst_contig) {
        // rows left before the next dim-1 slice boundary of dst
        const uint32_t to_dst_end = dst_ne1 - (ir % dst_ne1);
        if (to_dst_end < limit) {
            limit = to_dst_end;
        }
    }
    return limit;
}
#define htp_unary_preamble \
const uint32_t ne00 = src->ne[0]; \
const uint32_t ne01 = src->ne[1]; \
@ -276,8 +310,8 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
int32_t * op_params = octx->op_params;
uint32_t src0_nrows_per_thread = uctx->src0_nrows_per_thread;
const size_t src0_row_size = uctx->src0_row_size;
const size_t dst_row_size = uctx->dst_row_size;
const size_t src0_data_row_size = uctx->src0_data_row_size;
const size_t dst_data_row_size = uctx->dst_data_row_size;
const size_t src0_row_size_aligned = uctx->src0_row_size_aligned;
const size_t dst_row_size_aligned = uctx->dst_row_size_aligned;
@ -303,7 +337,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
size_t src0_spad_half_size = uctx->src0_spad_half_size;
size_t dst_spad_half_size = uctx->dst_spad_half_size;
const int BLOCK = uctx->block;
// Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride
// 2D DMA descriptor cannot span. Clamp BLOCK to ne1 (one dim-1 slice) so every
// transfer stays within a nb1-uniform region. Skipped for contiguous tensors.
const bool src0_contig = (nb02 == (size_t)ne01 * nb01) &&
(nb03 == (size_t)ne02 * nb02);
const bool dst_contig = (nb2 == (size_t)ne1 * nb1) &&
(nb3 == (size_t)ne2 * nb2);
const uint32_t src0_max_block = src0_contig ? uctx->block : MIN((uint32_t)uctx->block, ne01);
const uint32_t dst_max_block = dst_contig ? uctx->block : MIN((uint32_t)uctx->block, ne1);
const uint32_t BLOCK = MIN(src0_max_block, dst_max_block);
if (BLOCK == 0) {
FARF(ERROR, "unary-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
octx->src0_spad.size_per_thread, src0_row_size_aligned);
@ -312,21 +355,23 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
dma_queue * dma_queue = octx->ctx->dma[ith];
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) {
const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
// Dummy DMA transaction for sequencing (interleaving dst,src,dst,...)
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_queue_push(dma_queue,
dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
dst_row_size, dst_row_size_aligned, 0);
nb1, dst_row_size_aligned, dst_data_row_size, 0);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + (ir * src0_row_size)),
src0_row_size_aligned, src0_row_size, block_size);
const size_t src0_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03);
dma_queue_push(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off),
src0_row_size_aligned, nb01, src0_data_row_size, block_size);
ir += block_size;
}
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
for (uint32_t ir = src0_start_row; ir < src0_end_row; ) {
const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
@ -361,18 +406,25 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
break;
}
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
dst_row_size, dst_row_size_aligned, block_size);
const size_t dst_off = unary_row_offset(ir, ne1, ne2, nb1, nb2, nb3);
dma_queue_push(dma_queue,
dma_make_ptr(data_dst + dst_off, dst_spad),
nb1, dst_row_size_aligned, dst_data_row_size, block_size);
// prefetch N+2 loop iteration if any
const uint32_t pref_block = (ir + BLOCK * 2);
if (pref_block < src0_end_row) {
const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad, data_src + (pref_block * src0_row_size)),
src0_row_size_aligned, src0_row_size, pref_block_size);
const uint32_t next_ir = ir + block_size;
if (next_ir < src0_end_row) {
const uint32_t next_block_size = unary_block_size(next_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
const uint32_t pref_ir = next_ir + next_block_size;
if (pref_ir < src0_end_row) {
const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
dma_queue_push(dma_queue,
dma_make_ptr(src0_spad, data_src + src0_pref_off),
src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
}
}
ir += block_size;
}
dma_queue_flush(dma_queue);
@ -426,11 +478,11 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const uint32_t n_threads = MIN(octx->n_threads, src0_nrows);
const size_t src0_row_size = src0->nb[1];
const size_t dst_row_size = dst->nb[1];
const size_t src0_data_row_size = src0->ne[0] * sizeof(float);
const size_t dst_data_row_size = dst->ne[0] * sizeof(float);
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_data_row_size, VLEN);
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
@ -468,8 +520,8 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
.data_src0 = (const uint8_t *)src0->data,
.data_dst = (uint8_t *)dst->data,
.src0_row_size = src0_row_size,
.dst_row_size = dst_row_size,
.src0_data_row_size = src0_data_row_size,
.dst_data_row_size = dst_data_row_size,
.src0_row_size_aligned = src0_row_size_aligned,
.dst_row_size_aligned = dst_row_size_aligned,