From f2ce24fa5c946d7cbc42f52947428c2eba299393 Mon Sep 17 00:00:00 2001 From: Aparna M P Date: Fri, 1 May 2026 22:39:23 +0530 Subject: [PATCH] hexagon: enable non-contiguous row tensor support for unary ops (llama/22574) --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 4 +- ggml/src/ggml-hexagon/htp/hvx-exp.h | 4 +- ggml/src/ggml-hexagon/htp/unary-ops.c | 110 ++++++++++++++++++------- 3 files changed, 85 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 17ac083f..6bb07310 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2421,8 +2421,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses return false; } - // TODO: add support for non-contigiuos tensors - if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { + // TODO: add support for non-contiguous elements within a row + if (!ggml_is_contiguous_rows(src0) || !ggml_is_contiguous_rows(dst)) { return false; } diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.h b/ggml/src/ggml-hexagon/htp/hvx-exp.h index 84e4836d..e71ec490 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.h +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.h @@ -17,7 +17,7 @@ #define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805 #define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408 #define EXP_ONE (0x3f800000) // 1.0 -#define EXP_RANGE_R (0x42B16666) // 88.7 +#define EXP_RANGE_R (0x42B17218) // ln(FLT_MAX) approx = 88.7228 #define EXP_RANGE_L (0xC2B00000) // -88.0 (approx log(FLT_MIN)) static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) { @@ -163,7 +163,7 @@ static inline void hvx_exp_f32(uint8_t * restrict dst, const uint8_t * restrict HVX_Vector vec_out = Q6_V_vzero(); static const float kInf = INFINITY; - static const float kMaxExp = 88.7f; + static const float kMaxExp = 88.7228f; const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp); const HVX_Vector inf = hvx_vec_splat_f32(kInf); diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index 03eccfd5..819cdc49 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -26,8 +26,8 @@ struct htp_unary_context { const uint8_t * data_src0; uint8_t * data_dst; - size_t src0_row_size; - size_t dst_row_size; + size_t src0_data_row_size; // actual data bytes per row + size_t dst_data_row_size; // actual data bytes per row size_t src0_row_size_aligned; size_t dst_row_size_aligned; @@ -41,6 +41,40 @@ struct htp_unary_context { uint32_t nc; }; +// Convert flat row index to DDR byte offset using the tensor's actual strides. +// ir = i1 + ne1*(i2 + ne2*i3) => offset = i1*nb1 + i2*nb2 + i3*nb3 +static inline size_t unary_row_offset(uint32_t ir, + uint32_t ne1, uint32_t ne2, + size_t nb1, size_t nb2, size_t nb3) { + const uint32_t i1 = ir % ne1; + const uint32_t i2 = (ir / ne1) % ne2; + const uint32_t i3 = ir / (ne1 * ne2); + return i1 * nb1 + i2 * nb2 + i3 * nb3; +} +// Safe DMA block size from row `ir`: clamp to the tighter dim-1 slice +// boundary of src and dst so the nb1 stride stays valid for all rows. 
+static inline uint32_t unary_block_size(uint32_t ir, + uint32_t end_row, + uint32_t block, + bool src_contig, + bool dst_contig, + uint32_t src_ne1, + uint32_t dst_ne1) { + uint32_t limit = MIN(block, end_row - ir); + + if (!src_contig) { + const uint32_t src_slice_end = (ir / src_ne1 + 1) * src_ne1; + limit = MIN(limit, src_slice_end - ir); + } + + if (!dst_contig) { + const uint32_t dst_slice_end = (ir / dst_ne1 + 1) * dst_ne1; + limit = MIN(limit, dst_slice_end - ir); + } + + return limit; +} + #define htp_unary_preamble \ const uint32_t ne00 = src->ne[0]; \ const uint32_t ne01 = src->ne[1]; \ @@ -276,8 +310,8 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * int32_t * op_params = octx->op_params; uint32_t src0_nrows_per_thread = uctx->src0_nrows_per_thread; - const size_t src0_row_size = uctx->src0_row_size; - const size_t dst_row_size = uctx->dst_row_size; + const size_t src0_data_row_size = uctx->src0_data_row_size; + const size_t dst_data_row_size = uctx->dst_data_row_size; const size_t src0_row_size_aligned = uctx->src0_row_size_aligned; const size_t dst_row_size_aligned = uctx->dst_row_size_aligned; @@ -303,7 +337,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * size_t src0_spad_half_size = uctx->src0_spad_half_size; size_t dst_spad_half_size = uctx->dst_spad_half_size; - const int BLOCK = uctx->block; + // Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride + // 2D DMA descriptor cannot span. Clamp BLOCK to ne1 (one dim-1 slice) so every + // transfer stays within a nb1-uniform region. Skipped for contiguous tensors. + const bool src0_contig = (nb02 == (size_t)ne01 * nb01) && + (nb03 == (size_t)ne02 * nb02); + const bool dst_contig = (nb2 == (size_t)ne1 * nb1) && + (nb3 == (size_t)ne2 * nb2); + const uint32_t src0_max_block = src0_contig ? uctx->block : MIN((uint32_t)uctx->block, ne01); + const uint32_t dst_max_block = dst_contig ? uctx->block : MIN((uint32_t)uctx->block, ne1); + const uint32_t BLOCK = MIN(src0_max_block, dst_max_block); if (BLOCK == 0) { FARF(ERROR, "unary-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", octx->src0_spad.size_per_thread, src0_row_size_aligned); @@ -312,21 +355,23 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * dma_queue * dma_queue = octx->ctx->dma[ith]; - for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { - const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); + for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) { + const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1); // Dummy DMA transation for sequencing (interleaving dst,src,dst,...) 
- dma_queue_push_vtcm_to_ddr(dma_queue, + dma_queue_push(dma_queue, dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)), - dst_row_size, dst_row_size_aligned, 0); + nb1, dst_row_size_aligned, dst_data_row_size, 0); - dma_queue_push_ddr_to_vtcm(dma_queue, - dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + (ir * src0_row_size)), - src0_row_size_aligned, src0_row_size, block_size); + const size_t src0_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03); + dma_queue_push(dma_queue, + dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off), + src0_row_size_aligned, nb01, src0_data_row_size, block_size); + ir += block_size; } - for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { - const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); + for (uint32_t ir = src0_start_row; ir < src0_end_row; ) { + const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1); float * dst_spad = (float *) dma_queue_pop(dma_queue).src; float * src0_spad = (float *) dma_queue_pop(dma_queue).dst; @@ -361,18 +406,25 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * break; } - dma_queue_push_vtcm_to_ddr(dma_queue, - dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), - dst_row_size, dst_row_size_aligned, block_size); + const size_t dst_off = unary_row_offset(ir, ne1, ne2, nb1, nb2, nb3); + dma_queue_push(dma_queue, + dma_make_ptr(data_dst + dst_off, dst_spad), + nb1, dst_row_size_aligned, dst_data_row_size, block_size); // prefetch N+2 loop iteration if any - const uint32_t pref_block = (ir + BLOCK * 2); - if (pref_block < src0_end_row) { - const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block); - dma_queue_push_ddr_to_vtcm(dma_queue, - dma_make_ptr(src0_spad, data_src + (pref_block * src0_row_size)), - src0_row_size_aligned, src0_row_size, pref_block_size); + const uint32_t next_ir = ir + block_size; + if (next_ir < src0_end_row) { + const uint32_t next_block_size = unary_block_size(next_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1); + const uint32_t pref_ir = next_ir + next_block_size; + if (pref_ir < src0_end_row) { + const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1); + const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03); + dma_queue_push(dma_queue, + dma_make_ptr(src0_spad, data_src + src0_pref_off), + src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size); + } } + ir += block_size; } dma_queue_flush(dma_queue); @@ -426,11 +478,11 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; const uint32_t n_threads = MIN(octx->n_threads, src0_nrows); - const size_t src0_row_size = src0->nb[1]; - const size_t dst_row_size = dst->nb[1]; + const size_t src0_data_row_size = src0->ne[0] * sizeof(float); + const size_t dst_data_row_size = dst->ne[0] * sizeof(float); - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_data_row_size, VLEN); // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size @@ -468,8 +520,8 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { 
.data_src0 = (const uint8_t *)src0->data, .data_dst = (uint8_t *)dst->data, - .src0_row_size = src0_row_size, - .dst_row_size = dst_row_size, + .src0_data_row_size = src0_data_row_size, + .dst_data_row_size = dst_data_row_size, .src0_row_size_aligned = src0_row_size_aligned, .dst_row_size_aligned = dst_row_size_aligned,
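
Note on the hvx-exp.h constant: the patch swaps the old 88.7 cutoff (0x42B16666) for 0x42B17218, which is the EXP_LOGN2 bit pattern (ln 2, 0x3F317218) with the exponent raised by 7, i.e. 128 * ln(2) ~= ln(FLT_MAX) ~= 88.7228, the actual float overflow threshold for exp(). Below is a minimal host-side sanity check, not part of the patch and not HTP code, that just prints the values behind those bit patterns so the new constant is easy to verify:

    /* Standalone check: decode the bit patterns used by hvx-exp.h.
     * Assumes nothing beyond the C standard library. */
    #include <float.h>
    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static float bits_to_f32(uint32_t u) {
        float f;
        memcpy(&f, &u, sizeof(f));   /* reinterpret the raw bits as a float */
        return f;
    }

    int main(void) {
        printf("EXP_RANGE_R (0x42B17218) = %.7f\n", bits_to_f32(0x42B17218u)); /* 88.7228394 */
        printf("ln(FLT_MAX)              = %.7f\n", logf(FLT_MAX));            /* ~88.722839 */
        printf("128 * EXP_LOGN2          = %.7f\n", 128.0f * bits_to_f32(0x3F317218u)); /* same value */
        printf("old cutoff (0x42B16666)  = %.7f\n", bits_to_f32(0x42B16666u)); /* 88.6999969 */
        return 0;
    }

With the old 88.7 cutoff, inputs between 88.7 and ln(FLT_MAX) were clipped to +inf even though exp() of them still fits in a float; the new constant moves the clip point to the last representable value.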
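The core of the unary-ops change is the index math: a flat row index ir = i1 + ne1*(i2 + ne2*i3) is mapped back to a DDR byte offset i1*nb1 + i2*nb2 + i3*nb3, and each DMA block is clamped so it never crosses a dim-1 slice boundary when rows are not contiguous across dims 2/3 (a single-stride descriptor cannot span the gap). The sketch below mirrors unary_row_offset and the per-tensor half of unary_block_size in plain host-side C and checks them against a brute-force triple loop; the toy shape, strides, and the simplified names row_offset/block_size are illustrative assumptions, not the HTP/DMA code itself:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* flat row index -> byte offset via the tensor's row/plane strides */
    static size_t row_offset(uint32_t ir, uint32_t ne1, uint32_t ne2,
                             size_t nb1, size_t nb2, size_t nb3) {
        const uint32_t i1 = ir % ne1;
        const uint32_t i2 = (ir / ne1) % ne2;
        const uint32_t i3 = ir / (ne1 * ne2);
        return i1 * nb1 + i2 * nb2 + i3 * nb3;
    }

    /* clamp a block of rows so it stays inside one dim-1 slice when non-contiguous */
    static uint32_t block_size(uint32_t ir, uint32_t end_row, uint32_t block,
                               int contig, uint32_t ne1) {
        uint32_t limit = MIN(block, end_row - ir);
        if (!contig) {
            const uint32_t slice_end = (ir / ne1 + 1) * ne1;  /* next dim-1 boundary */
            limit = MIN(limit, slice_end - ir);
        }
        return limit;
    }

    int main(void) {
        /* toy f32 tensor: ne = {8, 3, 2, 2}; rows are contiguous, but each
         * dim-1 slice is followed by padding, so nb2 != ne1 * nb1 */
        const uint32_t ne1 = 3, ne2 = 2, ne3 = 2;
        const size_t nb1 = 8 * sizeof(float);
        const size_t nb2 = 4 * nb1;             /* gap after every 3 rows */
        const size_t nb3 = ne2 * nb2;
        const uint32_t nrows = ne1 * ne2 * ne3;
        const int contig = (nb2 == ne1 * nb1);  /* false for this layout */

        /* the flat-index math must agree with the explicit triple loop */
        uint32_t ir = 0;
        for (uint32_t i3 = 0; i3 < ne3; i3++)
            for (uint32_t i2 = 0; i2 < ne2; i2++)
                for (uint32_t i1 = 0; i1 < ne1; i1++, ir++)
                    if (row_offset(ir, ne1, ne2, nb1, nb2, nb3) != i1 * nb1 + i2 * nb2 + i3 * nb3)
                        printf("mismatch at row %u\n", ir);

        /* blocks walk the rows without ever spanning a dim-1 slice boundary */
        for (uint32_t r = 0; r < nrows; ) {
            const uint32_t bs = block_size(r, nrows, /*block=*/4, contig, ne1);
            printf("rows [%u, %u) start at byte offset %zu\n",
                   r, r + bs, row_offset(r, ne1, ne2, nb1, nb2, nb3));
            r += bs;
        }
        return 0;
    }

With this layout the requested block of 4 rows is clamped to 3 at every slice boundary, which is exactly why the patch drops the fixed `ir += BLOCK` loop increment in favour of advancing by the block size actually transferred.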