hexagon: enable non-contiguous row tensor support for unary ops (llama/22574)

This commit is contained in:
Aparna M P 2026-05-01 22:39:23 +05:30 committed by Georgi Gerganov
parent 9623c1203b
commit f2ce24fa5c
3 changed files with 85 additions and 33 deletions

View File

@ -2421,8 +2421,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
return false;
}
// TODO: add support for non-contigiuos tensors
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
// TODO: add support for non-contiguous elements within a row
if (!ggml_is_contiguous_rows(src0) || !ggml_is_contiguous_rows(dst)) {
return false;
}

View File

@ -17,7 +17,7 @@
#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805
#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408
#define EXP_ONE (0x3f800000) // 1.0
#define EXP_RANGE_R (0x42B16666) // 88.7
#define EXP_RANGE_R (0x42B17218) // ln(FLT_MAX) approx = 88.7228
#define EXP_RANGE_L (0xC2B00000) // -88.0 (approx log(FLT_MIN))
static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
@ -163,7 +163,7 @@ static inline void hvx_exp_f32(uint8_t * restrict dst, const uint8_t * restrict
HVX_Vector vec_out = Q6_V_vzero();
static const float kInf = INFINITY;
static const float kMaxExp = 88.7f;
static const float kMaxExp = 88.7228f;
const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp);
const HVX_Vector inf = hvx_vec_splat_f32(kInf);

View File

@ -26,8 +26,8 @@ struct htp_unary_context {
const uint8_t * data_src0;
uint8_t * data_dst;
size_t src0_row_size;
size_t dst_row_size;
size_t src0_data_row_size; // actual data bytes per row
size_t dst_data_row_size; // actual data bytes per row
size_t src0_row_size_aligned;
size_t dst_row_size_aligned;
@ -41,6 +41,40 @@ struct htp_unary_context {
uint32_t nc;
};
// Convert flat row index to DDR byte offset using the tensor's actual strides.
// ir = i1 + ne1*(i2 + ne2*i3) => offset = i1*nb1 + i2*nb2 + i3*nb3
// Map a flat row index `ir` to its DDR byte offset using the tensor's
// actual strides. The flat index encodes ir = i1 + ne1*(i2 + ne2*i3),
// so the byte offset is i1*nb1 + i2*nb2 + i3*nb3.
static inline size_t unary_row_offset(uint32_t ir,
                                      uint32_t ne1, uint32_t ne2,
                                      size_t nb1, size_t nb2, size_t nb3) {
    const uint32_t rows_per_i3 = ne1 * ne2;          // rows in one dim-3 slab
    const uint32_t i3  = ir / rows_per_i3;
    const uint32_t rem = ir - i3 * rows_per_i3;      // index within the slab
    const uint32_t i2  = rem / ne1;
    const uint32_t i1  = rem - i2 * ne1;
    return i1 * nb1 + i2 * nb2 + i3 * nb3;
}
// Safe DMA block size from row `ir`: clamp to the tighter dim-1 slice
// boundary of src and dst so the nb1 stride stays valid for all rows.
// Largest safe DMA block starting at row `ir`: never run past `end_row`, and
// for a non-contiguous tensor never cross a dim-1 slice boundary, since the
// nb1 stride is only uniform within one slice.
static inline uint32_t unary_block_size(uint32_t ir,
                                        uint32_t end_row,
                                        uint32_t block,
                                        bool src_contig,
                                        bool dst_contig,
                                        uint32_t src_ne1,
                                        uint32_t dst_ne1) {
    const uint32_t remaining = end_row - ir;
    uint32_t limit = (block < remaining) ? block : remaining;
    if (!src_contig) {
        // rows left before the next dim-1 slice boundary of src
        const uint32_t to_src_end = src_ne1 - (ir % src_ne1);
        if (to_src_end < limit) {
            limit = to_src_end;
        }
    }
    if (!dst_contig) {
        // rows left before the next dim-1 slice boundary of dst
        const uint32_t to_dst_end = dst_ne1 - (ir % dst_ne1);
        if (to_dst_end < limit) {
            limit = to_dst_end;
        }
    }
    return limit;
}
#define htp_unary_preamble \
const uint32_t ne00 = src->ne[0]; \
const uint32_t ne01 = src->ne[1]; \
@ -276,8 +310,8 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
int32_t * op_params = octx->op_params;
uint32_t src0_nrows_per_thread = uctx->src0_nrows_per_thread;
const size_t src0_row_size = uctx->src0_row_size;
const size_t dst_row_size = uctx->dst_row_size;
const size_t src0_data_row_size = uctx->src0_data_row_size;
const size_t dst_data_row_size = uctx->dst_data_row_size;
const size_t src0_row_size_aligned = uctx->src0_row_size_aligned;
const size_t dst_row_size_aligned = uctx->dst_row_size_aligned;
@ -303,7 +337,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
size_t src0_spad_half_size = uctx->src0_spad_half_size;
size_t dst_spad_half_size = uctx->dst_spad_half_size;
const int BLOCK = uctx->block;
// Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride
// 2D DMA descriptor cannot span. Clamp BLOCK to ne1 (one dim-1 slice) so every
// transfer stays within a nb1-uniform region. Skipped for contiguous tensors.
const bool src0_contig = (nb02 == (size_t)ne01 * nb01) &&
(nb03 == (size_t)ne02 * nb02);
const bool dst_contig = (nb2 == (size_t)ne1 * nb1) &&
(nb3 == (size_t)ne2 * nb2);
const uint32_t src0_max_block = src0_contig ? uctx->block : MIN((uint32_t)uctx->block, ne01);
const uint32_t dst_max_block = dst_contig ? uctx->block : MIN((uint32_t)uctx->block, ne1);
const uint32_t BLOCK = MIN(src0_max_block, dst_max_block);
if (BLOCK == 0) {
FARF(ERROR, "unary-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
octx->src0_spad.size_per_thread, src0_row_size_aligned);
@ -312,21 +355,23 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
dma_queue * dma_queue = octx->ctx->dma[ith];
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) {
const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
// Dummy DMA transaction for sequencing (interleaving dst,src,dst,...)
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_queue_push(dma_queue,
dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
dst_row_size, dst_row_size_aligned, 0);
nb1, dst_row_size_aligned, dst_data_row_size, 0);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + (ir * src0_row_size)),
src0_row_size_aligned, src0_row_size, block_size);
const size_t src0_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03);
dma_queue_push(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off),
src0_row_size_aligned, nb01, src0_data_row_size, block_size);
ir += block_size;
}
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
for (uint32_t ir = src0_start_row; ir < src0_end_row; ) {
const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
@ -361,18 +406,25 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
break;
}
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
dst_row_size, dst_row_size_aligned, block_size);
const size_t dst_off = unary_row_offset(ir, ne1, ne2, nb1, nb2, nb3);
dma_queue_push(dma_queue,
dma_make_ptr(data_dst + dst_off, dst_spad),
nb1, dst_row_size_aligned, dst_data_row_size, block_size);
// prefetch N+2 loop iteration if any
const uint32_t pref_block = (ir + BLOCK * 2);
if (pref_block < src0_end_row) {
const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad, data_src + (pref_block * src0_row_size)),
src0_row_size_aligned, src0_row_size, pref_block_size);
const uint32_t next_ir = ir + block_size;
if (next_ir < src0_end_row) {
const uint32_t next_block_size = unary_block_size(next_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
const uint32_t pref_ir = next_ir + next_block_size;
if (pref_ir < src0_end_row) {
const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
dma_queue_push(dma_queue,
dma_make_ptr(src0_spad, data_src + src0_pref_off),
src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
}
}
ir += block_size;
}
dma_queue_flush(dma_queue);
@ -426,11 +478,11 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const uint32_t n_threads = MIN(octx->n_threads, src0_nrows);
const size_t src0_row_size = src0->nb[1];
const size_t dst_row_size = dst->nb[1];
const size_t src0_data_row_size = src0->ne[0] * sizeof(float);
const size_t dst_data_row_size = dst->ne[0] * sizeof(float);
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_data_row_size, VLEN);
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
@ -468,8 +520,8 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
.data_src0 = (const uint8_t *)src0->data,
.data_dst = (uint8_t *)dst->data,
.src0_row_size = src0_row_size,
.dst_row_size = dst_row_size,
.src0_data_row_size = src0_data_row_size,
.dst_data_row_size = dst_data_row_size,
.src0_row_size_aligned = src0_row_size_aligned,
.dst_row_size_aligned = dst_row_size_aligned,