From 5cbc2153c253e32103be889c0d82d361fb78d819 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 18 Jun 2026 08:35:02 -0700 Subject: [PATCH] hexagon: support for op-trace (fine-grain tracing of HVX/HMX/DMA events) (llama/24592) * hex-optrace: add support for optrace and instrument matmul and flash-atten code * hex-trace: improve trace event and prefetto generator * hex-trace: add new script dedicated to handling traces, specifically perfetto traces * hex-trace: add --head/--tail options to profile and trace tools * hex-trace: fix whitespaces * hex-trace: fix flake8 warnings * hex-trace: fix flake8 warnings * hmx-fa: restore q_tiles clearing * hex-profile: remove circular dep in includes * hex-trace: simplify trace sizing check * hex-profile: sort events in the summary by name --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 116 ++++++++++++++++-- ggml/src/ggml-hexagon/htp/CMakeLists.txt | 2 +- ggml/src/ggml-hexagon/htp/flash-attn-ops.c | 4 + ggml/src/ggml-hexagon/htp/hex-dma.h | 13 +- ggml/src/ggml-hexagon/htp/hex-profile.h | 64 ++++++++++ ggml/src/ggml-hexagon/htp/hex-utils.h | 27 ---- .../src/ggml-hexagon/htp/hmx-flash-attn-ops.c | 92 ++++---------- ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c | 96 ++++++++------- ggml/src/ggml-hexagon/htp/hmx-profile.h | 34 ----- ggml/src/ggml-hexagon/htp/hmx-queue.c | 2 + ggml/src/ggml-hexagon/htp/hmx-queue.h | 2 + ggml/src/ggml-hexagon/htp/htp-ctx.h | 2 + ggml/src/ggml-hexagon/htp/htp-ops.h | 35 +++++- ggml/src/ggml-hexagon/htp/main.c | 50 ++++++-- ggml/src/ggml-hexagon/htp/matmul-ops.c | 46 +++++++ 15 files changed, 388 insertions(+), 197 deletions(-) create mode 100644 ggml/src/ggml-hexagon/htp/hex-profile.h delete mode 100644 ggml/src/ggml-hexagon/htp/hmx-profile.h diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 49bd7e433..e612ec392 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -69,6 +69,7 @@ static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE; static int opt_opbatch = 1024; // max number of ops in a batch static int opt_opqueue = 16; // max number of pending batches static int opt_oppoll = 0; // polling for batch completions +static int opt_optrace = 0; // trace buffer size per thread (0 means default) static std::regex* opt_opfilter = NULL; // regex of ops to not claim @@ -118,20 +119,39 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no"); } +static const char * htp_event_name(uint16_t id) { + switch (id) { + case HTP_TRACE_EVT_DMA: return "DMA"; + case HTP_TRACE_EVT_HVX_COMP: return "HVX_COMP"; + case HTP_TRACE_EVT_HVX_A_QUANT: return "HVX_A_QUANT"; + case HTP_TRACE_EVT_HVX_A_PREP: return "HVX_A_PREP"; + case HTP_TRACE_EVT_HVX_W_DEQUANT: return "HVX_W_DEQUANT"; + case HTP_TRACE_EVT_HVX_W_PREP: return "HVX_W_PREP"; + case HTP_TRACE_EVT_HVX_O_PROC: return "HVX_O_PROC"; + case HTP_TRACE_EVT_HMX_COMP: return "HMX_COMP"; + default: return "UNKNOWN"; + } +} + static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node, - uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) { + const htp_prof_desc & pd) { if (!opt_profile) return; + uint32_t op_usec = pd.usecs; + uint32_t op_cycles = pd.cycles_stop - pd.cycles_start; + const uint32_t * pmu = pd.pmu; + char pmu_str[256] = ""; - if (opt_profile > 1) { + if (opt_profile == 2) { static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters"); sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]", pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]); } htp_opformat fmt(node); - GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(), - node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str); + float mhz = op_usec > 0 ? (float) op_cycles / op_usec : 0.0f; + GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u start %u mhz %.1f%s\n", sess_name.c_str(), + node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pd.cycles_start, mhz, pmu_str); } // ** backend sessions @@ -1995,10 +2015,16 @@ struct ggml_hexagon_opqueue { size_t n_ops = batch_size; size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS; + size_t tr_size = 0; + if (opt_profile == 3) { + tr_size = (HTP_MAX_NTHREADS + 1) * opt_optrace * sizeof(htp_trace_desc); + } + shm_blk_size = sizeof(htp_buf_desc) * n_bufs + sizeof(htp_tensor) * n_tensors + sizeof(htp_op_desc) * n_ops + - sizeof(htp_prof_desc) * n_ops; + sizeof(htp_prof_desc) * n_ops + + tr_size; shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */); @@ -2042,11 +2068,19 @@ struct ggml_hexagon_opqueue { const size_t o_size = sizeof(htp_op_desc) * req.n_ops; const size_t p_size = sizeof(htp_prof_desc) * req.n_ops; + size_t tr_size = 0; + if (opt_profile == 3) { + req.n_traces = opt_optrace; + tr_size = (HTP_MAX_NTHREADS + 1) * req.n_traces * sizeof(htp_trace_desc); + } else { + req.n_traces = 0; + } + dbuf.ptr = shm_buf->base + (req.id * shm_blk_size); dbuf.fd = shm_buf->fd; dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base; - dbuf.size = b_size + t_size + o_size + p_size; + dbuf.size = b_size + t_size + o_size + p_size + tr_size; GGML_ASSERT(dbuf.size <= shm_blk_size); @@ -2092,7 +2126,14 @@ struct ggml_hexagon_opqueue { const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops; const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops; - const size_t m_size = b_size + t_size + o_size + p_size; + size_t tr_size = 0; + uint32_t n_traces = 0; + if (opt_profile == 3) { + n_traces = opt_optrace; + tr_size = (HTP_MAX_NTHREADS + 1) * n_traces * sizeof(htp_trace_desc); + } + + const size_t m_size = b_size + t_size + o_size + p_size + tr_size; GGML_ASSERT(m_size <= shm_blk_size); HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n", @@ -2111,13 +2152,62 @@ struct ggml_hexagon_opqueue { GGML_ASSERT(rsp.n_ops <= ops.size()); const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr; - for (uint32_t i = 0; i < rsp.n_ops; i++) { - htp_usec += pd[i].usecs; - ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu); + + const htp_trace_desc * trace_events = nullptr; + + if (opt_profile == 3) { + trace_events = (const htp_trace_desc *) (p_ptr + p_size); } - GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n", - shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec); + uint32_t trace_idx[HTP_MAX_NTHREADS + 1] = {0}; + uint32_t valid_cnt[HTP_MAX_NTHREADS + 1] = {0}; + + if (opt_profile == 3) { + for (uint32_t t = 0; t <= HTP_MAX_NTHREADS; t++) { + uint32_t count = rsp.n_traces[t]; + valid_cnt[t] = count > n_traces ? n_traces : count; + } + } + + for (uint32_t i = 0; i < rsp.n_ops; i++) { + htp_usec += pd[i].usecs; + + ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i]); + + if (opt_profile == 3) { + uint32_t op_duration = pd[i].cycles_stop - pd[i].cycles_start; + + for (uint32_t t = 0; t <= HTP_MAX_NTHREADS; t++) { + while (trace_idx[t] < valid_cnt[t]) { + const auto & e = trace_events[t * n_traces + trace_idx[t]]; + uint32_t offset = e.cycles - pd[i].cycles_start; + if (offset >= 0x80000000) { + trace_idx[t]++; + continue; + } + if (offset > op_duration) { + break; + } + bool is_stop = (e.info & 0x8000) != 0; + uint16_t info = e.info & 0x7FFF; + GGML_LOG_DEBUG("ggml-hex: %s trace-op %s: thread %u event %s info %u %s %u\n", + shm_buf->sess->c_name(), ops[i].op_name().c_str(), t, htp_event_name(e.id), info, is_stop ? "stop" : "start", e.cycles); + trace_idx[t]++; + } + } + } + } + + char evt_str[256] = ""; + if (opt_profile == 3) { + sprintf(evt_str, " evt [%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u]", + rsp.n_traces[0], rsp.n_traces[1], rsp.n_traces[2], rsp.n_traces[3], + rsp.n_traces[4], rsp.n_traces[5], rsp.n_traces[6], rsp.n_traces[7], + rsp.n_traces[8], rsp.n_traces[9], rsp.n_traces[10]); + } + + GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u%s\n", + shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec, evt_str); } } }; @@ -3901,6 +3991,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH"); const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE"); const char * str_oppoll = getenv("GGML_HEXAGON_OPPOLL"); + const char * str_optrace = getenv("GGML_HEXAGON_OPTRACE"); const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER"); const char * str_profile = getenv("GGML_HEXAGON_PROFILE"); const char * str_etm = getenv("GGML_HEXAGON_ETM"); @@ -3939,6 +4030,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch; opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue; opt_oppoll = str_oppoll ? strtoul(str_oppoll, NULL, 0) : opt_oppoll; + opt_optrace = str_optrace ? strtoul(str_optrace, NULL, 0) : (opt_opbatch * 128); opt_profile = str_profile ? atoi(str_profile) : 0; opt_etm = str_etm ? atoi(str_etm) : 0; opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index f4b44fe1a..31ba52762 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -37,8 +37,8 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx) if (_hmx_idx GREATER_EQUAL 0) target_sources(${HTP_LIB} PRIVATE - hmx-matmul-ops.c hmx-flash-attn-ops.c + hmx-matmul-ops.c hmx-queue.c ) diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c index e99621469..b7511cd64 100644 --- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c @@ -339,6 +339,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * if (ir0 >= ir1) return; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); + dma_queue * dma = octx->ctx->dma[ith]; const uint32_t DK = nek0; @@ -615,6 +618,7 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * hvx_copy_f16_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV); } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); } int op_flash_attn_ext(struct htp_ops_context * octx) { diff --git a/ggml/src/ggml-hexagon/htp/hex-dma.h b/ggml/src/ggml-hexagon/htp/hex-dma.h index 7685473f4..93c21ebe5 100644 --- a/ggml/src/ggml-hexagon/htp/hex-dma.h +++ b/ggml/src/ggml-hexagon/htp/hex-dma.h @@ -6,6 +6,8 @@ #include #include +#include "hex-profile.h" + #ifdef __cplusplus extern "C" { #endif @@ -88,6 +90,7 @@ typedef struct { uint32_t pop_idx; uint32_t capacity; uint32_t idx_mask; + struct htp_thread_trace * trace; } dma_queue; dma_queue * dma_queue_create(size_t capacity); @@ -152,6 +155,7 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t q->dptr[q->push_idx] = dptr; if (size) { + htp_trace_event_start(q->trace, HTP_TRACE_EVT_DMA, q->push_idx); dmlink(q->tail, desc); q->tail = (dma_descriptor_2d *) desc; } else { @@ -202,6 +206,7 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t q->dptr[q->push_idx] = dptr; if (nrows) { + htp_trace_event_start(q->trace, HTP_TRACE_EVT_DMA, q->push_idx); dmlink(q->tail, desc); q->tail = desc; } else { @@ -223,10 +228,12 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) { dma_descriptor_2d * desc = &q->desc[q->pop_idx]; // Wait for desc to complete - while (!desc->done) { - // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx); - dmpoll(); + if (!desc->done) { + while (!desc->done) { + dmpoll(); + } } + htp_trace_event_stop(q->trace, HTP_TRACE_EVT_DMA, q->pop_idx); dptr = q->dptr[q->pop_idx]; diff --git a/ggml/src/ggml-hexagon/htp/hex-profile.h b/ggml/src/ggml-hexagon/htp/hex-profile.h new file mode 100644 index 000000000..8a37a4a06 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hex-profile.h @@ -0,0 +1,64 @@ +#ifndef HEX_PROFILE_H +#define HEX_PROFILE_H + +#include +#include +#include + +#include "hex-utils.h" +#include "htp-ops.h" + +#define HTP_TRACE_EVT_START 0 +#define HTP_TRACE_EVT_STOP 1 + +#ifndef HEX_NUM_PMU_COUNTERS +#define HEX_NUM_PMU_COUNTERS 8 +#endif + +static inline void hex_get_pmu(uint32_t counters[]) { +#if __HVX_ARCH__ >= 79 + asm volatile("%0 = upmucnt0" : "=r"(counters[0])); + asm volatile("%0 = upmucnt1" : "=r"(counters[1])); + asm volatile("%0 = upmucnt2" : "=r"(counters[2])); + asm volatile("%0 = upmucnt3" : "=r"(counters[3])); + asm volatile("%0 = upmucnt4" : "=r"(counters[4])); + asm volatile("%0 = upmucnt5" : "=r"(counters[5])); + asm volatile("%0 = upmucnt6" : "=r"(counters[6])); + asm volatile("%0 = upmucnt7" : "=r"(counters[7])); +#else + counters[0] = qurt_pmu_get(QURT_PMUCNT0); + counters[1] = qurt_pmu_get(QURT_PMUCNT1); + counters[2] = qurt_pmu_get(QURT_PMUCNT2); + counters[3] = qurt_pmu_get(QURT_PMUCNT3); + counters[4] = qurt_pmu_get(QURT_PMUCNT4); + counters[5] = qurt_pmu_get(QURT_PMUCNT5); + counters[6] = qurt_pmu_get(QURT_PMUCNT6); + counters[7] = qurt_pmu_get(QURT_PMUCNT7); +#endif +} + +struct htp_thread_trace { + uint32_t count; + uint32_t max_events; + struct htp_trace_desc * events; +}; + +static inline void htp_trace_event(struct htp_thread_trace * tr, uint16_t id, uint16_t info, uint32_t type) { + if (tr && tr->events && tr->count < tr->max_events) { + uint32_t idx = tr->count; + tr->events[idx].id = id; + tr->events[idx].info = info | (type == HTP_TRACE_EVT_STOP ? 0x8000 : 0); + tr->events[idx].cycles = (uint32_t) hex_get_cycles(); + tr->count++; + } +} + +static inline void htp_trace_event_start(struct htp_thread_trace * tr, uint16_t id, uint16_t info) { + htp_trace_event(tr, id, info, HTP_TRACE_EVT_START); +} + +static inline void htp_trace_event_stop(struct htp_thread_trace * tr, uint16_t id, uint16_t info) { + htp_trace_event(tr, id, info, HTP_TRACE_EVT_STOP); +} + +#endif /* HEX_PROFILE_H */ diff --git a/ggml/src/ggml-hexagon/htp/hex-utils.h b/ggml/src/ggml-hexagon/htp/hex-utils.h index 6239ceff4..8e6e3ea75 100644 --- a/ggml/src/ggml-hexagon/htp/hex-utils.h +++ b/ggml/src/ggml-hexagon/htp/hex-utils.h @@ -107,31 +107,4 @@ static inline void hex_pause() { asm volatile(" pause(#255)\n"); } -#ifndef HEX_NUM_PMU_COUNTERS -#define HEX_NUM_PMU_COUNTERS 8 -#endif - -static inline void hex_get_pmu(uint32_t counters[]) { -#if __HVX_ARCH__ >= 79 - asm volatile("%0 = upmucnt0" : "=r"(counters[0])); - asm volatile("%0 = upmucnt1" : "=r"(counters[1])); - asm volatile("%0 = upmucnt2" : "=r"(counters[2])); - asm volatile("%0 = upmucnt3" : "=r"(counters[3])); - asm volatile("%0 = upmucnt4" : "=r"(counters[4])); - asm volatile("%0 = upmucnt5" : "=r"(counters[5])); - asm volatile("%0 = upmucnt6" : "=r"(counters[6])); - asm volatile("%0 = upmucnt7" : "=r"(counters[7])); -#else - counters[0] = qurt_pmu_get(QURT_PMUCNT0); - counters[1] = qurt_pmu_get(QURT_PMUCNT1); - counters[2] = qurt_pmu_get(QURT_PMUCNT2); - counters[3] = qurt_pmu_get(QURT_PMUCNT3); - counters[4] = qurt_pmu_get(QURT_PMUCNT4); - counters[5] = qurt_pmu_get(QURT_PMUCNT5); - counters[6] = qurt_pmu_get(QURT_PMUCNT6); - counters[7] = qurt_pmu_get(QURT_PMUCNT7); - // qurt_pmu_get_pmucnt(counters); -#endif -} - #endif /* HEX_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c index 2796564fb..986dde148 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c @@ -18,7 +18,7 @@ #include "ggml-common.h" #include "hex-dma.h" #include "hex-fastdiv.h" -#include "hmx-profile.h" +#include "hex-profile.h" #include "hmx-queue.h" #include "hmx-utils.h" #include "htp-ctx.h" @@ -367,8 +367,11 @@ static void fa_k_interleave_thread(unsigned int n, unsigned int i, void * data) return; } + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start); hmx_interleave_rows_to_tiles(factx->vtcm_k_tiles, factx->vtcm_k_fp16[args->buf_idx], total_rows, (int) factx->DK, (int) args->src_stride, start, end); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start); } static void fa_phase_k_interleave(struct hmx_fa_context * factx, int kv_rows, size_t src_stride, size_t buf_idx) { @@ -408,8 +411,11 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data) __fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0]; + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start); hmx_interleave_cols_to_tiles(v_tiles_dest, factx->vtcm_v_fp16[args->buf_idx], total_rows, (int) factx->DV, (int) args->src_stride, (int) args->n_col_tiles, start, end); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start); } static void fa_phase_v_interleave(struct hmx_fa_context * factx, @@ -462,6 +468,9 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) { return; } + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start); + const struct htp_tensor * q = args->q; const uint32_t q_start = args->q_start; const uint32_t kv_head = args->kv_head; @@ -515,6 +524,7 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) { } } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start); } static void fa_phase_q_load(struct hmx_fa_context * factx, @@ -566,6 +576,9 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) { return; } + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start); + const struct htp_tensor * dst = args->dst; const __fp16 * o_tile_src = args->o_tile_src; const uint32_t q_start = args->q_start; @@ -611,6 +624,7 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) { } } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start); } static void fa_phase_o_store(struct hmx_fa_context * factx, @@ -680,6 +694,9 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) { return; } + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, vec_start); + // Per-thread row scratch: thread i uses bufs at offset i * 2 * stride const size_t row_buf_stride = factx->row_buf_stride; HVX_Vector * my_row_buf0 = factx->vtcm_row_bufs + i * 2 * row_buf_stride; @@ -950,6 +967,7 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) { factx->vtcm_s_rowmax[r_vec_idx] = rowmax_acc_v; factx->vtcm_p_rowsum[r_vec_idx] = rowsum_acc_v; } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, vec_start); } // Serial m/l update + build_D. Must run after softmax barrier (s_rowmax written by all threads). @@ -1245,6 +1263,7 @@ static __attribute__((noinline)) void fa_compute_slopes( // ============================================================================ int hmx_flash_attn_ext(struct htp_ops_context * octx) { + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[HTP_MAX_NTHREADS] : NULL; const struct htp_tensor * q = octx->src[0]; const struct htp_tensor * k = octx->src[1]; const struct htp_tensor * v = octx->src[2]; @@ -1422,19 +1441,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { return HTP_STATUS_OK; } - // Profiling timers - TIMER_DEFINE(total); - TIMER_DEFINE(q_load); - TIMER_DEFINE(kv_dma); - TIMER_DEFINE(k_interleave); - TIMER_DEFINE(v_interleave); - TIMER_DEFINE(qk_dot); - TIMER_DEFINE(softmax); - TIMER_DEFINE(o_update); - TIMER_DEFINE(o_norm); - TIMER_DEFINE(o_store); - - TIMER_START(total); // ======== DMA setup ======== dma_queue * const dma = ctx->dma[0]; @@ -1474,12 +1480,10 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { const size_t n_row_tiles = g_br_actual / HMX_FP16_TILE_N_ROWS; // ---- Load Q block [g_br, D] -> tiles, interleaving G heads ---- - TIMER_START(q_load); if (n_rows_g < g_br) { hvx_splat_u8_a(factx.vtcm_q_tiles, 0, q_tile_bytes); } fa_phase_q_load(&factx, q, q_start, kv_head, ib3, n_rows_g); - TIMER_STOP(q_load); // ---- Initialize per-block state ---- hvx_splat_u8_a(factx.vtcm_l_vec, 0, col_vec_bytes); @@ -1558,10 +1562,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { const size_t n_col_tiles = hmx_ceil_div(kv_rows, HMX_FP16_TILE_N_COLS); // Wait for current KV DMA - TIMER_START(kv_dma); dma_queue_pop(dma); // K dma_queue_pop(dma); // V - TIMER_STOP(kv_dma); // Push mask DMA for this block (single 2D DMA when broadcast) bool has_mask_dma = false; @@ -1583,10 +1585,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { ou_job.DV = DV; hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_o_update_worker, &ou_job)); } - - TIMER_START(k_interleave); fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx); - TIMER_STOP(k_interleave); // ---- Phase 2: qk_dot(blk) on HMX ‖ V_int(blk) + DMA prefetch on HVX ---- qk_job.q_tiles = factx.vtcm_q_tiles; @@ -1597,15 +1596,11 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { qk_job.n_dot_tiles = DK / 32; qk_job.n_tiles_per_bc = n_tiles_per_bc; qk_job.hmx_scales = factx.vtcm_hmx_scales_qk; - TIMER_START(qk_dot); hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_qk_dot_worker, &qk_job)); // DMA push next block (non-blocking, before worker_pool) DMA_PREFETCH_KV(kv_blk + 1); - - TIMER_START(v_interleave); fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc); - TIMER_STOP(v_interleave); // Pop and swap previous block's output update (deferred HMX pop) if (kv_blk > 0) { @@ -1615,7 +1610,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { // Pop current block's dot product job hmx_queue_pop(hmx_q); - TIMER_STOP(qk_dot); // ---- Phase 3: softmax(blk) + build_D(blk) | HMX idle ---- // Pop mask DMA before softmax (ensures VTCM buffer is ready) @@ -1641,10 +1635,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { sargs.mask_vtcm = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL; sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride; sargs.slopes = factx.vtcm_slopes; - - TIMER_START(softmax); fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br); - TIMER_STOP(softmax); buf_idx = 1 - buf_idx; } // end KV block loop (pipeline) @@ -1664,11 +1655,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { ou_job.n_row_tiles_g_br = n_row_tiles_g_br; ou_job.n_tiles_per_bc = n_tiles_per_bc; ou_job.DV = DV; - - TIMER_START(o_update); hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_o_update_worker, &ou_job)); hmx_queue_pop(hmx_q); - TIMER_STOP(o_update); hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev); } @@ -1683,23 +1671,14 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { const uint32_t kv_start = kv_blk * Bc; const uint32_t kv_rows = hex_smin(Bc, nek1 - kv_start); const size_t n_col_tiles = hmx_ceil_div(kv_rows, HMX_FP16_TILE_N_COLS); - - TIMER_START(kv_dma); dma_queue_pop(dma); // K dma_queue_pop(dma); // V - TIMER_STOP(kv_dma); bool has_mask_dma = false; MASK_DMA_PUSH(kv_start, kv_rows, has_mask_dma); DMA_PREFETCH_KV(kv_blk + 1); - - // K interleave (multi-thread HVX) - TIMER_START(k_interleave); fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx); - TIMER_STOP(k_interleave); - // QK dot (inline HMX on main thread) - TIMER_START(qk_dot); { const size_t n_dot_tiles = (size_t) (DK / 32); const __fp16 * restrict q_base = factx.vtcm_q_tiles; @@ -1709,6 +1688,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { __builtin_assume(n_col_tiles > 0); __builtin_assume(n_dot_tiles > 0); + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_qk); for (size_t r = 0; r < n_row_tiles; ++r) { for (size_t c = 0; c < n_col_tiles; ++c) { @@ -1724,8 +1704,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { Q6_mxmem_AR_after_hf(out_tile, 0); } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); } - TIMER_STOP(qk_dot); // Pop mask DMA MASK_DMA_POP(has_mask_dma); @@ -1751,21 +1731,9 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { sargs.mask_vtcm = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL; sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride; sargs.slopes = factx.vtcm_slopes; - - TIMER_START(softmax); fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br); - TIMER_STOP(softmax); - - // V interleave (multi-thread HVX) - TIMER_START(v_interleave); - // FIX(v-stride): use n_tiles_per_bc (block-invariant) as V tile layout - // stride to match o_update's v_tile access. Using per-block n_col_tiles - // misplaces DV_tile 1..3 in the last partial KV block. fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc); - TIMER_STOP(v_interleave); - // O update (inline HMX on main thread) - TIMER_START(o_update); { const size_t DV_tiles = (size_t) (DV / 32); const __fp16 * restrict d_base = factx.vtcm_d_tiles; @@ -1777,6 +1745,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { __builtin_assume(n_col_tiles > 0); __builtin_assume(DV_tiles > 0); + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_id); for (size_t r = 0; r < n_row_tiles; ++r) { for (size_t c = 0; c < DV_tiles; ++c) { @@ -1798,16 +1767,15 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { Q6_mxmem_AR_after_hf(o_tile_out, 0); } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev); } - TIMER_STOP(o_update); buf_idx = 1 - buf_idx; } // end KV block loop (fallback) } // ---- Final normalization: O = diag(1/l) @ O ---- - TIMER_START(o_norm); { fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br); @@ -1830,6 +1798,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { __builtin_assume(n_row_tiles > 0); __builtin_assume(DV_tiles > 0); + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_id); for (size_t r = 0; r < n_row_tiles; ++r) { for (size_t c = 0; c < DV_tiles; ++c) { @@ -1842,14 +1811,12 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { Q6_mxmem_AR_after_hf(o_out, 0); } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); } } - TIMER_STOP(o_norm); // ---- Store O block ---- - TIMER_START(o_store); fa_phase_o_store(&factx, dst, o_tile_curr, q_start, kv_head, ib3, n_rows_g); - TIMER_STOP(o_store); #undef MASK_DMA_PUSH #undef MASK_DMA_POP @@ -1865,14 +1832,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); } - TIMER_STOP(total); -#if defined(ENABLE_PROFILE_TIMERS) - FARF(HIGH, "hmx-fa: %lld us, q_load=%lld kv_dma=%lld k_interleave=%lld v_interleave=%lld", TIMER_US(total), - TIMER_US(q_load), TIMER_US(kv_dma), TIMER_US(k_interleave), TIMER_US(v_interleave)); - FARF(HIGH, " qk_dot=%lld softmax=%lld o_update=%lld o_norm=%lld o_store=%lld", TIMER_US(qk_dot), TIMER_US(softmax), - TIMER_US(o_update), TIMER_US(o_norm), TIMER_US(o_store)); -#endif return HTP_STATUS_OK; } diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c index dab605210..5c37f24ff 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c @@ -27,7 +27,7 @@ #include "hmx-ops.h" #include "hmx-utils.h" #include "hmx-queue.h" -#include "hmx-profile.h" +#include "hex-profile.h" #include "vtcm-utils.h" @@ -430,6 +430,7 @@ typedef struct { int n_tasks; int n_k_tiles; struct fastdiv_values n_k_tiles_div; + struct htp_thread_trace * traces; } x4x2_dequantize_state_t; // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16. @@ -533,11 +534,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix( \ static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) { \ x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; \ + struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL; \ + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); \ for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { \ int start = task_id * state->n_tiles_per_task; \ int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); \ dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end); \ } \ + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); \ } DEFINE_DEQUANTIZE_Q4_TASK(q4_0, q4_0_to_fp16_lut, q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16)) @@ -657,11 +661,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4( static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) { x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; + struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { int start = task_id * state->n_tiles_per_task; int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); } static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0( @@ -717,11 +724,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0( static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) { x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; + struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { int start = task_id * state->n_tiles_per_task; int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); } static void convert_f16_weight_to_fp16_tiles_task( @@ -773,11 +783,14 @@ static void convert_f16_weight_to_fp16_tiles_task( static void convert_f16_worker_loop(unsigned int n, unsigned int i, void *data) { x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; + struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { int start = task_id * state->n_tiles_per_task; int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); convert_f16_weight_to_fp16_tiles_task(state, start, end); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); } static void quantize_f32_weight_to_fp16_tiles_task( @@ -833,11 +846,14 @@ static void quantize_f32_weight_to_fp16_tiles_task( static void quantize_f32_worker_loop(unsigned int n, unsigned int i, void *data) { x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; + struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { int start = task_id * state->n_tiles_per_task; int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); quantize_f32_weight_to_fp16_tiles_task(state, start, end); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); } @@ -868,6 +884,7 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles( state.weight_type = weight_type; state.n_k_tiles = n_k_tiles; state.n_k_tiles_div = n_k_tiles_div; + state.traces = ctx ? ctx->trace : NULL; if (state.n_tasks == 1 || n_threads == 1) { dequant_worker_fn(1, 0, &state); @@ -985,10 +1002,13 @@ typedef struct { int n_chunks_per_task; int n_cols; int n; // DDR row stride (total output columns) + struct htp_thread_trace * traces; } output_transfer_task_state_t; static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void *data) { output_transfer_task_state_t *st = (output_transfer_task_state_t *) data; + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_O_PROC, i); for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) { int chunk_idx = task_id * st->n_chunks_per_task; @@ -998,6 +1018,7 @@ static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void const __fp16 *vtcm_src = st->vtcm_src + chunk_idx * st->n_cols; transfer_output_chunk_fp16_to_fp32(dst, vtcm_src, chunk_size, st->n_cols, st->n); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_O_PROC, i); } static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst, const __fp16 *vtcm_src, @@ -1015,6 +1036,7 @@ static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst, state.vtcm_src = vtcm_src; state.n_cols = n_cols; state.n = n; + state.traces = ctx ? ctx->trace : NULL; if (state.n_tasks == 1 || n_threads == 1) { transfer_output_chunk_worker_fn(1, 0, &state); @@ -1086,10 +1108,13 @@ typedef struct { int n_chunks_per_task; int k_block; int k_stride; + struct htp_thread_trace * traces; } activation_transfer_task_state_t; static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i, void *data) { activation_transfer_task_state_t *st = (activation_transfer_task_state_t *) data; + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, i); for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) { // one chunk: one row @@ -1100,6 +1125,7 @@ static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i, const float *src = st->src + chunk_idx * st->k_stride; transfer_activation_chunk_fp32_to_fp16(dst, src, chunk_size, st->k_block, st->k_stride); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, i); } static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *dst, const float *src, int n_rows, int k_block, int k_stride, int n_threads) { @@ -1117,6 +1143,7 @@ static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 * state.src = src; state.k_block = k_block; state.k_stride = k_stride; + state.traces = ctx ? ctx->trace : NULL; if (state.n_tasks == 1 || n_threads == 1) { transfer_activation_chunk_worker_fn(1, 0, &state); @@ -1245,13 +1272,7 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float FARF(HIGH, "hmx-mm-2d: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu", m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget); - TIMER_DEFINE(activation_load); - TIMER_DEFINE(weight_load); - TIMER_DEFINE(hmx_core); - TIMER_DEFINE(output_store); - TIMER_DEFINE(total); - TIMER_START(total); int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols); @@ -1370,7 +1391,12 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_scratch0, vtcm_weight, n_cols, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads); // C: HMX Compute (Synchronous) - core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS); + { + struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); + core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); + } // D: Output Store float *output_chunk = dst + (mr * n + nc); @@ -1380,18 +1406,7 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); } - TIMER_STOP(total); -#if defined(ENABLE_PROFILE_TIMERS) - FARF(HIGH, "hex-mm-2d: %lld us : m %d k %d n %d", TIMER_US(total), m, k, n); - if (!use_pipeline) { - FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us", - TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store)); - size_t weight_size = (size_t)n * row_stride; - float bandwidth = 1e-3f * weight_size / (float)TIMER_US(weight_load); - FARF(HIGH, " weight load bandwidth: %.2f GB/s", bandwidth); - } -#endif return 0; } @@ -1523,13 +1538,7 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32 m_chunk_n_rows, n_chunk_n_cols, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget); - TIMER_DEFINE(activation_load); - TIMER_DEFINE(weight_load); - TIMER_DEFINE(hmx_core); - TIMER_DEFINE(output_store); - TIMER_DEFINE(total); - TIMER_START(total); const size_t fp16_row_bytes = (size_t) params->k * sizeof(__fp16); const size_t weight_row_bytes = (size_t) params->weight_stride * sizeof(__fp16); @@ -1549,7 +1558,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32 // contiguous rows into a VTCM scratch buffer first, then HVX // converts from the contiguous VTCM buffer. This avoids L2 cache // thrashing from HVX loads at large strides. - TIMER_START(activation_load); for (int g = 0; g < group_size; ++g) { const float *activation_chunk = hmx_matmul_activation_batch_ptr(params, b2_base + g, b3) + mr * params->act_stride; __fp16 *vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride; @@ -1569,7 +1577,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32 params->k, params->act_stride, ctx->n_threads); } } - TIMER_STOP(activation_load); void *buf_curr = vtcm_scratch0; void *buf_next = vtcm_scratch1; @@ -1584,7 +1591,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32 const size_t n_cols = hex_smin((size_t) params->n - nc, n_chunk_n_cols); const size_t n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS); - TIMER_START(weight_load); { dma_queue_pop(ctx->dma[0]); @@ -1601,24 +1607,22 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32 0, n_cols); hex_swap_ptr(&buf_curr, &buf_next); } - TIMER_STOP(weight_load); // Reuse the interleaved weight for every q_head in this GQA group for (int g = 0; g < group_size; ++g) { - TIMER_START(hmx_core); { const __fp16 * vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride; + struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, params->k / 32); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); } - TIMER_STOP(hmx_core); - TIMER_START(output_store); { float *output = hmx_matmul_dst_batch_ptr(params, b2_base + g, b3) + mr * params->dst_stride + nc; transfer_output_chunk_threaded(ctx, output, vtcm_output, (int) n_rows, (int) n_cols, params->dst_stride, ctx->n_threads); } - TIMER_STOP(output_store); } } } @@ -1627,14 +1631,7 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32 HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); - TIMER_STOP(total); -#if defined(ENABLE_PROFILE_TIMERS) - FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d group=%d", __func__, TIMER_US(total), - params->m, params->k, params->n, group_size); - FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us", - TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store)); -#endif return 0; } @@ -1668,6 +1665,7 @@ typedef struct { size_t nb12; int start_row; int cne1; + struct htp_thread_trace *traces; } activation_transfer_gathered_task_state_t; typedef struct { @@ -1684,6 +1682,7 @@ typedef struct { size_t dst_nb2; int start_row; int cne1; + struct htp_thread_trace *traces; } output_transfer_scattered_task_state_t; static void transfer_activation_chunk_fp32_to_fp16_gathered( @@ -1780,6 +1779,9 @@ static void transfer_activation_chunk_fp32_to_fp16_gathered( static void transfer_activation_chunk_gathered_worker_fn(unsigned int n, unsigned int i, void *data) { activation_transfer_gathered_task_state_t *st = data; + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, i); + int chunk_idx = i; int chunk_size = st->n_chunks_per_task; int start_row = st->start_row + chunk_idx * chunk_size; @@ -1791,6 +1793,7 @@ static void transfer_activation_chunk_gathered_worker_fn(unsigned int n, unsigne st->matrix_rows, st->cur_a, st->mapping_stride, st->ne11, &st->ne11_div, st->nb11, st->nb12, st->cne1); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, i); } static void transfer_activation_chunk_gathered_threaded( @@ -1830,6 +1833,7 @@ static void transfer_activation_chunk_gathered_threaded( .nb12 = nb12, .start_row = start_row, .cne1 = cne1, + .traces = ctx ? ctx->trace : NULL, }; if (actual_threads <= 1) { @@ -1895,6 +1899,9 @@ static void transfer_output_chunk_fp16_to_fp32_scattered( static void transfer_output_chunk_scattered_worker_fn(unsigned int n, unsigned int i, void *data) { output_transfer_scattered_task_state_t *st = data; + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_O_PROC, i); + int chunk_idx = i; int chunk_size = st->n_chunks_per_task; int start_row = st->start_row + chunk_idx * chunk_size; @@ -1906,6 +1913,7 @@ static void transfer_output_chunk_scattered_worker_fn(unsigned int n, unsigned i st->matrix_rows, st->cur_a, st->mapping_stride, st->dst_nb1, st->dst_nb2, st->cne1); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_O_PROC, i); } static void transfer_output_chunk_scattered_threaded( @@ -1942,6 +1950,7 @@ static void transfer_output_chunk_scattered_threaded( .dst_nb2 = dst_nb2, .start_row = start_row, .cne1 = cne1, + .traces = ctx ? ctx->trace : NULL, }; if (actual_threads <= 1) { @@ -2053,7 +2062,12 @@ int hmx_matmul_id_2d_f32(struct htp_context *ctx, dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_scratch0, vtcm_weight, n_cols, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads); - core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS); + { + struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); + core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); + } transfer_output_chunk_scattered_threaded( ctx, dst, vtcm_output, (int) mr, (int) n_rows, (int) n_cols, diff --git a/ggml/src/ggml-hexagon/htp/hmx-profile.h b/ggml/src/ggml-hexagon/htp/hmx-profile.h deleted file mode 100644 index 01eece720..000000000 --- a/ggml/src/ggml-hexagon/htp/hmx-profile.h +++ /dev/null @@ -1,34 +0,0 @@ -// Conditional fine-grained profiling macros for HMX operations. -// -// Define ENABLE_PROFILE_TIMERS (via compiler flag or before including this -// header) to instrument sub-operation latencies with HAP qtimer. When the -// macro is not defined the TIMER_* helpers expand to nothing so there is zero -// overhead. -// -// Usage: -// TIMER_DEFINE(my_phase); // declare accumulator variable -// TIMER_START(my_phase); // snapshot start time -// ... work ... -// TIMER_STOP(my_phase); // accumulate elapsed ticks -// FARF(ALWAYS, "my_phase: %lld us", TIMER_US(my_phase)); - -#ifndef HMX_PROFILE_H -#define HMX_PROFILE_H - -#include - -// #define ENABLE_PROFILE_TIMERS - -#if defined(ENABLE_PROFILE_TIMERS) -# define TIMER_DEFINE(name) int64_t name##_ticks = 0 -# define TIMER_START(name) int64_t name##_t0 = HAP_perf_get_qtimer_count() -# define TIMER_STOP(name) name##_ticks += HAP_perf_get_qtimer_count() - name##_t0 -# define TIMER_US(name) HAP_perf_qtimer_count_to_us(name##_ticks) -#else -# define TIMER_DEFINE(name) -# define TIMER_START(name) -# define TIMER_STOP(name) -# define TIMER_US(name) 0LL -#endif - -#endif // HMX_PROFILE_H diff --git a/ggml/src/ggml-hexagon/htp/hmx-queue.c b/ggml/src/ggml-hexagon/htp/hmx-queue.c index 5b1d83a0c..a0007539c 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-queue.c +++ b/ggml/src/ggml-hexagon/htp/hmx-queue.c @@ -44,7 +44,9 @@ static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) { case HMX_QUEUE_SUSPEND: hmx_unlock(q); break; default: hmx_lock(q); + htp_trace_event_start(q->trace, HTP_TRACE_EVT_HMX_COMP, ir); d->func(d->data); + htp_trace_event_stop(q->trace, HTP_TRACE_EVT_HMX_COMP, ir); break; } diff --git a/ggml/src/ggml-hexagon/htp/hmx-queue.h b/ggml/src/ggml-hexagon/htp/hmx-queue.h index 0d48c280f..83135cd91 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-queue.h +++ b/ggml/src/ggml-hexagon/htp/hmx-queue.h @@ -11,6 +11,7 @@ #include #include "hex-utils.h" +#include "hex-profile.h" #ifdef __cplusplus extern "C" { @@ -47,6 +48,7 @@ struct hmx_queue { void * stack; uint32_t hap_rctx; bool hmx_locked; + struct htp_thread_trace * trace; }; struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx); diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 0f1676f07..cbb5d0878 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -4,6 +4,7 @@ #include "hex-dma.h" #include "hmx-queue.h" #include "htp-ops.h" +#include "hex-profile.h" #include "worker-pool.h" #include @@ -70,6 +71,7 @@ struct htp_context { bool hmx_enabled; bool etm; uint32_t profiler; + struct htp_thread_trace trace[HTP_MAX_NTHREADS + 1]; uint8_t * vtcm_base; size_t vtcm_size; diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index fa85bf4ca..0f4b74a93 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -146,10 +146,36 @@ struct htp_op_desc { uint16_t dst; // Output tensor index }; +#ifndef HTP_MAX_NTHREADS +#define HTP_MAX_NTHREADS 10 +#endif + +#define HTP_TRACE_MAX_EVENTS 256 + enum htp_profiler_mode { HTP_PROF_DISABLED = 0, HTP_PROF_BASIC = 1, HTP_PROF_PMU = 2, + HTP_PROF_TRACE = 3, +}; + +enum htp_trace_event_id { + HTP_TRACE_EVT_DMA = 0, + + HTP_TRACE_EVT_HVX_COMP = 20, + HTP_TRACE_EVT_HVX_A_QUANT = 21, + HTP_TRACE_EVT_HVX_A_PREP = 22, + HTP_TRACE_EVT_HVX_W_DEQUANT = 23, + HTP_TRACE_EVT_HVX_W_PREP = 24, + HTP_TRACE_EVT_HVX_O_PROC = 25, + + HTP_TRACE_EVT_HMX_COMP = 40, +}; + +struct htp_trace_desc { + uint32_t cycles; // lower 32-bits of cycle counter + uint16_t id; // Event ID + uint16_t info; // bit 15: is_stop. bits 14-0: tile/chunk index or other metadata. }; #define HTP_PROF_PMU_NCNT 8 @@ -158,8 +184,8 @@ enum htp_profiler_mode { struct htp_prof_desc { uint32_t opcode; // GGML/HTP Op uint32_t usecs; // Number of usec - uint32_t cycles; // Number of cycles - uint32_t pad; // Unused + uint32_t cycles_start; // Start cycle counter + uint32_t cycles_stop; // Stop cycle counter uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters }; @@ -168,7 +194,7 @@ struct htp_opbatch_req { uint32_t n_bufs; // Number of buffers uint32_t n_tensors; // Number of tensors uint32_t n_ops; // Number of ops - uint32_t flags; // unused + uint32_t n_traces; // Number of trace descriptors per thread uint32_t pad; // unused // struct htp_buf_desc bufs[]; -- dspqueue buf 0 // struct htp_tensor tensors[]; -- dspqueue buf 0 @@ -181,7 +207,8 @@ struct htp_opbatch_rsp { uint32_t n_bufs; // Number of buffers uint32_t n_tensors; // Number of tensors uint32_t n_ops; // Number of op profile descriptors - uint32_t pad; // unused + uint32_t n_traces[HTP_MAX_NTHREADS + 1]; + uint8_t pad[8]; // align to 8 bytes // struct htp_prof_desc profs[]; -- dspqueue buf 0 }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 3715227d2..53ab33c07 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -400,7 +400,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que ctx->hmx_queue = NULL; if (use_hmx) { ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx); - if (!ctx->hmx_queue) { + if (ctx->hmx_queue) { + ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS]; + } else { FARF(ERROR, "hmx-queue-create failed"); ctx->hmx_enabled = false; } @@ -425,6 +427,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que ctx->n_threads = n_hvx; for (int i = 0; i < ctx->n_threads; i++) { ctx->dma[i] = dma_queue_create(256); // queue depth + if (ctx->dma[i]) { + ctx->dma[i]->trace = &ctx->trace[i]; + } } ctx->ddr_spad_size = 512 * 1024; // 512 KB @@ -502,7 +507,8 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) { struct profile_data { uint64_t usecs; - uint64_t cycles; + uint64_t cycles_start; + uint64_t cycles_stop; uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS]; }; @@ -512,8 +518,9 @@ static inline void profile_start(uint32_t mode, struct profile_data * d) { hex_get_pmu(d->pmu_counters); // fallthrough case HTP_PROF_BASIC: + case HTP_PROF_TRACE: d->usecs = HAP_perf_get_qtimer_count(); - d->cycles = hex_get_cycles(); + d->cycles_start = hex_get_cycles(); break; default: break; @@ -530,8 +537,9 @@ static inline void profile_stop(uint32_t mode, struct profile_data * d) { } // fallthrough case HTP_PROF_BASIC: + case HTP_PROF_TRACE: d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); - d->cycles = hex_get_cycles() - d->cycles; + d->cycles_stop = hex_get_cycles(); break; default: break; @@ -845,14 +853,15 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { const uint32_t t_size = sizeof(struct htp_tensor) * n_tens; const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops; const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops; + const uint32_t tr_size = (HTP_MAX_NTHREADS + 1) * req.n_traces * sizeof(struct htp_trace_desc); - if (dbuf.size < b_size + t_size + o_size + p_size) { - FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size); + if (dbuf.size < b_size + t_size + o_size + p_size + tr_size) { + FARF(ERROR, "invalid opbatch memory block size %u (req %u)", dbuf.size, b_size + t_size + o_size + p_size + tr_size); break; } - FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id, - n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size); + FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u n-traces %u : m-size %u b-size %u t-size %u o-size %u", req.id, + n_bufs, n_tens, n_ops, req.n_traces, dbuf.size, b_size, t_size, o_size); // Setup descriptor pointers uint8_t * m_ptr = dbuf.ptr; @@ -869,6 +878,20 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { octx->n_threads = ctx->n_threads; octx->ctx = ctx; + if (ctx->profiler == HTP_PROF_TRACE) { + memset(ctx->trace, 0, sizeof(ctx->trace)); + struct htp_trace_desc * trace_events = (struct htp_trace_desc *) (m_ptr + p_size); + for (int t = 0; t <= HTP_MAX_NTHREADS; t++) { + ctx->trace[t].events = &trace_events[t * req.n_traces]; + ctx->trace[t].max_events = req.n_traces; + } + } else { + for (int t = 0; t <= HTP_MAX_NTHREADS; t++) { + ctx->trace[t].events = NULL; + ctx->trace[t].max_events = 0; + } + } + for (uint32_t i=0; i < n_ops; i++) { struct profile_data prof; @@ -886,7 +909,8 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { if (ctx->profiler) { pds[i].opcode = ops[i].opcode; pds[i].usecs = prof.usecs; - pds[i].cycles = prof.cycles; + pds[i].cycles_start = prof.cycles_start; + pds[i].cycles_stop = prof.cycles_stop; for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) { pds[i].pmu[j] = prof.pmu_counters[j]; } @@ -899,6 +923,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { rsp.n_bufs = n_bufs; rsp.n_tensors = n_tens; rsp.n_ops = n_ops; + memset(rsp.pad, 0, sizeof(rsp.pad)); + if (ctx->profiler == HTP_PROF_TRACE) { + for (int t = 0; t <= HTP_MAX_NTHREADS; t++) { + rsp.n_traces[t] = ctx->trace[t].count; + } + } else { + memset(rsp.n_traces, 0, sizeof(rsp.n_traces)); + } dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 5121c6f9b..8e016c1be 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -3350,6 +3350,7 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void * static void matmul_4d(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); @@ -3411,10 +3412,12 @@ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) { float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end); + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, iir0); for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) { const uint8_t * restrict src0_row = src0_base + ir0 * nb01; mmctx->vec_dot_1x1(ne00, &dst_col[ir0], src0_row, src1_col); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, iir0); } } } @@ -3430,6 +3433,7 @@ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) { // src1 tensor is already in VTCM spad static void matmul_2d(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows const uint32_t src1_nrows = ne11 * ne12 * ne13; // src1 rows @@ -3477,6 +3481,8 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) { for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); + // Process src1 columns in pairs (2×2 tiling) uint32_t ir1 = 0; for (; ir1 + 1 < src1_nrows; ir1 += 2) { @@ -3494,6 +3500,8 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) { mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_stride, src1_col); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); + // Prefetch next (n + spad_nrows) row const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS); const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; @@ -3511,12 +3519,14 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) { src0_stride, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); #pragma unroll(2) for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); } t2 = HAP_perf_get_qtimer_count(); @@ -3530,6 +3540,7 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) { // q8x4x2 src1 tensor is already in VTCM spad static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const uint32_t src0_nrows = ne01; @@ -3581,7 +3592,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { // Process src0 rows for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) { const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); mmctx->vec_dot_4x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, ss0 + 2 * src0_stride, ss0 + 3 * src0_stride, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); // Prefetch next (n + spad_nrows) row const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS); @@ -3599,7 +3612,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), src0_stride, src0_row_size, 2); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); ir0 += 2; } if (ir0 < src0_end_row) { @@ -3607,7 +3622,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), src0_stride, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); ir0 += 1; } } else { @@ -3627,7 +3644,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { // Process src0 rows for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); // Prefetch next (n + spad_nrows) row const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS); @@ -3645,7 +3664,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), src0_stride, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); } } @@ -3669,6 +3690,7 @@ struct mmid_row_mapping { // src1 tensor is already in VTCM spad static void matmul_id(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const struct htp_tensor * restrict ids = octx->src[2]; struct htp_spad * restrict src2_spad = &octx->src2_spad; @@ -3735,6 +3757,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) { for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); for (uint32_t cid = 0; cid < cne1; ++cid) { struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); const int rm1 = row_mapping.i1; // expert idx @@ -3746,6 +3769,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) { mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); // Prefetch next (n + spad_nrows) row const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS); @@ -3764,6 +3788,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) { src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); for (uint32_t cid = 0; cid < cne1; ++cid) { struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); const int rm1 = row_mapping.i1; // expert idx @@ -3775,6 +3800,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) { mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); } } @@ -3789,6 +3815,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) { // src1 tensor is already in VTCM spad static void matvec_id(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const struct htp_tensor * restrict ids = octx->src[2]; struct htp_spad * restrict src2_spad = &octx->src2_spad; @@ -3847,7 +3874,9 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) { // Process src0 rows for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); // Prefetch next (n + spad_nrows) row const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS); @@ -3865,7 +3894,9 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) { dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size), src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); } } @@ -4147,6 +4178,7 @@ static void quantize_row_f32_q8x4x2(float * restrict x, uint8_t * restrict y, ui static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) { struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const struct htp_tensor * src = octx->src[1]; uint8_t * restrict dst = octx->src1_spad.data; @@ -4163,6 +4195,7 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows const uint32_t ir_first = nrows_per_thread * ith; // first row + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row const size_t src_row_size = src->nb[1]; @@ -4189,6 +4222,7 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) FARF(HIGH, "quantize-f32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); } static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { @@ -4219,6 +4253,7 @@ static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y, static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * data) { struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const struct htp_tensor * src = octx->src[1]; uint8_t * restrict dst = octx->src1_spad.data; @@ -4235,6 +4270,7 @@ static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * dat const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows const uint32_t ir_first = nrows_per_thread * ith; // first row + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row const size_t src_row_size = src->nb[1]; @@ -4260,11 +4296,13 @@ static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * dat FARF(HIGH, "quantize-f32-q8_1x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); } static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) { struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const struct htp_tensor * src = octx->src[1]; uint8_t * restrict dst = octx->src1_spad.data; @@ -4281,6 +4319,7 @@ static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) { const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows const uint32_t ir_first = nrows_per_thread * ith; // first row + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row const size_t src_row_size = ne0 * sizeof(float); @@ -4301,11 +4340,13 @@ static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) { FARF(HIGH, "quantize-f32-f32: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); } static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) { struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const struct htp_tensor * src = octx->src[1]; uint8_t * restrict dst = octx->src1_spad.data; @@ -4322,6 +4363,7 @@ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) { const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows const uint32_t ir_first = nrows_per_thread * ith; // first row + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row const size_t src_row_size = ne0 * sizeof(float); @@ -4342,12 +4384,14 @@ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) { FARF(HIGH, "quantize-f32-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); } // TODO just a plain copy that should be done via the DMA during the Op setup static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) { struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; const struct htp_tensor * src = octx->src[1]; uint8_t * restrict dst = octx->src1_spad.data; @@ -4364,6 +4408,7 @@ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) { const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows const uint32_t ir_first = nrows_per_thread * ith; // first row + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row const size_t src_row_size = ne0 * sizeof(float); @@ -4384,6 +4429,7 @@ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) { FARF(HIGH, "quantize-f16-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); }