From 5cbc2153c253e32103be889c0d82d361fb78d819 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qti.qualcomm.com>
Date: Thu, 18 Jun 2026 08:35:02 -0700
Subject: [PATCH] hexagon: support for op-trace (fine-grain tracing of
 HVX/HMX/DMA events) (llama/24592)

* hex-optrace: add support for optrace and instrument matmul and flash-atten code

* hex-trace: improve trace event and prefetto generator

* hex-trace: add new script dedicated to handling traces, specifically perfetto traces

* hex-trace: add --head/--tail options to profile and trace tools

* hex-trace: fix whitespaces

* hex-trace: fix flake8 warnings

* hex-trace: fix flake8 warnings

* hmx-fa: restore q_tiles clearing

* hex-profile: remove circular dep in includes

* hex-trace: simplify trace sizing check

* hex-profile: sort events in the summary by name
---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp        | 116 ++++++++++++++++--
 ggml/src/ggml-hexagon/htp/CMakeLists.txt      |   2 +-
 ggml/src/ggml-hexagon/htp/flash-attn-ops.c    |   4 +
 ggml/src/ggml-hexagon/htp/hex-dma.h           |  13 +-
 ggml/src/ggml-hexagon/htp/hex-profile.h       |  64 ++++++++++
 ggml/src/ggml-hexagon/htp/hex-utils.h         |  27 ----
 .../src/ggml-hexagon/htp/hmx-flash-attn-ops.c |  92 ++++----------
 ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c    |  96 ++++++++-------
 ggml/src/ggml-hexagon/htp/hmx-profile.h       |  34 -----
 ggml/src/ggml-hexagon/htp/hmx-queue.c         |   2 +
 ggml/src/ggml-hexagon/htp/hmx-queue.h         |   2 +
 ggml/src/ggml-hexagon/htp/htp-ctx.h           |   2 +
 ggml/src/ggml-hexagon/htp/htp-ops.h           |  35 +++++-
 ggml/src/ggml-hexagon/htp/main.c              |  50 ++++++--
 ggml/src/ggml-hexagon/htp/matmul-ops.c        |  46 +++++++
 15 files changed, 388 insertions(+), 197 deletions(-)
 create mode 100644 ggml/src/ggml-hexagon/htp/hex-profile.h
 delete mode 100644 ggml/src/ggml-hexagon/htp/hmx-profile.h

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 49bd7e433..e612ec392 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -69,6 +69,7 @@ static int opt_opstage  = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
 static int opt_opbatch  = 1024; // max number of ops in a batch
 static int opt_opqueue  = 16;   // max number of pending batches
 static int opt_oppoll   = 0;    // polling for batch completions
+static int opt_optrace  = 0;    // trace buffer size per thread (0 means default)
 
 static std::regex* opt_opfilter = NULL; // regex of ops to not claim
 
@@ -118,20 +119,39 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
                 ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
 }
 
+static const char * htp_event_name(uint16_t id) {
+    switch (id) {
+        case HTP_TRACE_EVT_DMA:            return "DMA";
+        case HTP_TRACE_EVT_HVX_COMP:       return "HVX_COMP";
+        case HTP_TRACE_EVT_HVX_A_QUANT:    return "HVX_A_QUANT";
+        case HTP_TRACE_EVT_HVX_A_PREP:     return "HVX_A_PREP";
+        case HTP_TRACE_EVT_HVX_W_DEQUANT:  return "HVX_W_DEQUANT";
+        case HTP_TRACE_EVT_HVX_W_PREP:     return "HVX_W_PREP";
+        case HTP_TRACE_EVT_HVX_O_PROC:     return "HVX_O_PROC";
+        case HTP_TRACE_EVT_HMX_COMP:       return "HMX_COMP";
+        default:                           return "UNKNOWN";
+    }
+}
+
 static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
-                                      uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
+                                      const htp_prof_desc & pd) {
     if (!opt_profile) return;
 
+    uint32_t op_usec = pd.usecs;
+    uint32_t op_cycles = pd.cycles_stop - pd.cycles_start;
+    const uint32_t * pmu = pd.pmu;
+
     char pmu_str[256] = "";
-    if (opt_profile > 1) {
+    if (opt_profile == 2) {
         static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
         sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
                 pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
     }
 
     htp_opformat fmt(node);
-    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
-            node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
+    float mhz = op_usec > 0 ? (float) op_cycles / op_usec : 0.0f;
+    GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u start %u mhz %.1f%s\n", sess_name.c_str(),
+            node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pd.cycles_start, mhz, pmu_str);
 }
 
 // ** backend sessions
@@ -1995,10 +2015,16 @@ struct ggml_hexagon_opqueue {
         size_t n_ops     = batch_size;
         size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
 
+        size_t tr_size = 0;
+        if (opt_profile == 3) {
+            tr_size = (HTP_MAX_NTHREADS + 1) * opt_optrace * sizeof(htp_trace_desc);
+        }
+
         shm_blk_size = sizeof(htp_buf_desc)  * n_bufs    +
                        sizeof(htp_tensor)    * n_tensors +
                        sizeof(htp_op_desc)   * n_ops     +
-                       sizeof(htp_prof_desc) * n_ops;
+                       sizeof(htp_prof_desc) * n_ops     +
+                       tr_size;
 
         shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
 
@@ -2042,11 +2068,19 @@ struct ggml_hexagon_opqueue {
         const size_t o_size = sizeof(htp_op_desc)   * req.n_ops;
         const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
 
+        size_t tr_size = 0;
+        if (opt_profile == 3) {
+            req.n_traces = opt_optrace;
+            tr_size = (HTP_MAX_NTHREADS + 1) * req.n_traces * sizeof(htp_trace_desc);
+        } else {
+            req.n_traces = 0;
+        }
+
         dbuf.ptr      = shm_buf->base + (req.id * shm_blk_size);
         dbuf.fd       = shm_buf->fd;
         dbuf.flags    = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
         dbuf.offset   = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
-        dbuf.size     = b_size + t_size + o_size + p_size;
+        dbuf.size     = b_size + t_size + o_size + p_size + tr_size;
 
         GGML_ASSERT(dbuf.size <= shm_blk_size);
 
@@ -2092,7 +2126,14 @@ struct ggml_hexagon_opqueue {
         const size_t o_size = sizeof(htp_op_desc)   * rsp.n_ops;
         const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
 
-        const size_t m_size = b_size + t_size + o_size + p_size;
+        size_t tr_size = 0;
+        uint32_t n_traces = 0;
+        if (opt_profile == 3) {
+            n_traces = opt_optrace;
+            tr_size = (HTP_MAX_NTHREADS + 1) * n_traces * sizeof(htp_trace_desc);
+        }
+
+        const size_t m_size = b_size + t_size + o_size + p_size + tr_size;
         GGML_ASSERT(m_size <= shm_blk_size);
 
         HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
@@ -2111,13 +2152,62 @@ struct ggml_hexagon_opqueue {
             GGML_ASSERT(rsp.n_ops <= ops.size());
 
             const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
-            for (uint32_t i = 0; i < rsp.n_ops; i++) {
-                htp_usec += pd[i].usecs;
-                ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
+
+            const htp_trace_desc * trace_events = nullptr;
+
+            if (opt_profile == 3) {
+                trace_events = (const htp_trace_desc *) (p_ptr + p_size);
             }
 
-            GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
-                           shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
+            uint32_t trace_idx[HTP_MAX_NTHREADS + 1] = {0};
+            uint32_t valid_cnt[HTP_MAX_NTHREADS + 1] = {0};
+
+            if (opt_profile == 3) {
+                for (uint32_t t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                    uint32_t count = rsp.n_traces[t];
+                    valid_cnt[t] = count > n_traces ? n_traces : count;
+                }
+            }
+
+            for (uint32_t i = 0; i < rsp.n_ops; i++) {
+                htp_usec += pd[i].usecs;
+
+                ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i]);
+
+                if (opt_profile == 3) {
+                    uint32_t op_duration = pd[i].cycles_stop - pd[i].cycles_start;
+
+                    for (uint32_t t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                        while (trace_idx[t] < valid_cnt[t]) {
+                            const auto & e = trace_events[t * n_traces + trace_idx[t]];
+                            uint32_t offset = e.cycles - pd[i].cycles_start;
+                            if (offset >= 0x80000000) {
+                                trace_idx[t]++;
+                                continue;
+                            }
+                            if (offset > op_duration) {
+                                break;
+                            }
+                            bool is_stop = (e.info & 0x8000) != 0;
+                            uint16_t info = e.info & 0x7FFF;
+                            GGML_LOG_DEBUG("ggml-hex: %s trace-op %s: thread %u event %s info %u %s %u\n",
+                                           shm_buf->sess->c_name(), ops[i].op_name().c_str(), t, htp_event_name(e.id), info, is_stop ? "stop" : "start", e.cycles);
+                            trace_idx[t]++;
+                        }
+                    }
+                }
+            }
+
+            char evt_str[256] = "";
+            if (opt_profile == 3) {
+                sprintf(evt_str, " evt [%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u]",
+                        rsp.n_traces[0], rsp.n_traces[1], rsp.n_traces[2], rsp.n_traces[3],
+                        rsp.n_traces[4], rsp.n_traces[5], rsp.n_traces[6], rsp.n_traces[7],
+                        rsp.n_traces[8], rsp.n_traces[9], rsp.n_traces[10]);
+            }
+
+            GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u%s\n",
+                           shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec, evt_str);
         }
     }
 };
@@ -3901,6 +3991,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
     const char * str_opbatch  = getenv("GGML_HEXAGON_OPBATCH");
     const char * str_opqueue  = getenv("GGML_HEXAGON_OPQUEUE");
     const char * str_oppoll   = getenv("GGML_HEXAGON_OPPOLL");
+    const char * str_optrace  = getenv("GGML_HEXAGON_OPTRACE");
     const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
     const char * str_profile  = getenv("GGML_HEXAGON_PROFILE");
     const char * str_etm      = getenv("GGML_HEXAGON_ETM");
@@ -3939,6 +4030,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
     opt_opbatch   = str_opbatch  ? strtoul(str_opbatch, NULL, 0)          : opt_opbatch;
     opt_opqueue   = str_opqueue  ? strtoul(str_opqueue, NULL, 0)          : opt_opqueue;
     opt_oppoll    = str_oppoll   ? strtoul(str_oppoll,  NULL, 0)          : opt_oppoll;
+    opt_optrace   = str_optrace  ? strtoul(str_optrace, NULL, 0)          : (opt_opbatch * 128);
     opt_profile   = str_profile  ? atoi(str_profile)                      : 0;
     opt_etm       = str_etm      ? atoi(str_etm)                          : 0;
     opt_nhvx      = str_nhvx     ? strtoul(str_nhvx, NULL, 0)             : opt_nhvx;
diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
index f4b44fe1a..31ba52762 100644
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -37,8 +37,8 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
 
 if (_hmx_idx GREATER_EQUAL 0)
     target_sources(${HTP_LIB} PRIVATE
-        hmx-matmul-ops.c
         hmx-flash-attn-ops.c
+        hmx-matmul-ops.c
         hmx-queue.c
     )
 
diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
index e99621469..b7511cd64 100644
--- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
@@ -339,6 +339,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
 
     if (ir0 >= ir1) return;
 
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
+
     dma_queue * dma = octx->ctx->dma[ith];
 
     const uint32_t DK = nek0;
@@ -615,6 +618,7 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
             hvx_copy_f16_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
         }
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
 }
 
 int op_flash_attn_ext(struct htp_ops_context * octx) {
diff --git a/ggml/src/ggml-hexagon/htp/hex-dma.h b/ggml/src/ggml-hexagon/htp/hex-dma.h
index 7685473f4..93c21ebe5 100644
--- a/ggml/src/ggml-hexagon/htp/hex-dma.h
+++ b/ggml/src/ggml-hexagon/htp/hex-dma.h
@@ -6,6 +6,8 @@
 #include <stdbool.h>
 #include <stdint.h>
 
+#include "hex-profile.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -88,6 +90,7 @@ typedef struct {
     uint32_t            pop_idx;
     uint32_t            capacity;
     uint32_t            idx_mask;
+    struct htp_thread_trace * trace;
 } dma_queue;
 
 dma_queue * dma_queue_create(size_t capacity);
@@ -152,6 +155,7 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t
     q->dptr[q->push_idx] = dptr;
 
     if (size) {
+        htp_trace_event_start(q->trace, HTP_TRACE_EVT_DMA, q->push_idx);
         dmlink(q->tail, desc);
         q->tail = (dma_descriptor_2d *) desc;
     } else {
@@ -202,6 +206,7 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t
     q->dptr[q->push_idx] = dptr;
 
     if (nrows) {
+        htp_trace_event_start(q->trace, HTP_TRACE_EVT_DMA, q->push_idx);
         dmlink(q->tail, desc);
         q->tail = desc;
     } else {
@@ -223,10 +228,12 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) {
     dma_descriptor_2d * desc = &q->desc[q->pop_idx];
 
     // Wait for desc to complete
-    while (!desc->done) {
-        // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
-        dmpoll();
+    if (!desc->done) {
+        while (!desc->done) {
+            dmpoll();
+        }
     }
+    htp_trace_event_stop(q->trace, HTP_TRACE_EVT_DMA, q->pop_idx);
 
     dptr = q->dptr[q->pop_idx];
 
diff --git a/ggml/src/ggml-hexagon/htp/hex-profile.h b/ggml/src/ggml-hexagon/htp/hex-profile.h
new file mode 100644
index 000000000..8a37a4a06
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp/hex-profile.h
@@ -0,0 +1,64 @@
+#ifndef HEX_PROFILE_H
+#define HEX_PROFILE_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <qurt.h>
+
+#include "hex-utils.h"
+#include "htp-ops.h"
+
+#define HTP_TRACE_EVT_START 0
+#define HTP_TRACE_EVT_STOP  1
+
+#ifndef HEX_NUM_PMU_COUNTERS
+#define HEX_NUM_PMU_COUNTERS 8
+#endif
+
+static inline void hex_get_pmu(uint32_t counters[]) {
+#if __HVX_ARCH__ >= 79
+    asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
+    asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
+    asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
+    asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
+    asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
+    asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
+    asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
+    asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
+#else
+    counters[0] = qurt_pmu_get(QURT_PMUCNT0);
+    counters[1] = qurt_pmu_get(QURT_PMUCNT1);
+    counters[2] = qurt_pmu_get(QURT_PMUCNT2);
+    counters[3] = qurt_pmu_get(QURT_PMUCNT3);
+    counters[4] = qurt_pmu_get(QURT_PMUCNT4);
+    counters[5] = qurt_pmu_get(QURT_PMUCNT5);
+    counters[6] = qurt_pmu_get(QURT_PMUCNT6);
+    counters[7] = qurt_pmu_get(QURT_PMUCNT7);
+#endif
+}
+
+struct htp_thread_trace {
+    uint32_t count;
+    uint32_t max_events;
+    struct htp_trace_desc * events;
+};
+
+static inline void htp_trace_event(struct htp_thread_trace * tr, uint16_t id, uint16_t info, uint32_t type) {
+    if (tr && tr->events && tr->count < tr->max_events) {
+        uint32_t idx = tr->count;
+        tr->events[idx].id = id;
+        tr->events[idx].info = info | (type == HTP_TRACE_EVT_STOP ? 0x8000 : 0);
+        tr->events[idx].cycles = (uint32_t) hex_get_cycles();
+        tr->count++;
+    }
+}
+
+static inline void htp_trace_event_start(struct htp_thread_trace * tr, uint16_t id, uint16_t info) {
+    htp_trace_event(tr, id, info, HTP_TRACE_EVT_START);
+}
+
+static inline void htp_trace_event_stop(struct htp_thread_trace * tr, uint16_t id, uint16_t info) {
+    htp_trace_event(tr, id, info, HTP_TRACE_EVT_STOP);
+}
+
+#endif /* HEX_PROFILE_H */
diff --git a/ggml/src/ggml-hexagon/htp/hex-utils.h b/ggml/src/ggml-hexagon/htp/hex-utils.h
index 6239ceff4..8e6e3ea75 100644
--- a/ggml/src/ggml-hexagon/htp/hex-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hex-utils.h
@@ -107,31 +107,4 @@ static inline void hex_pause() {
     asm volatile(" pause(#255)\n");
 }
 
-#ifndef HEX_NUM_PMU_COUNTERS
-#define HEX_NUM_PMU_COUNTERS 8
-#endif
-
-static inline void hex_get_pmu(uint32_t counters[]) {
-#if __HVX_ARCH__ >= 79
-    asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
-    asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
-    asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
-    asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
-    asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
-    asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
-    asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
-    asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
-#else
-    counters[0] = qurt_pmu_get(QURT_PMUCNT0);
-    counters[1] = qurt_pmu_get(QURT_PMUCNT1);
-    counters[2] = qurt_pmu_get(QURT_PMUCNT2);
-    counters[3] = qurt_pmu_get(QURT_PMUCNT3);
-    counters[4] = qurt_pmu_get(QURT_PMUCNT4);
-    counters[5] = qurt_pmu_get(QURT_PMUCNT5);
-    counters[6] = qurt_pmu_get(QURT_PMUCNT6);
-    counters[7] = qurt_pmu_get(QURT_PMUCNT7);
-    // qurt_pmu_get_pmucnt(counters);
-#endif
-}
-
 #endif /* HEX_UTILS_H */
diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
index 2796564fb..986dde148 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
@@ -18,7 +18,7 @@
 #include "ggml-common.h"
 #include "hex-dma.h"
 #include "hex-fastdiv.h"
-#include "hmx-profile.h"
+#include "hex-profile.h"
 #include "hmx-queue.h"
 #include "hmx-utils.h"
 #include "htp-ctx.h"
@@ -367,8 +367,11 @@ static void fa_k_interleave_thread(unsigned int n, unsigned int i, void * data)
         return;
     }
 
+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
     hmx_interleave_rows_to_tiles(factx->vtcm_k_tiles, factx->vtcm_k_fp16[args->buf_idx], total_rows, (int) factx->DK,
                              (int) args->src_stride, start, end);
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start);
 }
 
 static void fa_phase_k_interleave(struct hmx_fa_context * factx, int kv_rows, size_t src_stride, size_t buf_idx) {
@@ -408,8 +411,11 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data)
 
     __fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
 
+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
     hmx_interleave_cols_to_tiles(v_tiles_dest, factx->vtcm_v_fp16[args->buf_idx], total_rows, (int) factx->DV,
                              (int) args->src_stride, (int) args->n_col_tiles, start, end);
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start);
 }
 
 static void fa_phase_v_interleave(struct hmx_fa_context * factx,
@@ -462,6 +468,9 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) {
         return;
     }
 
+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
+
     const struct htp_tensor * q       = args->q;
     const uint32_t            q_start = args->q_start;
     const uint32_t            kv_head = args->kv_head;
@@ -515,6 +524,7 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) {
             }
         }
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start);
 }
 
 static void fa_phase_q_load(struct hmx_fa_context *   factx,
@@ -566,6 +576,9 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) {
         return;
     }
 
+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
+
     const struct htp_tensor * dst        = args->dst;
     const __fp16 *            o_tile_src = args->o_tile_src;
     const uint32_t            q_start    = args->q_start;
@@ -611,6 +624,7 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) {
             }
         }
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start);
 }
 
 static void fa_phase_o_store(struct hmx_fa_context *   factx,
@@ -680,6 +694,9 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
         return;
     }
 
+    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, vec_start);
+
     // Per-thread row scratch: thread i uses bufs at offset i * 2 * stride
     const size_t row_buf_stride = factx->row_buf_stride;
     HVX_Vector * my_row_buf0    = factx->vtcm_row_bufs + i * 2 * row_buf_stride;
@@ -950,6 +967,7 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
         factx->vtcm_s_rowmax[r_vec_idx] = rowmax_acc_v;
         factx->vtcm_p_rowsum[r_vec_idx] = rowsum_acc_v;
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, vec_start);
 }
 
 // Serial m/l update + build_D.  Must run after softmax barrier (s_rowmax written by all threads).
@@ -1245,6 +1263,7 @@ static __attribute__((noinline)) void fa_compute_slopes(
 // ============================================================================
 
 int hmx_flash_attn_ext(struct htp_ops_context * octx) {
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[HTP_MAX_NTHREADS] : NULL;
     const struct htp_tensor * q    = octx->src[0];
     const struct htp_tensor * k    = octx->src[1];
     const struct htp_tensor * v    = octx->src[2];
@@ -1422,19 +1441,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
         return HTP_STATUS_OK;
     }
 
-    // Profiling timers
-    TIMER_DEFINE(total);
-    TIMER_DEFINE(q_load);
-    TIMER_DEFINE(kv_dma);
-    TIMER_DEFINE(k_interleave);
-    TIMER_DEFINE(v_interleave);
-    TIMER_DEFINE(qk_dot);
-    TIMER_DEFINE(softmax);
-    TIMER_DEFINE(o_update);
-    TIMER_DEFINE(o_norm);
-    TIMER_DEFINE(o_store);
-
-    TIMER_START(total);
 
     // ======== DMA setup ========
     dma_queue * const dma = ctx->dma[0];
@@ -1474,12 +1480,10 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                 const size_t   n_row_tiles = g_br_actual / HMX_FP16_TILE_N_ROWS;
 
                 // ---- Load Q block [g_br, D] -> tiles, interleaving G heads ----
-                TIMER_START(q_load);
                 if (n_rows_g < g_br) {
                     hvx_splat_u8_a(factx.vtcm_q_tiles, 0, q_tile_bytes);
                 }
                 fa_phase_q_load(&factx, q, q_start, kv_head, ib3, n_rows_g);
-                TIMER_STOP(q_load);
 
                 // ---- Initialize per-block state ----
                 hvx_splat_u8_a(factx.vtcm_l_vec,   0,      col_vec_bytes);
@@ -1558,10 +1562,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                         const size_t   n_col_tiles = hmx_ceil_div(kv_rows, HMX_FP16_TILE_N_COLS);
 
                         // Wait for current KV DMA
-                        TIMER_START(kv_dma);
                         dma_queue_pop(dma);  // K
                         dma_queue_pop(dma);  // V
-                        TIMER_STOP(kv_dma);
 
                         // Push mask DMA for this block (single 2D DMA when broadcast)
                         bool has_mask_dma = false;
@@ -1583,10 +1585,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                             ou_job.DV               = DV;
                             hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_o_update_worker, &ou_job));
                         }
-
-                        TIMER_START(k_interleave);
                         fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx);
-                        TIMER_STOP(k_interleave);
 
                         // ---- Phase 2: qk_dot(blk) on HMX ‖ V_int(blk) + DMA prefetch on HVX ----
                         qk_job.q_tiles        = factx.vtcm_q_tiles;
@@ -1597,15 +1596,11 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                         qk_job.n_dot_tiles    = DK / 32;
                         qk_job.n_tiles_per_bc = n_tiles_per_bc;
                         qk_job.hmx_scales     = factx.vtcm_hmx_scales_qk;
-                        TIMER_START(qk_dot);
                         hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_qk_dot_worker, &qk_job));
 
                         // DMA push next block (non-blocking, before worker_pool)
                         DMA_PREFETCH_KV(kv_blk + 1);
-
-                        TIMER_START(v_interleave);
                         fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc);
-                        TIMER_STOP(v_interleave);
 
                         // Pop and swap previous block's output update (deferred HMX pop)
                         if (kv_blk > 0) {
@@ -1615,7 +1610,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
 
                         // Pop current block's dot product job
                         hmx_queue_pop(hmx_q);
-                        TIMER_STOP(qk_dot);
 
                         // ---- Phase 3: softmax(blk) + build_D(blk) | HMX idle ----
                         // Pop mask DMA before softmax (ensures VTCM buffer is ready)
@@ -1641,10 +1635,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                         sargs.mask_vtcm            = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL;
                         sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride;
                         sargs.slopes               = factx.vtcm_slopes;
-
-                        TIMER_START(softmax);
                         fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br);
-                        TIMER_STOP(softmax);
 
                         buf_idx = 1 - buf_idx;
                     }  // end KV block loop (pipeline)
@@ -1664,11 +1655,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                         ou_job.n_row_tiles_g_br = n_row_tiles_g_br;
                         ou_job.n_tiles_per_bc   = n_tiles_per_bc;
                         ou_job.DV               = DV;
-
-                        TIMER_START(o_update);
                         hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_o_update_worker, &ou_job));
                         hmx_queue_pop(hmx_q);
-                        TIMER_STOP(o_update);
 
                         hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev);
                     }
@@ -1683,23 +1671,14 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                         const uint32_t kv_start    = kv_blk * Bc;
                         const uint32_t kv_rows     = hex_smin(Bc, nek1 - kv_start);
                         const size_t   n_col_tiles = hmx_ceil_div(kv_rows, HMX_FP16_TILE_N_COLS);
-
-                        TIMER_START(kv_dma);
                         dma_queue_pop(dma);  // K
                         dma_queue_pop(dma);  // V
-                        TIMER_STOP(kv_dma);
 
                         bool has_mask_dma = false;
                         MASK_DMA_PUSH(kv_start, kv_rows, has_mask_dma);
                         DMA_PREFETCH_KV(kv_blk + 1);
-
-                        // K interleave (multi-thread HVX)
-                        TIMER_START(k_interleave);
                         fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx);
-                        TIMER_STOP(k_interleave);
 
-                        // QK dot (inline HMX on main thread)
-                        TIMER_START(qk_dot);
                         {
                             const size_t n_dot_tiles       = (size_t) (DK / 32);
                             const __fp16 * restrict q_base = factx.vtcm_q_tiles;
@@ -1709,6 +1688,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                             __builtin_assume(n_col_tiles > 0);
                             __builtin_assume(n_dot_tiles > 0);
 
+                            htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                             Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_qk);
                             for (size_t r = 0; r < n_row_tiles; ++r) {
                                 for (size_t c = 0; c < n_col_tiles; ++c) {
@@ -1724,8 +1704,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                                     Q6_mxmem_AR_after_hf(out_tile, 0);
                                 }
                             }
+                            htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                         }
-                        TIMER_STOP(qk_dot);
 
                         // Pop mask DMA
                         MASK_DMA_POP(has_mask_dma);
@@ -1751,21 +1731,9 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                         sargs.mask_vtcm            = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL;
                         sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride;
                         sargs.slopes               = factx.vtcm_slopes;
-
-                        TIMER_START(softmax);
                         fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br);
-                        TIMER_STOP(softmax);
-
-                        // V interleave (multi-thread HVX)
-                        TIMER_START(v_interleave);
-                        // FIX(v-stride): use n_tiles_per_bc (block-invariant) as V tile layout
-                        // stride to match o_update's v_tile access.  Using per-block n_col_tiles
-                        // misplaces DV_tile 1..3 in the last partial KV block.
                         fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc);
-                        TIMER_STOP(v_interleave);
 
-                        // O update (inline HMX on main thread)
-                        TIMER_START(o_update);
                         {
                             const size_t DV_tiles           = (size_t) (DV / 32);
                             const __fp16 * restrict d_base  = factx.vtcm_d_tiles;
@@ -1777,6 +1745,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                             __builtin_assume(n_col_tiles > 0);
                             __builtin_assume(DV_tiles > 0);
 
+                            htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                             Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_id);
                             for (size_t r = 0; r < n_row_tiles; ++r) {
                                 for (size_t c = 0; c < DV_tiles; ++c) {
@@ -1798,16 +1767,15 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                                     Q6_mxmem_AR_after_hf(o_tile_out, 0);
                                 }
                             }
+                            htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                             hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev);
                         }
-                        TIMER_STOP(o_update);
 
                         buf_idx = 1 - buf_idx;
                     }  // end KV block loop (fallback)
                 }
 
                 // ---- Final normalization: O = diag(1/l) @ O ----
-                TIMER_START(o_norm);
                 {
                     fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br);
 
@@ -1830,6 +1798,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                         __builtin_assume(n_row_tiles > 0);
                         __builtin_assume(DV_tiles > 0);
 
+                        htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                         Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_id);
                         for (size_t r = 0; r < n_row_tiles; ++r) {
                             for (size_t c = 0; c < DV_tiles; ++c) {
@@ -1842,14 +1811,12 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                                 Q6_mxmem_AR_after_hf(o_out, 0);
                             }
                         }
+                        htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                     }
                 }
-                TIMER_STOP(o_norm);
 
                 // ---- Store O block ----
-                TIMER_START(o_store);
                 fa_phase_o_store(&factx, dst, o_tile_curr, q_start, kv_head, ib3, n_rows_g);
-                TIMER_STOP(o_store);
 
 #undef MASK_DMA_PUSH
 #undef MASK_DMA_POP
@@ -1865,14 +1832,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
         HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
     }
 
-    TIMER_STOP(total);
 
-#if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "hmx-fa: %lld us, q_load=%lld kv_dma=%lld k_interleave=%lld v_interleave=%lld", TIMER_US(total),
-         TIMER_US(q_load), TIMER_US(kv_dma), TIMER_US(k_interleave), TIMER_US(v_interleave));
-    FARF(HIGH, "  qk_dot=%lld softmax=%lld o_update=%lld o_norm=%lld o_store=%lld", TIMER_US(qk_dot), TIMER_US(softmax),
-         TIMER_US(o_update), TIMER_US(o_norm), TIMER_US(o_store));
-#endif
 
     return HTP_STATUS_OK;
 }
diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
index dab605210..5c37f24ff 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -27,7 +27,7 @@
 #include "hmx-ops.h"
 #include "hmx-utils.h"
 #include "hmx-queue.h"
-#include "hmx-profile.h"
+#include "hex-profile.h"
 
 #include "vtcm-utils.h"
 
@@ -430,6 +430,7 @@ typedef struct {
     int                      n_tasks;
     int                      n_k_tiles;
     struct fastdiv_values    n_k_tiles_div;
+    struct htp_thread_trace * traces;
 } x4x2_dequantize_state_t;
 
 // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
@@ -533,11 +534,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(
                                                                                                                \
 static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) {                 \
     x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;                                          \
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;                                   \
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);                                                 \
     for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {                     \
         int start = task_id * state->n_tiles_per_task;                                                         \
         int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);                             \
         dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end);                                 \
     }                                                                                                          \
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);                                                 \
 }
 
 DEFINE_DEQUANTIZE_Q4_TASK(q4_0,   q4_0_to_fp16_lut,   q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))
@@ -657,11 +661,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(
 
 static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) {
     x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
     for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
         int start = task_id * state->n_tiles_per_task;
         int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
         dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end);
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
 }
 
 static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(
@@ -717,11 +724,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(
 
 static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) {
     x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
     for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
         int start = task_id * state->n_tiles_per_task;
         int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
         dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end);
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
 }
 
 static void convert_f16_weight_to_fp16_tiles_task(
@@ -773,11 +783,14 @@ static void convert_f16_weight_to_fp16_tiles_task(
 
 static void convert_f16_worker_loop(unsigned int n, unsigned int i, void *data) {
     x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
     for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
         int start = task_id * state->n_tiles_per_task;
         int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
         convert_f16_weight_to_fp16_tiles_task(state, start, end);
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
 }
 
 static void quantize_f32_weight_to_fp16_tiles_task(
@@ -833,11 +846,14 @@ static void quantize_f32_weight_to_fp16_tiles_task(
 
 static void quantize_f32_worker_loop(unsigned int n, unsigned int i, void *data) {
     x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
     for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
         int start = task_id * state->n_tiles_per_task;
         int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
         quantize_f32_weight_to_fp16_tiles_task(state, start, end);
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i);
 }
 
 
@@ -868,6 +884,7 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
     state.weight_type      = weight_type;
     state.n_k_tiles        = n_k_tiles;
     state.n_k_tiles_div    = n_k_tiles_div;
+    state.traces           = ctx ? ctx->trace : NULL;
 
     if (state.n_tasks == 1 || n_threads == 1) {
         dequant_worker_fn(1, 0, &state);
@@ -985,10 +1002,13 @@ typedef struct {
     int            n_chunks_per_task;
     int            n_cols;
     int            n;  // DDR row stride (total output columns)
+    struct htp_thread_trace * traces;
 } output_transfer_task_state_t;
 
 static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void *data) {
     output_transfer_task_state_t *st = (output_transfer_task_state_t *) data;
+    struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_O_PROC, i);
 
     for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) {
         int    chunk_idx  = task_id * st->n_chunks_per_task;
@@ -998,6 +1018,7 @@ static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void
         const __fp16 *vtcm_src = st->vtcm_src + chunk_idx * st->n_cols;
         transfer_output_chunk_fp16_to_fp32(dst, vtcm_src, chunk_size, st->n_cols, st->n);
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_O_PROC, i);
 }
 
 static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst, const __fp16 *vtcm_src,
@@ -1015,6 +1036,7 @@ static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst,
     state.vtcm_src          = vtcm_src;
     state.n_cols            = n_cols;
     state.n                 = n;
+    state.traces            = ctx ? ctx->trace : NULL;
 
     if (state.n_tasks == 1 || n_threads == 1) {
         transfer_output_chunk_worker_fn(1, 0, &state);
@@ -1086,10 +1108,13 @@ typedef struct {
     int          n_chunks_per_task;
     int          k_block;
     int          k_stride;
+    struct htp_thread_trace * traces;
 } activation_transfer_task_state_t;
 
 static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i, void *data) {
     activation_transfer_task_state_t *st = (activation_transfer_task_state_t *) data;
+    struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, i);
 
     for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) {
         // one chunk: one row
@@ -1100,6 +1125,7 @@ static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i,
         const float *src = st->src + chunk_idx * st->k_stride;
         transfer_activation_chunk_fp32_to_fp16(dst, src, chunk_size, st->k_block, st->k_stride);
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, i);
 }
 
 static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *dst, const float *src, int n_rows, int k_block, int k_stride, int n_threads) {
@@ -1117,6 +1143,7 @@ static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *
     state.src               = src;
     state.k_block           = k_block;
     state.k_stride          = k_stride;
+    state.traces            = ctx ? ctx->trace : NULL;
 
     if (state.n_tasks == 1 || n_threads == 1) {
         transfer_activation_chunk_worker_fn(1, 0, &state);
@@ -1245,13 +1272,7 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float
     FARF(HIGH, "hmx-mm-2d: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu",
          m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget);
 
-    TIMER_DEFINE(activation_load);
-    TIMER_DEFINE(weight_load);
-    TIMER_DEFINE(hmx_core);
-    TIMER_DEFINE(output_store);
 
-    TIMER_DEFINE(total);
-    TIMER_START(total);
 
     int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
 
@@ -1370,7 +1391,12 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float
                 dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_scratch0, vtcm_weight, n_cols, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads);
 
                 // C: HMX Compute (Synchronous)
-                core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS);
+                {
+                    struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL;
+                    htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
+                    core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS);
+                    htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
+                }
 
                 // D: Output Store
                 float *output_chunk = dst + (mr * n + nc);
@@ -1380,18 +1406,7 @@ int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float
         HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
     }
 
-    TIMER_STOP(total);
 
-#if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "hex-mm-2d: %lld us : m %d k %d n %d", TIMER_US(total), m, k, n);
-    if (!use_pipeline) {
-        FARF(HIGH, "  activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
-             TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
-        size_t weight_size = (size_t)n * row_stride;
-        float  bandwidth   = 1e-3f * weight_size / (float)TIMER_US(weight_load);
-        FARF(HIGH, "  weight load bandwidth: %.2f GB/s", bandwidth);
-    }
-#endif
 
     return 0;
 }
@@ -1523,13 +1538,7 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
             m_chunk_n_rows, n_chunk_n_cols,
             (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
 
-    TIMER_DEFINE(activation_load);
-    TIMER_DEFINE(weight_load);
-    TIMER_DEFINE(hmx_core);
-    TIMER_DEFINE(output_store);
-    TIMER_DEFINE(total);
 
-    TIMER_START(total);
 
     const size_t fp16_row_bytes   = (size_t) params->k * sizeof(__fp16);
     const size_t weight_row_bytes = (size_t) params->weight_stride * sizeof(__fp16);
@@ -1549,7 +1558,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
                 // contiguous rows into a VTCM scratch buffer first, then HVX
                 // converts from the contiguous VTCM buffer.  This avoids L2 cache
                 // thrashing from HVX loads at large strides.
-                TIMER_START(activation_load);
                 for (int g = 0; g < group_size; ++g) {
                     const float *activation_chunk = hmx_matmul_activation_batch_ptr(params, b2_base + g, b3) + mr * params->act_stride;
                     __fp16 *vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride;
@@ -1569,7 +1577,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
                                                               params->k, params->act_stride, ctx->n_threads);
                     }
                 }
-                TIMER_STOP(activation_load);
 
                 void *buf_curr = vtcm_scratch0;
                 void *buf_next = vtcm_scratch1;
@@ -1584,7 +1591,6 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
                     const size_t n_cols = hex_smin((size_t) params->n - nc, n_chunk_n_cols);
                     const size_t n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS);
 
-                    TIMER_START(weight_load);
                     {
                         dma_queue_pop(ctx->dma[0]);
 
@@ -1601,24 +1607,22 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
                                                  0, n_cols);
                         hex_swap_ptr(&buf_curr, &buf_next);
                     }
-                    TIMER_STOP(weight_load);
 
                     // Reuse the interleaved weight for every q_head in this GQA group
                     for (int g = 0; g < group_size; ++g) {
-                        TIMER_START(hmx_core);
                         {
                             const __fp16 * vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride;
+                            struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL;
+                            htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                             core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles,
                                                 params->k / 32);
+                            htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
                         }
-                        TIMER_STOP(hmx_core);
 
-                        TIMER_START(output_store);
                         {
                             float *output = hmx_matmul_dst_batch_ptr(params, b2_base + g, b3) + mr * params->dst_stride + nc;
                             transfer_output_chunk_threaded(ctx, output, vtcm_output, (int) n_rows, (int) n_cols, params->dst_stride, ctx->n_threads);
                         }
-                        TIMER_STOP(output_store);
                     }
                 }
             }
@@ -1627,14 +1631,7 @@ int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32
 
     HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
 
-    TIMER_STOP(total);
 
-#if defined(ENABLE_PROFILE_TIMERS)
-    FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d group=%d", __func__, TIMER_US(total),
-         params->m, params->k, params->n, group_size);
-    FARF(HIGH, "  activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us",
-         TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store));
-#endif
 
     return 0;
 }
@@ -1668,6 +1665,7 @@ typedef struct {
     size_t                          nb12;
     int                             start_row;
     int                             cne1;
+    struct htp_thread_trace        *traces;
 } activation_transfer_gathered_task_state_t;
 
 typedef struct {
@@ -1684,6 +1682,7 @@ typedef struct {
     size_t                          dst_nb2;
     int                             start_row;
     int                             cne1;
+    struct htp_thread_trace        *traces;
 } output_transfer_scattered_task_state_t;
 
 static void transfer_activation_chunk_fp32_to_fp16_gathered(
@@ -1780,6 +1779,9 @@ static void transfer_activation_chunk_fp32_to_fp16_gathered(
 
 static void transfer_activation_chunk_gathered_worker_fn(unsigned int n, unsigned int i, void *data) {
     activation_transfer_gathered_task_state_t *st = data;
+    struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, i);
+
     int chunk_idx = i;
     int chunk_size = st->n_chunks_per_task;
     int start_row = st->start_row + chunk_idx * chunk_size;
@@ -1791,6 +1793,7 @@ static void transfer_activation_chunk_gathered_worker_fn(unsigned int n, unsigne
             st->matrix_rows, st->cur_a, st->mapping_stride,
             st->ne11, &st->ne11_div, st->nb11, st->nb12, st->cne1);
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, i);
 }
 
 static void transfer_activation_chunk_gathered_threaded(
@@ -1830,6 +1833,7 @@ static void transfer_activation_chunk_gathered_threaded(
         .nb12              = nb12,
         .start_row         = start_row,
         .cne1              = cne1,
+        .traces            = ctx ? ctx->trace : NULL,
     };
 
     if (actual_threads <= 1) {
@@ -1895,6 +1899,9 @@ static void transfer_output_chunk_fp16_to_fp32_scattered(
 
 static void transfer_output_chunk_scattered_worker_fn(unsigned int n, unsigned int i, void *data) {
     output_transfer_scattered_task_state_t *st = data;
+    struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL;
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_O_PROC, i);
+
     int chunk_idx = i;
     int chunk_size = st->n_chunks_per_task;
     int start_row = st->start_row + chunk_idx * chunk_size;
@@ -1906,6 +1913,7 @@ static void transfer_output_chunk_scattered_worker_fn(unsigned int n, unsigned i
             st->matrix_rows, st->cur_a, st->mapping_stride,
             st->dst_nb1, st->dst_nb2, st->cne1);
     }
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_O_PROC, i);
 }
 
 static void transfer_output_chunk_scattered_threaded(
@@ -1942,6 +1950,7 @@ static void transfer_output_chunk_scattered_threaded(
         .dst_nb2           = dst_nb2,
         .start_row         = start_row,
         .cne1              = cne1,
+        .traces            = ctx ? ctx->trace : NULL,
     };
 
     if (actual_threads <= 1) {
@@ -2053,7 +2062,12 @@ int hmx_matmul_id_2d_f32(struct htp_context *ctx,
 
             dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_scratch0, vtcm_weight, n_cols, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads);
 
-            core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS);
+            {
+                struct htp_thread_trace * tr = ctx ? &ctx->trace[HTP_MAX_NTHREADS] : NULL;
+                htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
+                core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS);
+                htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS);
+            }
 
             transfer_output_chunk_scattered_threaded(
                 ctx, dst, vtcm_output, (int) mr, (int) n_rows, (int) n_cols,
diff --git a/ggml/src/ggml-hexagon/htp/hmx-profile.h b/ggml/src/ggml-hexagon/htp/hmx-profile.h
deleted file mode 100644
index 01eece720..000000000
--- a/ggml/src/ggml-hexagon/htp/hmx-profile.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Conditional fine-grained profiling macros for HMX operations.
-//
-// Define ENABLE_PROFILE_TIMERS (via compiler flag or before including this
-// header) to instrument sub-operation latencies with HAP qtimer.  When the
-// macro is not defined the TIMER_* helpers expand to nothing so there is zero
-// overhead.
-//
-// Usage:
-//   TIMER_DEFINE(my_phase);          // declare accumulator variable
-//   TIMER_START(my_phase);           // snapshot start time
-//   ... work ...
-//   TIMER_STOP(my_phase);            // accumulate elapsed ticks
-//   FARF(ALWAYS, "my_phase: %lld us", TIMER_US(my_phase));
-
-#ifndef HMX_PROFILE_H
-#define HMX_PROFILE_H
-
-#include <HAP_perf.h>
-
-// #define ENABLE_PROFILE_TIMERS
-
-#if defined(ENABLE_PROFILE_TIMERS)
-#  define TIMER_DEFINE(name) int64_t name##_ticks = 0
-#  define TIMER_START(name)  int64_t name##_t0 = HAP_perf_get_qtimer_count()
-#  define TIMER_STOP(name)   name##_ticks += HAP_perf_get_qtimer_count() - name##_t0
-#  define TIMER_US(name)     HAP_perf_qtimer_count_to_us(name##_ticks)
-#else
-#  define TIMER_DEFINE(name)
-#  define TIMER_START(name)
-#  define TIMER_STOP(name)
-#  define TIMER_US(name)     0LL
-#endif
-
-#endif // HMX_PROFILE_H
diff --git a/ggml/src/ggml-hexagon/htp/hmx-queue.c b/ggml/src/ggml-hexagon/htp/hmx-queue.c
index 5b1d83a0c..a0007539c 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-queue.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-queue.c
@@ -44,7 +44,9 @@ static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) {
                 case HMX_QUEUE_SUSPEND: hmx_unlock(q);  break;
                 default:
                     hmx_lock(q);
+                    htp_trace_event_start(q->trace, HTP_TRACE_EVT_HMX_COMP, ir);
                     d->func(d->data);
+                    htp_trace_event_stop(q->trace, HTP_TRACE_EVT_HMX_COMP, ir);
                     break;
             }
 
diff --git a/ggml/src/ggml-hexagon/htp/hmx-queue.h b/ggml/src/ggml-hexagon/htp/hmx-queue.h
index 0d48c280f..83135cd91 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-queue.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-queue.h
@@ -11,6 +11,7 @@
 #include <HAP_farf.h>
 
 #include "hex-utils.h"
+#include "hex-profile.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -47,6 +48,7 @@ struct hmx_queue {
     void *           stack;
     uint32_t         hap_rctx;
     bool             hmx_locked;
+    struct htp_thread_trace * trace;
 };
 
 struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx);
diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h
index 0f1676f07..cbb5d0878 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -4,6 +4,7 @@
 #include "hex-dma.h"
 #include "hmx-queue.h"
 #include "htp-ops.h"
+#include "hex-profile.h"
 #include "worker-pool.h"
 
 #include <assert.h>
@@ -70,6 +71,7 @@ struct htp_context {
     bool                   hmx_enabled;
     bool                   etm;
     uint32_t               profiler;
+    struct htp_thread_trace trace[HTP_MAX_NTHREADS + 1];
 
     uint8_t *              vtcm_base;
     size_t                 vtcm_size;
diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h
index fa85bf4ca..0f4b74a93 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -146,10 +146,36 @@ struct htp_op_desc {
     uint16_t dst;                       // Output tensor index
 };
 
+#ifndef HTP_MAX_NTHREADS
+#define HTP_MAX_NTHREADS 10
+#endif
+
+#define HTP_TRACE_MAX_EVENTS 256
+
 enum htp_profiler_mode {
     HTP_PROF_DISABLED = 0,
     HTP_PROF_BASIC    = 1,
     HTP_PROF_PMU      = 2,
+    HTP_PROF_TRACE    = 3,
+};
+
+enum htp_trace_event_id {
+    HTP_TRACE_EVT_DMA                 = 0,
+
+    HTP_TRACE_EVT_HVX_COMP            = 20,
+    HTP_TRACE_EVT_HVX_A_QUANT         = 21,
+    HTP_TRACE_EVT_HVX_A_PREP          = 22,
+    HTP_TRACE_EVT_HVX_W_DEQUANT       = 23,
+    HTP_TRACE_EVT_HVX_W_PREP          = 24,
+    HTP_TRACE_EVT_HVX_O_PROC          = 25,
+
+    HTP_TRACE_EVT_HMX_COMP            = 40,
+};
+
+struct htp_trace_desc {
+    uint32_t cycles;  // lower 32-bits of cycle counter
+    uint16_t id;      // Event ID
+    uint16_t info;    // bit 15: is_stop. bits 14-0: tile/chunk index or other metadata.
 };
 
 #define HTP_PROF_PMU_NCNT 8
@@ -158,8 +184,8 @@ enum htp_profiler_mode {
 struct htp_prof_desc {
     uint32_t opcode;                 // GGML/HTP Op
     uint32_t usecs;                  // Number of usec
-    uint32_t cycles;                 // Number of cycles
-    uint32_t pad;                    // Unused
+    uint32_t cycles_start;           // Start cycle counter
+    uint32_t cycles_stop;            // Stop cycle counter
     uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
 };
 
@@ -168,7 +194,7 @@ struct htp_opbatch_req {
     uint32_t n_bufs;      // Number of buffers
     uint32_t n_tensors;   // Number of tensors
     uint32_t n_ops;       // Number of ops
-    uint32_t flags;       // unused
+    uint32_t n_traces;    // Number of trace descriptors per thread
     uint32_t pad;         // unused
     // struct htp_buf_desc  bufs[];    -- dspqueue buf 0
     // struct htp_tensor    tensors[]; -- dspqueue buf 0
@@ -181,7 +207,8 @@ struct htp_opbatch_rsp {
     uint32_t n_bufs;     // Number of buffers
     uint32_t n_tensors;  // Number of tensors
     uint32_t n_ops;      // Number of op profile descriptors
-    uint32_t pad;        // unused
+    uint32_t n_traces[HTP_MAX_NTHREADS + 1];
+    uint8_t  pad[8];     // align to 8 bytes
     // struct htp_prof_desc profs[];  -- dspqueue buf 0
 };
 
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index 3715227d2..53ab33c07 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -400,7 +400,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
     ctx->hmx_queue   = NULL;
     if (use_hmx) {
         ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
-        if (!ctx->hmx_queue) {
+        if (ctx->hmx_queue) {
+            ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS];
+        } else {
             FARF(ERROR, "hmx-queue-create failed");
             ctx->hmx_enabled = false;
         }
@@ -425,6 +427,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
     ctx->n_threads = n_hvx;
     for (int i = 0; i < ctx->n_threads; i++) {
         ctx->dma[i] = dma_queue_create(256); // queue depth
+        if (ctx->dma[i]) {
+            ctx->dma[i]->trace = &ctx->trace[i];
+        }
     }
 
     ctx->ddr_spad_size = 512 * 1024; // 512 KB
@@ -502,7 +507,8 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
 
 struct profile_data {
     uint64_t usecs;
-    uint64_t cycles;
+    uint64_t cycles_start;
+    uint64_t cycles_stop;
     uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
 };
 
@@ -512,8 +518,9 @@ static inline void profile_start(uint32_t mode, struct profile_data * d) {
             hex_get_pmu(d->pmu_counters);
             // fallthrough
         case HTP_PROF_BASIC:
+        case HTP_PROF_TRACE:
             d->usecs  = HAP_perf_get_qtimer_count();
-            d->cycles = hex_get_cycles();
+            d->cycles_start = hex_get_cycles();
             break;
         default:
             break;
@@ -530,8 +537,9 @@ static inline void profile_stop(uint32_t mode, struct profile_data * d) {
             }
             // fallthrough
         case HTP_PROF_BASIC:
+        case HTP_PROF_TRACE:
             d->usecs  = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
-            d->cycles = hex_get_cycles() - d->cycles;
+            d->cycles_stop = hex_get_cycles();
             break;
         default:
             break;
@@ -845,14 +853,15 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
         const uint32_t t_size = sizeof(struct htp_tensor)    * n_tens;
         const uint32_t o_size = sizeof(struct htp_op_desc)   * n_ops;
         const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
+        const uint32_t tr_size = (HTP_MAX_NTHREADS + 1) * req.n_traces * sizeof(struct htp_trace_desc);
 
-        if (dbuf.size < b_size + t_size + o_size + p_size) {
-            FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
+        if (dbuf.size < b_size + t_size + o_size + p_size + tr_size) {
+            FARF(ERROR, "invalid opbatch memory block size %u (req %u)", dbuf.size, b_size + t_size + o_size + p_size + tr_size);
             break;
         }
 
-        FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
-                n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
+        FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u n-traces %u : m-size %u b-size %u t-size %u o-size %u", req.id,
+                n_bufs, n_tens, n_ops, req.n_traces, dbuf.size, b_size, t_size, o_size);
 
         // Setup descriptor pointers
         uint8_t * m_ptr = dbuf.ptr;
@@ -869,6 +878,20 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
         octx->n_threads = ctx->n_threads;
         octx->ctx       = ctx;
 
+        if (ctx->profiler == HTP_PROF_TRACE) {
+            memset(ctx->trace, 0, sizeof(ctx->trace));
+            struct htp_trace_desc * trace_events = (struct htp_trace_desc *) (m_ptr + p_size);
+            for (int t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                ctx->trace[t].events = &trace_events[t * req.n_traces];
+                ctx->trace[t].max_events = req.n_traces;
+            }
+        } else {
+            for (int t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                ctx->trace[t].events = NULL;
+                ctx->trace[t].max_events = 0;
+            }
+        }
+
         for (uint32_t i=0; i < n_ops; i++) {
             struct profile_data prof;
 
@@ -886,7 +909,8 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
             if (ctx->profiler) {
                 pds[i].opcode = ops[i].opcode;
                 pds[i].usecs  = prof.usecs;
-                pds[i].cycles = prof.cycles;
+                pds[i].cycles_start = prof.cycles_start;
+                pds[i].cycles_stop = prof.cycles_stop;
                 for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
                     pds[i].pmu[j] = prof.pmu_counters[j];
                 }
@@ -899,6 +923,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
         rsp.n_bufs    = n_bufs;
         rsp.n_tensors = n_tens;
         rsp.n_ops     = n_ops;
+        memset(rsp.pad, 0, sizeof(rsp.pad));
+        if (ctx->profiler == HTP_PROF_TRACE) {
+            for (int t = 0; t <= HTP_MAX_NTHREADS; t++) {
+                rsp.n_traces[t] = ctx->trace[t].count;
+            }
+        } else {
+            memset(rsp.n_traces, 0, sizeof(rsp.n_traces));
+        }
 
         dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
 
diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 5121c6f9b..8e016c1be 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -3350,6 +3350,7 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void *
 
 static void matmul_4d(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     uint64_t t1, t2;
     t1 = HAP_perf_get_qtimer_count();
@@ -3411,10 +3412,12 @@ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) {
                 float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 
                 const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
+                htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, iir0);
                 for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
                     const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
                     mmctx->vec_dot_1x1(ne00, &dst_col[ir0], src0_row, src1_col);
                 }
+                htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, iir0);
             }
         }
     }
@@ -3430,6 +3433,7 @@ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) {
 // src1 tensor is already in VTCM spad
 static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows
     const uint32_t src1_nrows = ne11 * ne12 * ne13;  // src1 rows
@@ -3477,6 +3481,8 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
     for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
         const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
+        htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
+
         // Process src1 columns in pairs (2×2 tiling)
         uint32_t ir1 = 0;
         for (; ir1 + 1 < src1_nrows; ir1 += 2) {
@@ -3494,6 +3500,8 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
             mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_stride, src1_col);
         }
 
+        htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
+
         // Prefetch next (n + spad_nrows) row
         const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
         const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
@@ -3511,12 +3519,14 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
                        src0_stride, src0_row_size, 1);
         const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
+        htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
         #pragma unroll(2)
         for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
             const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride);
             float * restrict dst_row          = (float *) (dst->data + (ir1 * dst_row_size));
             mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
         }
+        htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
     }
 
     t2 = HAP_perf_get_qtimer_count();
@@ -3530,6 +3540,7 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
 // q8x4x2 src1 tensor is already in VTCM spad
 static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const uint32_t src0_nrows = ne01;
 
@@ -3581,7 +3592,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
         // Process src0 rows
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) {
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             mmctx->vec_dot_4x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, ss0 + 2 * src0_stride, ss0 + 3 * src0_stride, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
 
             // Prefetch next (n + spad_nrows) row
             const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -3599,7 +3612,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
             dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
                            src0_stride, src0_row_size, 2);
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             ir0 += 2;
         }
         if (ir0 < src0_end_row) {
@@ -3607,7 +3622,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
             dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
                            src0_stride, src0_row_size, 1);
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             ir0 += 1;
         }
     } else {
@@ -3627,7 +3644,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
         // Process src0 rows
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
 
             // Prefetch next (n + spad_nrows) row
             const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -3645,7 +3664,9 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
             dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
                            src0_stride, src0_row_size, 1);
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
         }
     }
 
@@ -3669,6 +3690,7 @@ struct mmid_row_mapping {
 // src1 tensor is already in VTCM spad
 static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const struct htp_tensor * restrict ids = octx->src[2];
     struct htp_spad * restrict   src2_spad = &octx->src2_spad;
@@ -3735,6 +3757,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             for (uint32_t cid = 0; cid < cne1; ++cid) {
                 struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
                 const int               rm1         = row_mapping.i1;  // expert idx
@@ -3746,6 +3769,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
 
                 mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col);
             }
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
 
             // Prefetch next (n + spad_nrows) row
             const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -3764,6 +3788,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
                            src0_row_size_padded, src0_row_size, 1);
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
 
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             for (uint32_t cid = 0; cid < cne1; ++cid) {
                 struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
                 const int               rm1         = row_mapping.i1;  // expert idx
@@ -3775,6 +3800,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
 
                 mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
             }
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
         }
     }
 
@@ -3789,6 +3815,7 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
 // src1 tensor is already in VTCM spad
 static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
     htp_matmul_preamble;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const struct htp_tensor * restrict ids = octx->src[2];
     struct htp_spad * restrict   src2_spad = &octx->src2_spad;
@@ -3847,7 +3874,9 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
         // Process src0 rows
         for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
 
             // Prefetch next (n + spad_nrows) row
             const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
@@ -3865,7 +3894,9 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
             dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
                            src0_row_size_padded, src0_row_size, 1);
             const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
+            htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
             mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
+            htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0);
         }
     }
 
@@ -4147,6 +4178,7 @@ static void quantize_row_f32_q8x4x2(float * restrict x, uint8_t * restrict y, ui
 static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) {
     struct htp_matmul_context * mmctx = data;
     struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const struct htp_tensor * src = octx->src[1];
     uint8_t * restrict dst = octx->src1_spad.data;
@@ -4163,6 +4195,7 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data)
     const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows
 
     const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
     const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row
 
     const size_t src_row_size = src->nb[1];
@@ -4189,6 +4222,7 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data)
 
     FARF(HIGH, "quantize-f32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
          ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }
 
 static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) {
@@ -4219,6 +4253,7 @@ static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y,
 static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * data) {
     struct htp_matmul_context * mmctx = data;
     struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const struct htp_tensor * src = octx->src[1];
     uint8_t * restrict dst = octx->src1_spad.data;
@@ -4235,6 +4270,7 @@ static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * dat
     const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows
 
     const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
     const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row
 
     const size_t src_row_size = src->nb[1];
@@ -4260,11 +4296,13 @@ static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * dat
 
     FARF(HIGH, "quantize-f32-q8_1x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
          ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }
 
 static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) {
     struct htp_matmul_context * mmctx = data;
     struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const struct htp_tensor * src = octx->src[1];
     uint8_t * restrict dst = octx->src1_spad.data;
@@ -4281,6 +4319,7 @@ static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) {
     const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows
 
     const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
     const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row
 
     const size_t src_row_size = ne0 * sizeof(float);
@@ -4301,11 +4340,13 @@ static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) {
 
     FARF(HIGH, "quantize-f32-f32: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
         ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }
 
 static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
     struct htp_matmul_context * mmctx = data;
     struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const struct htp_tensor * src = octx->src[1];
     uint8_t * restrict dst = octx->src1_spad.data;
@@ -4322,6 +4363,7 @@ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
     const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows
 
     const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
     const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row
 
     const size_t src_row_size = ne0 * sizeof(float);
@@ -4342,12 +4384,14 @@ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
 
     FARF(HIGH, "quantize-f32-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
         ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }
 
 // TODO just a plain copy that should be done via the DMA during the Op setup
 static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {
     struct htp_matmul_context * mmctx = data;
     struct htp_ops_context * octx = mmctx->octx;
+    struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL;
 
     const struct htp_tensor * src = octx->src[1];
     uint8_t * restrict dst = octx->src1_spad.data;
@@ -4364,6 +4408,7 @@ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {
     const uint32_t nrows = ne1 * ne2 * ne3;                             // total n_rows
 
     const uint32_t ir_first = nrows_per_thread * ith;                   // first row
+    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
     const uint32_t ir_last  = MIN(ir_first + nrows_per_thread, nrows);  // last row
 
     const size_t src_row_size = ne0 * sizeof(float);
@@ -4384,6 +4429,7 @@ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {
 
     FARF(HIGH, "quantize-f16-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
         ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first);
 }