From 28ce072f59523b0a3a1752ceab7516e6e5d9a86d Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 10 Apr 2026 15:47:43 -0700 Subject: [PATCH] hexagon: improved Op queuing, buffer and cache management (llama/21705) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * hexagon: introduce op request batching and rewrite buffer management The host now prepares batches of requests and dispatches them via a single dspqueue message. Buffers are mapped explicitly by NPU while processing batches. * hex-dma: disable l2 bypass to work around a new issue due to no flushes between Ops * hex-utils: add explicit l2flush and l2clear helpers * hex-opreq: use fine-grain per tensor l2 management * hex-opreq: avoid redundant invalidates for tensors we already flushed * hex-opreq: update debug messages * htp-opreq: reuse ops_context * hex-opreq: do not flush or invalidate cache lines beyond buffer boundary * hex-opreq: fix errors in log message * Revert "hex-opreq: do not flush or invalidate cache lines beyond buffer boundary" This reverts commit 8b7f0a55a750a6430ce4eb1874c7feb3d720056d. * hexagon: limit l2 flushes to 1MB which covers l2 cache * hex-opreq: limit cache flush to 4MB Looks like 4MB of contiguous virtual space should cover the 1MB cache. * hexagon: drop cache flush size to 2MB * hex-opreq: start reworking opreq packing * hex-opreq: introduce new way of packing opbatch where tensors are stored separately * hex-opreq: add a simple fastrpc call to force unmap all buffers * hex-l2flush: somehow 2MB does not seem robust, also cleanup step size to use line-size * hex-opreq: bump opreq batch size to 256 * hex-mm: place src1 spad at the top of vtcm for easy reuse * hex-ops: introduce internal types and disable src1 reuse for now Nothing new just formalizing the repack / qyn.quant types we've been using.
* htp-opreq: use tensor pointers instead of copies * hex-opreq: introduce more robust way for tracking vtcm/spad reuse This removes the SKIP_QUANTIZE flag that became fragile with the addition of HMX and other ops. * hex-cumsum: fix error post opreq merge * hex-opreq: move request batch handling into the session Prepping everything for using dspqueue buffers and doing that inside the session is much cleaner. * hex-mm: yet another fix for src1 reuse when we're mixing hmx/hvx * hex-bufs: introduce pinned mappings and use non-pinned ones for model buffers * hex-buf: add support for allocating shared/pinned buffer for opreqs * hex-opbatch: make opbatches configurable * hex-naming: better name for ggml_hexagon_shared_buffer * hex-naming: add session->c_name() helper * hex-opbatch: start using shm but still copy for now * hex-opbatch: use shared buffer for packing opbatch * hex-opbatch: better naming for opbatch related classes and code * hex-opbatch: reuse batched tensors with same data/dims/strides * hex-opbatch: update logging * hex-opbatch: add support for vmem limit for op batching * hex-opbatch: update htp side to properly support dynamic mmap/unmap * hex-opbatch: add OB and OQ params for run-completion script and fix the asserts in batch processing * hex-opbatch: fixed src1 handling in act ops * hex-act: fix empty src1 handling in swiglu and friends Simplify preamble macro while at it * hex-mm: minor fix vtcm and dma handling in matmul cleaning up some left-overs from merges * hex-opbatch: allocate extra 1KB for dspqueue overhead * hexagon: fix softmax for non-aligned tensors and cleanup vtcm alloc * hex-mm: properly handle hmx_disabled flag * hex-ops: update comments * hex-ops: add debug output for get/set-rows * hex-mmap: optimize un/mapping of buffers * hex-opreq: global cache flush and invalidate beyond 128KB threshold * hex-ops: add super simple opfilter regex for debugging If an Op matches the regex hex backend will reject it.
* hex-opbatch: wireup newer ops missed in merge and update main switch to detect this in future * hexagon: improved vtcm acquisition to remove inter-op overhead Fully compatible with QNN-HTP coex * hex-mm: fixed hvx fallback path * hex-mm: lower the vmem threshold a bit further to ~3GB * hexagon: update debug & error logs This also fixes an issue with newer llvm merging repack and non-repack functions. We use those pointers to distinguish between buffer types. * hexagon: move ops context into main context Just a cleanup. We don't need separate contexts at this point. * hex-opbatch: cleanup naming and headers for opbatch and related descriptors * hex-fa: it's now better to enable FA during TG to reduce graph splits * hexagon: remove GGML_HEXAGON_EXPERIMENTAL env var It's no longer useful. Please use the more flexible GGML_HEXAGON_OPFILTER to disable Ops if needed for debugging or validation. * hexagon: fixed editorconfig check * Update ggml/src/ggml-hexagon/ggml-hexagon.cpp Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Trivikram Reddy Co-authored-by: Sigbjørn Skjæret --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 1369 ++++++++---------- ggml/src/ggml-hexagon/htp/act-ops.c | 137 +- ggml/src/ggml-hexagon/htp/argsort-ops.c | 18 +- ggml/src/ggml-hexagon/htp/binary-ops.c | 46 +- ggml/src/ggml-hexagon/htp/cpy-ops.c | 10 +- ggml/src/ggml-hexagon/htp/cumsum-ops.c | 25 +- ggml/src/ggml-hexagon/htp/flash-attn-ops.c | 36 +- ggml/src/ggml-hexagon/htp/get-rows-ops.c | 74 +- ggml/src/ggml-hexagon/htp/hex-utils.h | 21 + ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c | 25 +- ggml/src/ggml-hexagon/htp/hmx-ops.h | 6 +- ggml/src/ggml-hexagon/htp/htp-ctx.h | 98 +- ggml/src/ggml-hexagon/htp/htp-msg.h | 166 --- ggml/src/ggml-hexagon/htp/htp-ops.h | 191 ++- ggml/src/ggml-hexagon/htp/htp_iface.idl | 2 + ggml/src/ggml-hexagon/htp/main.c | 1498 +++++--------------- ggml/src/ggml-hexagon/htp/matmul-ops.c | 231 ++- ggml/src/ggml-hexagon/htp/repeat-ops.c | 10 +-
ggml/src/ggml-hexagon/htp/rope-ops.c | 31 +- ggml/src/ggml-hexagon/htp/set-rows-ops.c | 90 +- ggml/src/ggml-hexagon/htp/softmax-ops.c | 252 ++-- ggml/src/ggml-hexagon/htp/ssm-conv.c | 21 +- ggml/src/ggml-hexagon/htp/sum-rows-ops.c | 12 +- ggml/src/ggml-hexagon/htp/unary-ops.c | 12 +- 24 files changed, 1786 insertions(+), 2595 deletions(-) delete mode 100644 ggml/src/ggml-hexagon/htp/htp-msg.h diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index ac5baa2a..3d68b800 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -7,10 +7,14 @@ #include #include -#include #include +#include +#include #include #include +#include +#include +#include #ifdef _WIN32 # include @@ -33,7 +37,7 @@ #include "ggml-impl.h" #include "ggml-quants.h" #include "op-desc.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp_iface.h" #include "htp-drv.h" @@ -44,12 +48,14 @@ static int opt_etm = 0; static int opt_verbose = 0; static int opt_profile = 0; static int opt_hostbuf = 1; // hostbuf ON by default -static int opt_experimental = 0; static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only // Enable all stages by default -static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE; -static int opt_opsync = 0; // synchronous ops +static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE; +static int opt_opsync = 0; // synchronous ops +static int opt_opbatch = 1024; // max number of ops in a batch +static int opt_opqueue = 16; // max number of pending batches +static std::regex* opt_opfilter = NULL; // regex of ops to not claim #define HEX_VERBOSE(...) 
\ if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) @@ -86,7 +92,7 @@ static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_t op_desc desc(op); GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(), - ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags); + ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags); } static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) { @@ -94,7 +100,7 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct op_desc desc(op); GGML_LOG_DEBUG("ggml-hex: %s supports-op %s : %s : %s : %s : %s : %s : %s\n", sess_name.c_str(), - ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no"); + ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no"); } static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op, @@ -103,25 +109,16 @@ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_t op_desc desc(op); GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(), - ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, + ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec); } // ** backend sessions +struct ggml_hexagon_opbatch; +struct ggml_hexagon_opshm; + struct ggml_hexagon_session { - ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false); - ~ggml_hexagon_session() noexcept(true); - - void allocate(int dev_id) noexcept(false); - void release() noexcept(true); - - void enqueue(struct htp_general_req &req, struct dspqueue_buffer 
*bufs, uint32_t n_bufs, bool sync = false); - void flush(); - - ggml_backend_buffer_type buffer_type = {}; - ggml_backend_buffer_type repack_buffer_type = {}; - std::string name; remote_handle64 handle; dspqueue_t queue; @@ -133,88 +130,29 @@ struct ggml_hexagon_session { bool valid_handle; bool valid_queue; bool valid_iface; - std::atomic op_pending; - uint32_t prof_usecs; - uint32_t prof_cycles; - uint32_t prof_pkts; + + std::atomic op_pending; + ggml_hexagon_opbatch *op_batch; + ggml_hexagon_opshm *op_shm; + + ggml_backend_buffer_type buffer_type = {}; + ggml_backend_buffer_type repack_buffer_type = {}; + + ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false); + ~ggml_hexagon_session() noexcept(true); + + const char* c_name() const { return name.c_str(); } + + void allocate(int dev_id) noexcept(false); + void release() noexcept(true); + + void enqueue_op(htp_op_code opcode, const ggml_tensor *op); + void flush(bool all = true); + + void flush_pending(bool all = false); + void flush_batch(); }; -void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { - // Bump pending flag (cleared in the session::flush once we get the response) - this->op_pending++; // atomic inc - - int err = dspqueue_write(this->queue, - 0, // flags - the framework will autoset this - n_bufs, // number of buffers - bufs, // buffer references - sizeof(req), // Message length - (const uint8_t *) &req, // Message - DSPQUEUE_TIMEOUT // Timeout - ); - - if (err != 0) { - GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err); - } - - if (sync) { - flush(); - } -} - -// Flush HTP response queue i.e wait for all outstanding requests to complete -void ggml_hexagon_session::flush() { - dspqueue_t q = this->queue; - - // Repeatedly read packets from the queue until it's empty. 
We don't - // necessarily get a separate callback for each packet, and new packets - // may arrive while we're processing the previous one. - - while (this->op_pending) { - struct htp_general_rsp rsp; - uint32_t rsp_size; - uint32_t flags; - - struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS]; - uint32_t n_bufs; - - // Read response packet from queue - int err = dspqueue_read(q, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(rsp), // Max message length - &rsp_size, // Message length - (uint8_t *) &rsp, // Message - DSPQUEUE_TIMEOUT); // Timeout - - if (err == AEE_EEXPIRED) { - // TODO: might need to bail out if the HTP is stuck on something - continue; - } - - if (err != 0) { - GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err); - } - - // Basic sanity checks - if (rsp_size != sizeof(rsp)) { - GGML_ABORT("ggml-hex: dspcall : bad response (size)\n"); - } - - if (rsp.status != HTP_STATUS_OK) { - GGML_LOG_ERROR("ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status)); - // TODO: handle errors - } - - // TODO: update profiling implementation, currently only works for opt_opsync mode - this->prof_usecs = rsp.prof_usecs; - this->prof_cycles = rsp.prof_cycles; - this->prof_pkts = rsp.prof_pkts; - - this->op_pending--; // atomic dec - } -} - // ** backend buffers struct ggml_backend_hexagon_buffer_type_context { @@ -227,82 +165,99 @@ struct ggml_backend_hexagon_buffer_type_context { std::string name; }; -struct ggml_backend_hexagon_buffer_context { - bool mmap_to(ggml_hexagon_session * s) { - HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n", - s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd, - (int) this->repack); +struct ggml_hexagon_shared_buffer { + ggml_hexagon_session * sess; + uint8_t * base; + size_t size; + int fd; + bool mapped; + bool 
pinned; - int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD); + void mmap(bool pinned = false) { + int err = fastrpc_mmap(sess->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD_DELAYED); if (err != 0) { - GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", - s->domain_id, this->size, this->fd, (unsigned) err); - return false; + GGML_LOG_ERROR("ggml-hex: %s buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", sess->c_name(), + sess->domain_id, this->size, this->fd, (unsigned) err); + throw std::runtime_error("ggml-hex: fastrpc_mmap failed (see log for details)"); } - return true; - } + if (pinned) { + err = htp_iface_mmap(sess->handle, this->fd, this->size, pinned); + if (err != 0) { + GGML_LOG_ERROR("ggml-hex: %s buffer pinning failed : domain_id %d size %zu fd %d error 0x%08x\n", sess->c_name(), + sess->domain_id, this->size, this->fd, (unsigned) err); + throw std::runtime_error("ggml-hex: htp_iface_mmap failed (see log for details)"); + } + } - bool mmap() { - if (this->mapped) { - return true; - } - if (!mmap_to(this->sess)) { - return false; - } this->mapped = true; - return true; + this->pinned = pinned; + HEX_VERBOSE("ggml-hex: %s mapped buffer: base %p size %zu fd %d pinned %u\n", + sess->c_name(), (void *) this->base, this->size, this->fd, pinned); } - void munmap() { - if (!this->mapped) { - return; - } + void unmap() { + if (!this->mapped) return; + + htp_iface_munmap(sess->handle, this->fd); + fastrpc_munmap(sess->domain_id, this->fd, (void *) this->base, this->size); + + HEX_VERBOSE("ggml-hex: %s unmapped buffer: base %p size %zu fd %d\n", sess->c_name(), + (void *) this->base, size, this->fd); - fastrpc_munmap(this->sess->domain_id, this->fd, this->base, this->size); this->mapped = false; + this->fd = -1; } - ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) { - size += 4 * 1024; // 
extra page for padding + void alloc(size_t size, bool pinned = false) { + if (this->base) return; - this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size); if (!this->base) { - GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size); + GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->c_name(), size); throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)"); } this->fd = rpcmem_to_fd(this->base); if (this->fd < 0) { - GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base); - rpcmem_free(this->base); - this->base = NULL; + GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->c_name(), (void *) this->base); throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)"); } + this->size = size; - HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(), - (void *) this->base, size, this->fd, (int) repack); + HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d pinned %d\n", sess->c_name(), + (void *) this->base, this->size, this->fd, (int) pinned); + + mmap(pinned); + } + + void free() { + if (!this->base) return; + + unmap(); + rpcmem_free(this->base); + + HEX_VERBOSE("ggml-hex: %s freed buffer: base %p size %zu fd %d\n", sess->c_name(), + (void *) this->base, size, this->fd); + + this->base = NULL; + } + + ggml_hexagon_shared_buffer(ggml_hexagon_session * sess, size_t size, bool pinned = false) { + size += 4 * 1024; // extra page for padding this->sess = sess; - this->size = size; + this->size = 0; + this->base = nullptr; + this->fd = -1; this->mapped = false; - this->repack = repack; + + alloc(size, pinned); } - ~ggml_backend_hexagon_buffer_context() { - munmap(); - if (this->base) { - 
rpcmem_free(this->base); - this->base = NULL; - } + ~ggml_hexagon_shared_buffer() { + free(); } - - ggml_hexagon_session * sess; // primary session - uint8_t * base; - size_t size; - int fd; - bool mapped; // mmap is done - bool repack; // repacked buffer }; static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) { @@ -310,30 +265,26 @@ static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_ } static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { - auto ctx = static_cast(buffer->context); - delete ctx; + auto sbuf = static_cast(buffer->context); + delete sbuf; } static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) { - auto ctx = static_cast(buffer->context); - return ctx->base; + auto sbuf = static_cast(buffer->context); + return sbuf->base; } static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - auto ctx = static_cast(buffer->context); - auto sess = ctx->sess; + auto sbuf = static_cast(buffer->context); + auto sess = sbuf->sess; - HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(), - tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage, - (int) ctx->repack); + HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d\n", sess->c_name(), + tensor->name, (void *) sbuf->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage); if (tensor->view_src != NULL && tensor->view_offs == 0) { - ; // nothing to do for the view - } else { - if (!ctx->mapped) { - ctx->mmap(); - } + return GGML_STATUS_SUCCESS; // nothing to do for the view } + return GGML_STATUS_SUCCESS; } @@ -1387,11 +1338,10 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, const void * data, size_t offset, size_t size) { - auto ctx = (ggml_backend_hexagon_buffer_context 
*) buffer->context; - auto sess = ctx->sess; + auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context; + auto sess = sbuf->sess; - HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, - offset, size); + HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size); switch (tensor->type) { case GGML_TYPE_Q4_0: @@ -1430,11 +1380,10 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, void * data, size_t offset, size_t size) { - auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; - auto sess = ctx->sess; + auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context; + auto sess = sbuf->sess; - HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, - offset, size); + HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size); switch (tensor->type) { case GGML_TYPE_Q4_0: @@ -1478,10 +1427,10 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t bu } static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; - auto sess = ctx->sess; - HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size); - memset(ctx->base, value, ctx->size); + auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context; + auto sess = sbuf->sess; + HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->c_name(), (void *) sbuf->base, sbuf->size); + memset(sbuf->base, value, sbuf->size); } static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { @@ -1508,10 +1457,10 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = 
static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); - return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); + ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size); + return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size); } catch (const std::exception & exc) { - GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); + GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (host): %s\n", sess->c_name(), exc.what()); return nullptr; } } @@ -1520,10 +1469,10 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = static_cast(buffer_type->context)->sess; try { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); - return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); + ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size); + return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size); } catch (const std::exception & exc) { - GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); + GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (repack): %s\n", sess->c_name(), exc.what()); return nullptr; } } @@ -1538,7 +1487,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffe } static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { - return 1 * 1024 * 1024 * 1024; // 1GB per buffer + return 1UL * 1024 * 1024 * 1024; // 1GB per buffer GGML_UNUSED(buffer_type); } @@ -1570,6 +1519,373 @@ static 
ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host, }; +// Backend session implementation + +struct ggml_hexagon_opshm { + ggml_hexagon_shared_buffer *sbuf; + + std::vector block_mask; + size_t block_size; + + uint8_t * base() const { return this->sbuf->base; } + int fd() const { return this->sbuf->fd; } + size_t n_blocks() const { return this->block_mask.size(); } + + ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) { + size_t n_bufs = HTP_OP_MAX_BUFS; + size_t n_ops = max_batch; + size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS; + + block_mask.resize(max_pending, true); + + block_size = sizeof(htp_buf_desc) * n_bufs + + sizeof(htp_tensor) * n_tensors + + sizeof(htp_op_desc) * n_ops; + + sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */); + + if (opt_verbose) { + GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n", + sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending); + } + } + + ~ggml_hexagon_opshm() { + delete sbuf; + } + + uint8_t * allocate() { + auto it = std::find(block_mask.begin(), block_mask.end(), true); + if (it == block_mask.end()) + return nullptr; + + unsigned int i = std::distance(block_mask.begin(), it); + uint8_t* addr = sbuf->base + (i * block_size); + block_mask[i] = false; + + HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr); + return addr; + } + + void release(uint8_t * addr) { + int i = (addr - sbuf->base) / block_size; + block_mask[i] = true; + HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr); + } +}; + +struct ggml_hexagon_opbatch { + const char* name; + + std::vector buffers; + std::vector tensors; + std::vector ops; + + std::unordered_map b_map; // buffer fd to index + std::unordered_map t_map; // tensor ptr to 
index + std::unordered_multimap d_map; // tensor data to index + + unsigned int n_bufs; // num buffers in the batch + unsigned int n_tens; // num tensors ... + unsigned int n_ops; // num ops ... + size_t b_vmem; // sum of all buffer sizes + + unsigned int n_bufs_max; + unsigned int n_tens_max; + unsigned int n_ops_max; + size_t b_vmem_max; + + void reset() { + n_bufs = 0; + n_tens = 0; + n_ops = 0; + b_vmem = 0; + + b_map.clear(); + t_map.clear(); + d_map.clear(); + } + + ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) { + name = sess->c_name(); + + n_bufs_max = HTP_OP_MAX_BUFS; + n_ops_max = max_batch; + n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS; + + b_vmem_max = HTP_OP_MAX_VMEM; + + buffers.resize(n_bufs_max); + tensors.resize(n_tens_max); + ops.resize(n_ops_max); + + b_map.reserve(n_bufs_max); + t_map.reserve(n_tens_max); + d_map.reserve(n_tens_max); + + reset(); + } + + bool empty() const { return n_ops == 0; } + + // add buffer and return its index + int add_buffer(ggml_hexagon_shared_buffer * sbuf) { + // Lookup by fd + auto it = b_map.find(sbuf->fd); + if (it != b_map.end()) { return it->second; } + + // Add new buffer to the batch + int bi = n_bufs++; + GGML_ASSERT(n_bufs < HTP_OP_MAX_BUFS); + + b_map.insert({sbuf->fd, bi}); + + htp_buf_desc &b = buffers[bi]; + b.base = (uint64_t) sbuf->base; + b.fd = sbuf->fd; + b.size = sbuf->size; + + b_vmem += b.size; + + HEX_VERBOSE("ggml-hex: add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem); + + return bi; + } + + bool same_shape(const htp_tensor * h, const ggml_tensor * t) const { + return (h->ne[0] == t->ne[0]) && (h->ne[1] == t->ne[1]) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) && + (h->nb[0] == t->nb[0]) && (h->nb[1] == t->nb[1]) && (h->nb[2] == t->nb[2]) && (h->nb[3] == t->nb[3]); + } + + // add tensor and return its index + int add_tensor(const ggml_tensor * t) { + auto sbuf = static_cast(t->buffer->context); + 
+ // First lookup by tensor data + auto range = d_map.equal_range(t->data); + for (auto it = range.first; it != range.second; ++it) { + htp_tensor * h = &tensors[it->second]; + if (same_shape(h, t)) { return it->second; } + } + + // Lookup by tensor ptr + auto it = t_map.find(t); + if (it != t_map.end()) { return it->second; } + + // Add new tensor to the batch + int ti = n_tens++; + GGML_ASSERT(n_tens <= n_tens_max); + + t_map.insert({t, ti}); + d_map.insert({t->data, ti}); + + uint64_t t_offset = (uint8_t *) t->data - sbuf->base; + size_t t_size = ggml_nbytes(t); + + htp_tensor &h = tensors[ti]; + h.bi = add_buffer(sbuf); + h.data = t_offset; + h.size = t_size; + h.type = t->type; + h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3]; + h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3]; + + h.flags = 0; + if (ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { + h.flags |= HTP_TENSOR_COMPUTE; + } + + HEX_VERBOSE("ggml-hex: add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n", + ti, t->name, h.bi, (void*) t->data, (size_t) t_offset, t_size, h.flags, + (size_t) t->ne[0], (size_t) t->ne[1], (size_t) t->ne[2], (size_t) t->ne[3]); + + return ti; + } + + bool fit_op(const struct ggml_tensor *t) const { + if (n_ops >= n_ops_max ) return false; + + // check how much extras we will need + size_t extra_bufs = 0; + size_t extra_vmem = 0; + size_t extra_tens = 0; + + auto fit_tensor = [&](const ggml_tensor *t) { + if (!t_map.count(t)) { + extra_tens++; + + auto sbuf = static_cast(t->buffer->context); + if (!b_map.count(sbuf->fd)) { + extra_vmem += sbuf->size; + extra_bufs += 1; + } + } + }; + + for (unsigned int i=0; i < HTP_OP_MAX_INPUTS && t->src[i]; i++) { + fit_tensor(t->src[i]); + } + fit_tensor(t); + + if ((extra_bufs + n_bufs) > n_bufs_max) return false; + if ((extra_tens + n_tens) > n_tens_max) return false; + if ((extra_vmem + b_vmem) > b_vmem_max) 
return false;
+
+        return true;
+    }
+
+    // assumes that fit_op() was called first and returned true
+    void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
+        // Add new op (capacity is checked before the slot is taken)
+        GGML_ASSERT(n_ops < n_ops_max);
+        htp_op_desc &o = ops[n_ops++];
+
+        memcpy(&o.params, &t->op_params, sizeof(t->op_params));
+        o.opcode = opcode;
+        o.flags = 0;
+
+        if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
+            o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
+        }
+
+        ggml_hexagon_dump_op_exec(name, t, o.flags);
+
+        for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
+            o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff; // 0xffff is stored for an absent input
+        }
+        o.dst = add_tensor(t);
+    }
+
+    size_t flush(uint8_t * mem_addr, size_t mem_size) { // packs the batch into mem_addr, resets it, returns bytes written
+        static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
+        static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
+        static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
+
+        const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
+        const size_t t_size = sizeof(htp_tensor) * n_tens;
+        const size_t o_size = sizeof(htp_op_desc) * n_ops;
+
+        const size_t m_size = b_size + t_size + o_size;
+        GGML_ASSERT(m_size <= mem_size);
+
+        uint8_t * b_ptr = (uint8_t *) mem_addr; // packed layout: buffer descs, then tensors, then op descs
+        uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
+        uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;
+
+        memcpy(b_ptr, (void *) buffers.data(), b_size);
+        memcpy(t_ptr, (void *) tensors.data(), t_size);
+        memcpy(o_ptr, (void *) ops.data(), o_size);
+
+        HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
+            name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);
+
+        if (opt_verbose > 1) {
+            htp_buf_desc *b = (htp_buf_desc*) b_ptr;
+            for (unsigned int i=0; i < n_bufs; i++) {
+                GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
+                    b[i].fd, (void *) b[i].base, (size_t) b[i].size);
+            }
+            htp_tensor *t = (htp_tensor*) t_ptr;
+            for (unsigned int i=0; i < n_tens; i++) {
+                GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
+                    name, i, t[i].bi, t[i].data, t[i].size,
+                    (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
+            }
+        }
+
+        reset();
+
+        return m_size;
+    }
+};
+
+// Drain the HTP response queue: wait for all outstanding opbatches, or for just one when all == false
+void ggml_hexagon_session::flush_pending(bool all) {
+    while (this->op_pending) {
+        struct htp_opbatch_rsp rsp;
+        uint32_t rsp_size;
+        uint32_t flags;
+
+        struct dspqueue_buffer dbuf;
+        uint32_t n_dbufs;
+
+        // Read response packet from queue
+        int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, DSPQUEUE_TIMEOUT);
+        if (err == AEE_EEXPIRED) {
+            continue;
+        }
+
+        if (err != 0) {
+            GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
+        }
+
+        // Basic sanity checks
+        if (rsp_size != sizeof(rsp) || n_dbufs != 1) {
+            GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
+        }
+
+        op_shm->release((uint8_t*) dbuf.ptr); // hand the opbatch block named in the response back to op_shm
+
+        if (rsp.status != HTP_STATUS_OK) {
+            GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
+            // TODO: handle errors
+        }
+
+        // FIXME: profile will be per opreq
+        // this->prof_usecs = rsp.prof_usecs;
+        // this->prof_cycles = rsp.prof_cycles;
+        // this->prof_pkts = rsp.prof_pkts;
+
+        this->op_pending--; // atomic dec
+
+        if (!all) break;
+    }
+}
+
+void ggml_hexagon_session::flush_batch() { // packs the current opbatch into a shm block and queues it to the DSP
+    if (op_batch->empty()) { return; }
+
+    htp_opbatch_req req = {}; // zero-init: the whole struct is copied into the queue message
+    req.n_bufs = op_batch->n_bufs;
+    req.n_tensors = op_batch->n_tens;
+    req.n_ops = op_batch->n_ops;
+
+    dspqueue_buffer dbuf = {};
+    dbuf.fd = op_shm->fd();
+    dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
+    dbuf.ptr = op_shm->allocate();
+    if (!dbuf.ptr) {
+        flush_pending(false); // wait for one response; its block is released back to op_shm
+        if (!(dbuf.ptr = op_shm->allocate())) { GGML_ABORT("ggml-hex: %s no free opbatch shm block\n", this->c_name()); }
+    }
+    dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
+    dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
+
+    // Bump pending count (decremented in session::flush_pending once we get the response)
+    this->op_pending++; // atomic inc
+
+    HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
+
+    int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
+    if (err != 0) {
+        GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
+    }
+}
+
+void ggml_hexagon_session::enqueue_op(htp_op_code opcode, const ggml_tensor *op) {
+    if (!op_batch->fit_op(op)) {
+        flush_batch(); // batch cannot take this op: submit it and start a new one
+    }
+    op_batch->add_op(opcode, op);
+}
+
+// Submit the current (possibly partial) opbatch, then wait for outstanding responses (see flush_pending)
+void ggml_hexagon_session::flush(bool all) {
+    flush_batch();
+    flush_pending(all);
+}
+
 void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
     this->valid_session = false;
     this->valid_handle = false;
@@ -1582,9 +1898,6 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
     this->name = std::string("HTP") + std::to_string(dev_id);

     this->op_pending = 0;
-    this->prof_usecs = 0;
-    this->prof_cycles = 0;
-    this->prof_pkts = 0;

     GGML_LOG_INFO("ggml-hex: allocating new session: %s\n", this->name.c_str());

@@ -1676,11 +1989,14 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
         }
     }

+    const size_t req_q_size = (sizeof(htp_opbatch_req) * opt_opqueue * 2) + 1024;
+    const size_t rsp_q_size = (sizeof(htp_opbatch_rsp) * opt_opqueue * 2) + 1024;
+
     // Now let's setup the DSP queue
     err = dspqueue_create(this->domain_id,
                           0, // Flags
-                          128 * 1024, // Request queue size (in bytes)
-                          64 * 1024, // Response queue size (in bytes)
+                          req_q_size, // Request queue size (in bytes)
+                          rsp_q_size, // Response queue size (in bytes)
                           nullptr, // Read packet callback (we handle reads explicitly)
                           nullptr, // Error callback (we handle
errors during reads) (void *) this, // Callback context @@ -1715,6 +2031,10 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { throw std::runtime_error("ggml-hex: iface start failed (see log for details)"); } this->valid_iface = true; + + // Allocate buffers and state for op batching + this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch); + this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue); } void ggml_hexagon_session::release() noexcept(true) { @@ -1722,6 +2042,9 @@ void ggml_hexagon_session::release() noexcept(true) { int err; + delete this->op_batch; + delete this->op_shm; + // Stop the DSP-side service and close the queue if (this->valid_iface) { err = htp_iface_stop(this->handle); @@ -1753,6 +2076,9 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n buffer_type.device = dev; repack_buffer_type.device = dev; + op_batch = nullptr; + op_shm = nullptr; + try { allocate(dev_id); @@ -1815,9 +2141,13 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess return false; } - return opt_experimental; -} + if (dst->ne[2] != 1 || dst->ne[3] != 1) { + // FA during prompt still needs work + return false; + } + return true; +} static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; @@ -2082,6 +2412,23 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s } } + // Reject non-HVX-aligned sizes when ne[0] > HVX_F32_LANES + // The HVX softmax implementation has issues with tail handling for larger non-aligned sizes + // Small sizes (ne[0] <= 32) work correctly with tail-only processing + const int64_t ne0 = src0->ne[0]; + if (ne0 > 32 && (ne0 & (32 - 1)) != 0) { + return false; + } + + // HVX vector size constraints for softmax + #define SOFTMAX_MAX_ROW_SIZE 131072 // 128K elements max for numerical precision + + // Reject very large 
row sizes to avoid numerical precision issues + // Softmax accumulation over many elements can lead to precision loss + if (ne0 > SOFTMAX_MAX_ROW_SIZE) { + return false; + } + return true; } @@ -2249,388 +2596,9 @@ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * se return true; } -enum dspqbuf_type { - DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0, - DSPQBUF_TYPE_CPU_WRITE_DSP_READ, - DSPQBUF_TYPE_CONSTANT, -}; - -static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) { - if (opt_verbose < 2) return; - - auto buf = static_cast(t->buffer->context); - auto sess = buf->sess; - - GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(), - t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset, - (unsigned int) d->size); -} - -// Init hexagon tensor from GGML tensor and Hexagon buffer -static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) { - h->data = 0; // updated by the receiver - h->type = t->type; - h->ne[0] = t->ne[0]; - h->ne[1] = t->ne[1]; - h->ne[2] = t->ne[2]; - h->ne[3] = t->ne[3]; - h->nb[0] = t->nb[0]; - h->nb[1] = t->nb[1]; - h->nb[2] = t->nb[2]; - h->nb[3] = t->nb[3]; -} - -static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_tensor * t, dspqbuf_type type) { - if (!t) { - return 0; - } - - auto buf = static_cast(t->buffer->context); - - memset(d, 0, sizeof(*d)); - d->fd = buf->fd; - d->ptr = t->data; - d->offset = (uint8_t *) t->data - buf->base; - d->size = ggml_nbytes(t); - - if (!d->size) { - // Some requests contain srcs where ggml_nbytes() returns 0 but the rest of the op is non-empty - d->size = 64; - } - - switch (type) { - case DSPQBUF_TYPE_DSP_WRITE_CPU_READ: - // Flush CPU - d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER; - break; - case DSPQBUF_TYPE_CPU_WRITE_DSP_READ: - // Flush CPU, Invalidate DSP - d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | 
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; - break; - default: - // Constant buffer, no cache maintenance - d->flags = 0; - break; - } - - htp_req_tensor_init(h, t); - - dspqbuf_dump(d, t, type); - - return 1; -} - -typedef size_t (*htp_req_init_func_t)(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * op); - -template -static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) { - uint64_t t = ggml_time_us(); - - // Construct HTP request - htp_general_req req; - memset(&req, 0, sizeof(req)); - - req.flags = flags; - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - ggml_hexagon_dump_op_exec(sess->name, op, req.flags); - - if ((opt_opmask & HTP_OPMASK_QUEUE)) { - dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS]; - size_t n_bufs = _init_req_func(&req, bufs, op); - sess->enqueue(req, bufs, n_bufs, opt_opsync); - } - - t = ggml_time_us() - t; - - ggml_hexagon_dump_op_prof(sess->name, op, sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, t); -} - -template -static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - switch (t->op) { - case GGML_OP_MUL_MAT: - req->op = HTP_OP_MUL_MAT; - break; - case GGML_OP_MUL: - req->op = HTP_OP_MUL; - break; - case GGML_OP_ADD: - req->op = HTP_OP_ADD; - break; - case GGML_OP_SUB: - req->op = HTP_OP_SUB; - break; - case GGML_OP_DIV: - req->op = HTP_OP_DIV; - break; - default: - GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op); - break; - } - - // src0: Weights (mulmat) or First Operand (binary op). - // If constant (e.g. weights), no cache management is needed. - // src1: Input Activations (mulmat) or Second Operand (binary op). - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? 
DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - req->op = HTP_OP_CPY; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_cont_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - // CONT is just a contiguous copy — reuse CPY op - req->op = HTP_OP_CPY; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_repeat_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - req->op = HTP_OP_REPEAT; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_cumsum_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - req->op = HTP_OP_CUMSUM; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - req->op = HTP_OP_GET_ROWS; - - size_t n_bufs = 0; - 
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_argsort_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - req->op = HTP_OP_ARGSORT; - memcpy(&req->op_params, &t->op_params, sizeof(t->op_params)); - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -template -static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - switch (t->op) { - case GGML_OP_MUL_MAT_ID: - req->op = HTP_OP_MUL_MAT_ID; - break; - case GGML_OP_ADD_ID: - req->op = HTP_OP_ADD_ID; - break; - default: - GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op); - } - - // src0: Weights (mulmat) or Input Activations (other op). - // If constant, no cache management is needed. - // src1: Input Activations (mulmat) or Second Operand (binary op). - // src2: Expert IDs (mulmat) or Activated Experts (other op). - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? 
DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_set_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - req->op = HTP_OP_SET_ROWS; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - memcpy(&req->op_params, &t->op_params, sizeof(t->op_params)); - - bool supported = false; - - switch (t->op) { - case GGML_OP_RMS_NORM: - req->op = HTP_OP_RMS_NORM; - supported = true; - break; - - case GGML_OP_SCALE: - req->op = HTP_OP_SCALE; - supported = true; - break; - - case GGML_OP_SQR: - req->op = HTP_OP_SQR; - supported = true; - break; - - case GGML_OP_SQRT: - req->op = HTP_OP_SQRT; - supported = true; - break; - - case GGML_OP_UNARY: - switch (ggml_get_unary_op(t)) { - case GGML_UNARY_OP_SILU: - req->op = HTP_OP_UNARY_SILU; - supported = true; - break; - case GGML_UNARY_OP_GELU: - req->op = HTP_OP_UNARY_GELU; - supported = true; - break; - case GGML_UNARY_OP_SIGMOID: - req->op = HTP_OP_UNARY_SIGMOID; - supported = true; - break; - case GGML_UNARY_OP_NEG: - req->op = HTP_OP_UNARY_NEG; - supported = true; - break; - case GGML_UNARY_OP_EXP: - req->op = HTP_OP_UNARY_EXP; - supported = true; - break; - case GGML_UNARY_OP_SOFTPLUS: - req->op = HTP_OP_UNARY_SOFTPLUS; - supported = true; - break; - 
default: - break; - } - break; - - case GGML_OP_GLU: - if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU) { - req->op = HTP_OP_GLU_SWIGLU; - supported = true; - } else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) { - req->op = HTP_OP_GLU_SWIGLU_OAI; - supported = true; - } else if (ggml_get_glu_op(t) == GGML_GLU_OP_GEGLU) { - req->op = HTP_OP_GLU_GEGLU; - supported = true; - } - break; - - case GGML_OP_SOFT_MAX: - req->op = HTP_OP_SOFTMAX; - supported = true; - break; - - default: - break; - } - - if (!supported) { - GGML_ABORT("ggml-hex: unary : unsupported op: %d\n", t->op); - } - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_sum_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - memcpy(&req->op_params, &t->op_params, sizeof(t->op_params)); - req->op = HTP_OP_SUM_ROWS; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - memcpy(&req->op_params, &t->op_params, sizeof(t->op_params)); - req->op = HTP_OP_ROPE; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - 
- return n_bufs; -} - -static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - memcpy(&req->op_params, &t->op_params, sizeof(t->op_params)); - req->op = HTP_OP_FLASH_ATTN_EXT; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src3, &bufs[n_bufs], t->src[3], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src4, &bufs[n_bufs], t->src[4], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - -static inline size_t init_ssm_conv_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) { - req->op = HTP_OP_SSM_CONV; - - size_t n_bufs = 0; - n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ); - n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CONSTANT); - n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ); - - return n_bufs; -} - static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { auto sess = static_cast(backend->context); - return sess->name.c_str(); + return sess->c_name(); } static void ggml_backend_hexagon_free(ggml_backend_t backend) { @@ -2639,181 +2607,74 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { delete backend; } -// Map weight type to its activation quantization family. -// Types in the same family produce identical Q8 formats in VTCM and can -// safely share quantized activation data via SKIP_QUANTIZE. -// When adding a new quantized type, assign it the correct family here. 
-static inline int act_quant_family(enum ggml_type wtype) {
-    switch (wtype) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_MXFP4:
-            return 1; // Q8x4x2
+static htp_op_code op_remap_to_htp(const ggml_tensor * t) { // maps a ggml op (and its unary/GLU sub-op) to an HTP opcode; aborts when no mapping exists
+    switch (t->op) {
+        case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
+        case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
+        case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
+        case GGML_OP_MUL: return HTP_OP_MUL;
+        case GGML_OP_ADD: return HTP_OP_ADD;
+        case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
+        case GGML_OP_SUB: return HTP_OP_SUB;
+        case GGML_OP_DIV: return HTP_OP_DIV;
+        case GGML_OP_CPY: return HTP_OP_CPY;
+        case GGML_OP_CONT: return HTP_OP_CPY; // CONT is a contiguous copy, so it reuses the CPY opcode
+        case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
+        case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
+        case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
+        case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
+        case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
+        case GGML_OP_SCALE: return HTP_OP_SCALE;
+        case GGML_OP_SQR: return HTP_OP_SQR;
+        case GGML_OP_SQRT: return HTP_OP_SQRT;
+        case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
+        case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
+        case GGML_OP_ROPE: return HTP_OP_ROPE;
+        case GGML_OP_REPEAT: return HTP_OP_REPEAT;
+        case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
+
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(t)) {
+                case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
+                case GGML_UNARY_OP_GELU: return HTP_OP_UNARY_GELU;
+                case GGML_UNARY_OP_SIGMOID: return HTP_OP_UNARY_SIGMOID;
+                case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
+                case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
+                case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
+                default: // unsupported unary sub-op: abort like an unsupported top-level op instead of returning HTP_OP_INVALID
+                    GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t));
+            }
+            break;
+
+        case GGML_OP_GLU:
+            switch (ggml_get_glu_op(t)) {
+                case GGML_GLU_OP_SWIGLU: return HTP_OP_GLU_SWIGLU;
+                case GGML_GLU_OP_SWIGLU_OAI: return HTP_OP_GLU_SWIGLU_OAI;
+                case GGML_GLU_OP_GEGLU: return HTP_OP_GLU_GEGLU;
+                default: GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t));
+            }
+            break;
+
         default:
-            return 0; // unknown / not quantized
+            GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t));
     }
+    return HTP_OP_INVALID; // not reached if GGML_ABORT does not return; keeps every path returning a value
 }

-static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
-    return (op0 && op0->src[1] == op1->src[1] &&
-            act_quant_family(op0->src[0]->type) == act_quant_family(op1->src[0]->type) &&
-            act_quant_family(op0->src[0]->type) != 0);
-}
-
-static inline bool is_compute_op(ggml_tensor *node)
+static inline bool op_is_compute(ggml_tensor *node)
 {
     return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
 }

-// scan the graph and figure out last compute op index
-static inline int last_compute_op(ggml_cgraph * graph) {
-    int last = 0;
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        if (is_compute_op(graph->nodes[i])) {
-            last = i;
-        }
-    }
-
-    return last;
-}
-
 static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
     auto sess = static_cast(backend->context);

-    HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);
-
-    const int last = last_compute_op(graph);
-
-    const struct ggml_tensor * prev_op = nullptr; // prev executed op
+    HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);

     for (int i = 0; i < graph->n_nodes; ++i) {
-        ggml_tensor * node = graph->nodes[i];
-
-        if (!is_compute_op(node)) {
-            continue;
-        }
-
-        uint32_t flags = 0;
-
-        // skip quantizer if src1 is reused
-        if (op_reuse_src1(node, prev_op)) {
-            flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-        }
-
-        prev_op = node;
-
-        // ask for early notification for the last Op
-        if (i == last) {
-            flags |= HTP_OPFLAGS_EARLY_WAKEUP;
-        }
-
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                if (ggml_is_quantized(node->src[0]->type)) {
-                    ggml_hexagon_dispatch_op>(sess, node, flags);
-                } else {
-                    ggml_hexagon_dispatch_op>(sess, node, flags);
-                }
-                break;
-            case GGML_OP_MUL_MAT_ID:
-                if
(ggml_is_quantized(node->src[0]->type)) { - ggml_hexagon_dispatch_op>(sess, node, flags); - } else { - ggml_hexagon_dispatch_op>(sess, node, flags); - } - break; - case GGML_OP_MUL: - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_DIV: - ggml_hexagon_dispatch_op>(sess, node, flags); - break; - case GGML_OP_ADD_ID: - ggml_hexagon_dispatch_op>(sess, node, flags); - break; - case GGML_OP_RMS_NORM: - case GGML_OP_SCALE: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - case GGML_OP_SQR: - case GGML_OP_SQRT: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - case GGML_OP_SUM_ROWS: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(node)) { - case GGML_UNARY_OP_NEG: - case GGML_UNARY_OP_EXP: - case GGML_UNARY_OP_SIGMOID: - case GGML_UNARY_OP_SOFTPLUS: - case GGML_UNARY_OP_SILU: - case GGML_UNARY_OP_GELU: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - default: - break; - } - break; - case GGML_OP_GLU: - switch (ggml_get_glu_op(node)) { - case GGML_GLU_OP_SWIGLU: - case GGML_GLU_OP_SWIGLU_OAI: - case GGML_GLU_OP_GEGLU: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - default: - break; - } - break; - case GGML_OP_SOFT_MAX: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_ROPE: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_FLASH_ATTN_EXT: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_SET_ROWS: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_GET_ROWS: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_CPY: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_CONT: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_REPEAT: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_ARGSORT: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - case GGML_OP_SSM_CONV: - ggml_hexagon_dispatch_op(sess, 
node, flags); - break; - - case GGML_OP_CUMSUM: - ggml_hexagon_dispatch_op(sess, node, flags); - break; - - default: - GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node)); + ggml_tensor * n = graph->nodes[i]; + if (op_is_compute(n)) { + sess->enqueue_op(op_remap_to_htp(n), n); } } @@ -2826,7 +2687,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) { auto sess = static_cast(backend->context); - HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str()); + HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->c_name()); // Wait until all pending ops complete sess->flush(); @@ -3045,7 +2906,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) { auto sess = static_cast(dev->context); - return sess->name.c_str(); + return sess->c_name(); GGML_UNUSED(dev); } @@ -3056,8 +2917,7 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev } static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - // ~2GB per session for now - *free = 2ULL * 1024 * 1024 * 1024; + *free = 0; *total = *free; GGML_UNUSED(dev); @@ -3172,6 +3032,11 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { auto sess = static_cast(dev->context); + // reject ops that match the filter + if (opt_opfilter && std::regex_match(ggml_op_desc(op), *opt_opfilter)) { + return false; + } + // all srcs & dsts must be mapped to the same session if (!ggml_hexagon_supported_buffers(sess, op)) { ggml_hexagon_dump_op_supp(sess->name, op, false); @@ -3188,6 +3053,13 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = true; break; 
+ case GGML_OP_MUL: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_DIV: + supp = ggml_hexagon_supported_binary(sess, op); + break; + case GGML_OP_MUL_MAT: supp = ggml_hexagon_supported_mul_mat(sess, op); break; @@ -3196,13 +3068,6 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_mul_mat_id(sess, op); break; - case GGML_OP_MUL: - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_DIV: - supp = ggml_hexagon_supported_binary(sess, op); - break; - case GGML_OP_ADD_ID: supp = ggml_hexagon_supported_add_id(sess, op); break; @@ -3241,6 +3106,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons break; } break; + case GGML_OP_GLU: switch (ggml_get_glu_op(op)) { case GGML_GLU_OP_SWIGLU: @@ -3252,6 +3118,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons break; } break; + case GGML_OP_ROPE: supp = ggml_hexagon_supported_rope(sess, op); break; @@ -3438,11 +3305,13 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL, "please update hexagon_type to match ggml_type"); - const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL"); const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE"); const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF"); const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC"); + const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH"); + const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE"); + const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER"); const char * str_profile = getenv("GGML_HEXAGON_PROFILE"); const char * str_etm = getenv("GGML_HEXAGON_ETM"); const char * str_nhvx = getenv("GGML_HEXAGON_NHVX"); @@ -3450,16 +3319,21 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { const char * str_ndev = getenv("GGML_HEXAGON_NDEV"); const char * 
str_arch = getenv("GGML_HEXAGON_ARCH"); - opt_experimental = str_experimental ? atoi(str_experimental) : 0; + auto RE_ICASE = std::regex_constants::icase; + + opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL; opt_verbose = str_verbose ? atoi(str_verbose) : 0; opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; - opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask; - opt_opsync = str_opsync ? atoi(str_opsync) : 0; + opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask; + opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync; + opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch; + opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue; opt_profile = str_profile ? atoi(str_profile) : 0; opt_etm = str_etm ? atoi(str_etm) : 0; opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx; opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; + opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { opt_ndev = GGML_HEXAGON_MAX_SESSIONS; @@ -3472,12 +3346,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_arch = strtoul(str_arch, NULL, 0); } - opt_hostbuf = str_hostbuf ? 
atoi(str_hostbuf) : 1; - reg->context = new ggml_hexagon_registry(reg); - - HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), - sizeof(struct htp_general_rsp)); } static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = { diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index d8b92498..6416d2df 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -14,59 +14,42 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" -#define htp_act_preamble3 \ - const uint32_t ne00 = src0->ne[0]; \ - const uint32_t ne01 = src0->ne[1]; \ - const uint32_t ne02 = src0->ne[2]; \ - const uint32_t ne03 = src0->ne[3]; \ - \ - const uint32_t ne10 = src1->ne[0]; \ - const uint32_t ne11 = src1->ne[1]; \ - const uint32_t ne12 = src1->ne[2]; \ - const uint32_t ne13 = src1->ne[3]; \ - \ - const uint32_t ne0 = dst->ne[0]; \ - const uint32_t ne1 = dst->ne[1]; \ - const uint32_t ne2 = dst->ne[2]; \ - const uint32_t ne3 = dst->ne[3]; \ - \ - const uint32_t nb00 = src0->nb[0]; \ - const uint32_t nb01 = src0->nb[1]; \ - const uint32_t nb02 = src0->nb[2]; \ - const uint32_t nb03 = src0->nb[3]; \ - \ - const uint32_t nb10 = src1->nb[0]; \ - const uint32_t nb11 = src1->nb[1]; \ - const uint32_t nb12 = src1->nb[2]; \ - const uint32_t nb13 = src1->nb[3]; \ - \ - const uint32_t nb0 = dst->nb[0]; \ - const uint32_t nb1 = dst->nb[1]; \ - const uint32_t nb2 = dst->nb[2]; \ - const uint32_t nb3 = dst->nb[3]; - -#define htp_act_preamble2 \ - const uint32_t ne00 = src0->ne[0]; \ - const uint32_t ne01 = src0->ne[1]; \ - const uint32_t ne02 = src0->ne[2]; \ - const uint32_t ne03 = src0->ne[3]; \ - \ - const uint32_t ne0 = dst->ne[0]; \ - const uint32_t ne1 = dst->ne[1]; \ - const uint32_t ne2 = dst->ne[2]; \ - const uint32_t ne3 = dst->ne[3]; \ - \ - const uint32_t nb00 = src0->nb[0]; \ 
- const uint32_t nb01 = src0->nb[1]; \ - const uint32_t nb02 = src0->nb[2]; \ - const uint32_t nb03 = src0->nb[3]; \ - \ - const uint32_t nb0 = dst->nb[0]; \ - const uint32_t nb1 = dst->nb[1]; \ - const uint32_t nb2 = dst->nb[2]; \ +#define htp_act_preamble \ + const struct htp_tensor * src0 = actx->octx->src[0]; \ + const struct htp_tensor * src1 = actx->octx->src[1]; \ + const struct htp_tensor * dst = actx->octx->dst; \ + \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t ne10 = src1 ? src1->ne[0] : 0; \ + const uint32_t ne11 = src1 ? src1->ne[1] : 0; \ + const uint32_t ne12 = src1 ? src1->ne[2] : 0; \ + const uint32_t ne13 = src1 ? src1->ne[3] : 0; \ + \ + const uint32_t nb10 = src1 ? src1->nb[0] : 0; \ + const uint32_t nb11 = src1 ? src1->nb[1] : 0; \ + const uint32_t nb12 = src1 ? src1->nb[2] : 0; \ + const uint32_t nb13 = src1 ? 
src1->nb[3] : 0; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; struct htp_act_context { @@ -97,10 +80,7 @@ struct htp_act_context { static void glu_swiglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { struct htp_act_context * actx = (struct htp_act_context *) data; - const struct htp_tensor * src0 = &actx->octx->src0; - const struct htp_tensor * src1 = &actx->octx->src1; - const struct htp_tensor * dst = &actx->octx->dst; - htp_act_preamble3; + htp_act_preamble; size_t src0_row_size = actx->src0_row_size; size_t src1_row_size = actx->src1_row_size; @@ -207,10 +187,7 @@ static void glu_swiglu_f32_per_thread(unsigned int nth, unsigned int ith, void * static void glu_swiglu_oai_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { struct htp_act_context * actx = (struct htp_act_context *) data; - const struct htp_tensor * src0 = &actx->octx->src0; - const struct htp_tensor * src1 = &actx->octx->src1; - const struct htp_tensor * dst = &actx->octx->dst; - htp_act_preamble3; + htp_act_preamble; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); @@ -332,9 +309,7 @@ static void glu_swiglu_oai_f32_per_thread(unsigned int nth, unsigned int ith, vo static void unary_gelu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { struct htp_act_context * actx = (struct htp_act_context *) data; - const struct htp_tensor * src0 = &actx->octx->src0; - const struct htp_tensor * dst = &actx->octx->dst; - htp_act_preamble2; + htp_act_preamble; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); @@ -433,9 +408,7 @@ static void unary_gelu_f32_per_thread(unsigned int nth, unsigned int ith, void * static void unary_silu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { struct 
htp_act_context * actx = (struct htp_act_context *) data; - const struct htp_tensor * src0 = &actx->octx->src0; - const struct htp_tensor * dst = &actx->octx->dst; - htp_act_preamble2; + htp_act_preamble; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); @@ -533,10 +506,7 @@ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; static void glu_geglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { struct htp_act_context * actx = (struct htp_act_context *) data; - const struct htp_tensor * src0 = &actx->octx->src0; - const struct htp_tensor * src1 = &actx->octx->src1; - const struct htp_tensor * dst = &actx->octx->dst; - htp_act_preamble3; + htp_act_preamble; size_t src0_row_size = actx->src0_row_size; size_t src1_row_size = actx->src1_row_size; @@ -652,9 +622,9 @@ static void glu_geglu_f32_per_thread(unsigned int nth, unsigned int ith, void * } static int execute_op_activations_f32(struct htp_ops_context * octx) { - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * dst = octx->dst; if (((src0->ne[0] * SIZEOF_FP32) != src0->nb[1]) || ((dst->ne[0] * SIZEOF_FP32) != dst->nb[1])) { FARF(ERROR, "Non-contiguous tensors are not supported at this time \n"); @@ -697,25 +667,20 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) { const uint32_t n_threads = MIN(octx->n_threads, src0_nrows); size_t src0_row_size = src0->nb[1]; - size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used + size_t src1_row_size = src1 ? 
src1->nb[1] : src0->nb[1]; size_t dst_row_size = dst->nb[1]; - const bool src1_valid = src1->ne[0]; - if (!src1_valid) { - src1_row_size = src0_row_size; - } - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size - size_t spad_size_per_row = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned; size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row); // Make sure the reserved vtcm size is sufficient - if(vtcm_row_per_thread ==0){ + if (vtcm_row_per_thread == 0) { FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size, spad_size_per_row * n_threads); return HTP_STATUS_VTCM_TOO_SMALL; @@ -733,7 +698,11 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) { octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; - if (src1->ne[0]) { + octx->src0_spad.src = NULL; + octx->src1_spad.src = NULL; + octx->dst_spad.src = NULL; + + if (src1) { FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, @@ -773,9 +742,9 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) { // Pointers and GLU logic const uint8_t * data_src0 = (const uint8_t *) src0->data; - const uint8_t * data_src1 = (const uint8_t *) src1->data; + const uint8_t * data_src1 = src1 ? 
(const uint8_t *) src1->data : NULL; - if (!src1_valid && (octx->op == HTP_OP_GLU_SWIGLU || octx->op == HTP_OP_GLU_SWIGLU_OAI || octx->op == HTP_OP_GLU_GEGLU)) { + if (!src1 && (octx->op == HTP_OP_GLU_SWIGLU || octx->op == HTP_OP_GLU_SWIGLU_OAI || octx->op == HTP_OP_GLU_GEGLU)) { const int32_t swapped = octx->op_params[1]; data_src1 = data_src0; actx.src1_row_size = actx.src0_row_size; @@ -799,7 +768,7 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) { int op_activations(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; - switch (octx->src0.type) { + switch (octx->src[0]->type) { case HTP_TYPE_F32: err = execute_op_activations_f32(octx); break; diff --git a/ggml/src/ggml-hexagon/htp/argsort-ops.c b/ggml/src/ggml-hexagon/htp/argsort-ops.c index 3ec26a4c..bdd06236 100644 --- a/ggml/src/ggml-hexagon/htp/argsort-ops.c +++ b/ggml/src/ggml-hexagon/htp/argsort-ops.c @@ -12,7 +12,7 @@ #include "hex-dma.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" #ifndef MIN @@ -175,8 +175,8 @@ static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = actx->octx; // Unpack context - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * dst = octx->dst; // Scratchpad memory uint8_t * spad = octx->src0_spad.data + octx->src0_spad.size_per_thread * i; @@ -249,16 +249,16 @@ static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) { int op_argsort(struct htp_ops_context * octx) { // Check supported types - if (octx->src0.type != HTP_TYPE_F32) { + if (octx->src[0]->type != HTP_TYPE_F32) { return HTP_STATUS_NO_SUPPORT; } - const uint32_t total_rows = octx->src0.ne[1] * octx->src0.ne[2] * octx->src0.ne[3]; + const uint32_t total_rows = octx->src[0]->ne[1] * octx->src[0]->ne[2] * octx->src[0]->ne[3]; const uint32_t n_threads = MIN(total_rows, 
octx->n_threads); // Allocate scratchpad // We need 1 row of float + 1 row of int32 per thread. - uint32_t ne00 = octx->src0.ne[0]; + uint32_t ne00 = octx->src[0]->ne[0]; size_t values_size = hex_round_up(ne00 * sizeof(float), 128); size_t indices_size = hex_round_up(ne00 * sizeof(int32_t), 128); size_t spad_per_thread = values_size + indices_size; @@ -278,9 +278,9 @@ int op_argsort(struct htp_ops_context * octx) { octx->src0_spad.size_per_thread = spad_per_thread; FARF(HIGH, "argsort: %ux%ux%ux%u -> %ux%ux%ux%u (0x%x, 0x%x)", - octx->src0.ne[0], octx->src0.ne[1], octx->src0.ne[2], octx->src0.ne[3], - octx->dst.ne[0], octx->dst.ne[1], octx->dst.ne[2], octx->dst.ne[3], - octx->src0.data, octx->dst.data); + octx->src[0]->ne[0], octx->src[0]->ne[1], octx->src[0]->ne[2], octx->src[0]->ne[3], + octx->dst->ne[0], octx->dst->ne[1], octx->dst->ne[2], octx->dst->ne[3], + octx->src[0]->data, octx->dst->data); struct htp_argsort_context actx; actx.octx = octx; diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c index 1b0f9749..52013ad0 100644 --- a/ggml/src/ggml-hexagon/htp/binary-ops.c +++ b/ggml/src/ggml-hexagon/htp/binary-ops.c @@ -14,7 +14,7 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" #ifndef MIN @@ -43,10 +43,10 @@ struct htp_binary_context { bool split_at_ne02; }; -#define htp_binary_preamble \ - const struct htp_tensor * src0 = &octx->src0; \ - const struct htp_tensor * src1 = &octx->src1; \ - struct htp_tensor * dst = &octx->dst; \ +#define htp_binary_preamble \ + const struct htp_tensor * src0 = octx->src[0]; \ + const struct htp_tensor * src1 = octx->src[1]; \ + const struct htp_tensor * dst = octx->dst; \ \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -181,7 +181,7 @@ static void binary_job_scalar(unsigned int nth, unsigned int ith, void * data) { struct htp_ops_context * octx = bctx->octx; 
htp_binary_preamble; - const uint32_t src0_type = octx->src0.type; + const uint32_t src0_type = octx->src[0]->type; const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16); const uint32_t total_rows = ne01 * ne02 * ne03; const uint32_t start_row = bctx->nrows_per_thread * ith; @@ -274,7 +274,7 @@ static void binary_job_vector_same_shape(unsigned int nth, unsigned int ith, voi struct htp_ops_context * octx = bctx->octx; htp_binary_preamble; - const uint32_t src0_type = octx->src0.type; + const uint32_t src0_type = octx->src[0]->type; const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16); const uint32_t total_rows = ne01 * ne02 * ne03; const uint32_t start_row = bctx->nrows_per_thread * ith; @@ -374,7 +374,7 @@ static void binary_job_vector_row_broadcast(unsigned int nth, unsigned int ith, struct htp_ops_context * octx = bctx->octx; htp_binary_preamble; - const uint32_t src0_type = octx->src0.type; + const uint32_t src0_type = octx->src[0]->type; const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? ne00 * sizeof(float) : ne00 * sizeof(_Float16); const uint32_t total_rows = ne01 * ne02 * ne03; const uint32_t start_row = bctx->nrows_per_thread * ith; @@ -455,7 +455,7 @@ static void binary_job_vector_complex(unsigned int nth, unsigned int ith, void * struct htp_ops_context * octx = bctx->octx; htp_binary_preamble; - const uint32_t src0_type = octx->src0.type; + const uint32_t src0_type = octx->src[0]->type; const uint32_t row_size_bytes = (src0_type == HTP_TYPE_F32) ? 
ne00 * sizeof(float) : ne00 * sizeof(_Float16); const uint32_t total_rows = ne01 * ne02 * ne03; const uint32_t start_row = bctx->nrows_per_thread * ith; @@ -540,7 +540,7 @@ static void binary_job_element_repeat(unsigned int nth, unsigned int ith, void * struct htp_ops_context * octx = bctx->octx; htp_binary_preamble; - const uint32_t src0_type = octx->src0.type; + const uint32_t src0_type = octx->src[0]->type; const uint32_t elem_size_bytes = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16); const uint32_t row_size_bytes = ne00 * elem_size_bytes;; const uint32_t total_rows = ne01 * ne02 * ne03; @@ -629,10 +629,10 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) { struct htp_binary_context * bctx = (struct htp_binary_context *) data; struct htp_ops_context * octx = bctx->octx; - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - const struct htp_tensor * src2 = &octx->src2; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * src2 = octx->src[2]; + const struct htp_tensor * dst = octx->dst; const uint32_t ne00 = src0->ne[0]; const uint32_t ne01 = src0->ne[1]; @@ -723,15 +723,15 @@ static void binary_job_add_id(unsigned int nth, unsigned int ith, void * data) { } static int execute_op_binary(struct htp_ops_context * octx) { - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * dst = octx->dst; const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; const uint32_t n_threads = MIN(octx->n_threads, src0_nrows); // Use packed row sizes for VTCM allocation - const uint32_t src0_type = octx->src0.type; + const uint32_t src0_type = octx->src[0]->type; 
const size_t elem_size = (src0_type == HTP_TYPE_F32) ? sizeof(float) : sizeof(_Float16); const size_t src0_row_size = src0->ne[0] * elem_size; const size_t src1_row_size = src1->ne[0] * elem_size; @@ -799,9 +799,9 @@ static int execute_op_binary(struct htp_ops_context * octx) { return HTP_STATUS_VTCM_TOO_SMALL; } - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL; if ((octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { return HTP_STATUS_OK; @@ -857,12 +857,12 @@ static int execute_op_binary(struct htp_ops_context * octx) { int op_binary(struct htp_ops_context * octx) { // Does not support permutations of src1 - const struct htp_tensor * src1 = &octx->src1; + const struct htp_tensor * src1 = octx->src[1]; if (src1->nb[1] < src1->nb[0]) { return HTP_STATUS_NO_SUPPORT; } - const uint32_t src0_type = octx->src0.type; + const uint32_t src0_type = octx->src[0]->type; if ((src0_type == HTP_TYPE_F32) || (src0_type == HTP_TYPE_F16)) { return execute_op_binary(octx); } diff --git a/ggml/src/ggml-hexagon/htp/cpy-ops.c b/ggml/src/ggml-hexagon/htp/cpy-ops.c index a40d866b..e5b9d350 100644 --- a/ggml/src/ggml-hexagon/htp/cpy-ops.c +++ b/ggml/src/ggml-hexagon/htp/cpy-ops.c @@ -11,7 +11,7 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" #include "hvx-utils.h" @@ -32,10 +32,10 @@ struct htp_copy_context { void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith); }; -#define cpy_preamble \ - struct htp_tensor *src0 = &octx->src0; \ - struct htp_tensor *dst = &octx->dst; 
\ - \ +#define cpy_preamble \ + const struct htp_tensor *src0 = octx->src[0]; \ + const struct htp_tensor *dst = octx->dst; \ + \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ const uint32_t ne02 = src0->ne[2]; \ diff --git a/ggml/src/ggml-hexagon/htp/cumsum-ops.c b/ggml/src/ggml-hexagon/htp/cumsum-ops.c index ce51555a..2ced1971 100644 --- a/ggml/src/ggml-hexagon/htp/cumsum-ops.c +++ b/ggml/src/ggml-hexagon/htp/cumsum-ops.c @@ -13,9 +13,9 @@ #include "hvx-utils.h" #include "hex-dma.h" -#define htp_cumsum_tensors_preamble \ - struct htp_tensor * restrict src0 = &octx->src0; \ - struct htp_tensor * restrict dst = &octx->dst; \ +#define htp_cumsum_tensors_preamble \ + const struct htp_tensor * restrict src0 = octx->src[0]; \ + const struct htp_tensor * restrict dst = octx->dst; \ \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -206,8 +206,8 @@ static void cumsum_thread_f32(unsigned int nth, unsigned int ith, void * data) { } int op_cumsum_f32(struct htp_ops_context * octx) { - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * dst = octx->dst; if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { return HTP_STATUS_OK; @@ -226,10 +226,12 @@ int op_cumsum_f32(struct htp_ops_context * octx) { octx->src0_spad.size_per_thread = src_row_size_aligned * 2; octx->dst_spad.size_per_thread = dst_row_size_aligned * 2; - octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; - octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + + octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; + octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + + octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL; + octx->dst_spad.data = 
octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.src = NULL; struct htp_cumsum_context cctx = { .octx = octx, @@ -251,8 +253,9 @@ int op_cumsum_f32(struct htp_ops_context * octx) { } int op_cumsum(struct htp_ops_context * octx) { - int err = HTP_STATUS_OK; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * dst = octx->dst; + + int err = HTP_STATUS_OK; switch (dst->type) { case HTP_TYPE_F32: diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c index 0c9bc785..d296a322 100644 --- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c @@ -15,7 +15,7 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" // Must be multiple of 32 @@ -278,12 +278,12 @@ static inline void hvx_scale_vec_f32_aa(uint8_t * restrict dst, const uint8_t * static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * data) { struct htp_fa_context * factx = (struct htp_fa_context *) data; const struct htp_ops_context * octx = factx->octx; - const struct htp_tensor * q = &octx->src0; - const struct htp_tensor * k = &octx->src1; - const struct htp_tensor * v = &octx->src2; - const struct htp_tensor * mask = (octx->src3.data) ? &octx->src3 : NULL; - const struct htp_tensor * sinks = (octx->src4.data) ? 
&octx->src4 : NULL; - const struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * q = octx->src[0]; + const struct htp_tensor * k = octx->src[1]; + const struct htp_tensor * v = octx->src[2]; + const struct htp_tensor * mask = octx->src[3]; + const struct htp_tensor * sinks = octx->src[4]; + const struct htp_tensor * dst = octx->dst; const uint32_t neq0 = q->ne[0]; const uint32_t neq1 = q->ne[1]; @@ -610,11 +610,11 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * } int op_flash_attn_ext(struct htp_ops_context * octx) { - const struct htp_tensor * q = &octx->src0; - const struct htp_tensor * k = &octx->src1; - const struct htp_tensor * v = &octx->src2; - const struct htp_tensor * mask = (octx->src3.data) ? &octx->src3 : NULL; - const struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * q = octx->src[0]; + const struct htp_tensor * k = octx->src[1]; + const struct htp_tensor * v = octx->src[2]; + const struct htp_tensor * mask = octx->src[3]; + const struct htp_tensor * dst = octx->dst; // Check support if ((q->type != HTP_TYPE_F16 && q->type != HTP_TYPE_F32) || k->type != HTP_TYPE_F16 || v->type != HTP_TYPE_F16) { @@ -701,13 +701,11 @@ int op_flash_attn_ext(struct htp_ops_context * octx) { return HTP_STATUS_VTCM_TOO_SMALL; } - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size; - octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size; - octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size; - - // FARF(ERROR, "fa: qrows-per-thread %u", factx.qrows_per_thread); + octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL; + octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->src2_spad.src = NULL; + octx->src3_spad.data = 
octx->src2_spad.data + octx->src2_spad.size; octx->src3_spad.src = NULL; + octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size; octx->dst_spad.src = NULL; if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { worker_pool_run_func(octx->ctx->worker_pool, flash_attn_ext_f16_thread, &factx, octx->n_threads); diff --git a/ggml/src/ggml-hexagon/htp/get-rows-ops.c b/ggml/src/ggml-hexagon/htp/get-rows-ops.c index 047d2850..5a1dc933 100644 --- a/ggml/src/ggml-hexagon/htp/get-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/get-rows-ops.c @@ -11,7 +11,7 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" #include "hvx-utils.h" @@ -23,27 +23,33 @@ struct get_rows_context { }; #define get_rows_preamble \ - const uint32_t ne00 = octx->src0.ne[0]; \ - const uint32_t ne01 = octx->src0.ne[1]; \ - const uint32_t ne02 = octx->src0.ne[2]; \ - const uint32_t ne03 = octx->src0.ne[3]; \ - \ - const uint32_t ne10 = octx->src1.ne[0]; \ - const uint32_t ne11 = octx->src1.ne[1]; \ - const uint32_t ne12 = octx->src1.ne[2]; \ - \ - const uint32_t nb01 = octx->src0.nb[1]; \ - const uint32_t nb02 = octx->src0.nb[2]; \ - const uint32_t nb03 = octx->src0.nb[3]; \ - \ - const uint32_t nb10 = octx->src1.nb[0]; \ - const uint32_t nb11 = octx->src1.nb[1]; \ - const uint32_t nb12 = octx->src1.nb[2]; \ - \ - const uint32_t nb1 = octx->dst.nb[1]; \ - const uint32_t nb2 = octx->dst.nb[2]; \ - const uint32_t nb3 = octx->dst.nb[3]; \ - \ + const uint32_t ne00 = octx->src[0]->ne[0]; \ + const uint32_t ne01 = octx->src[0]->ne[1]; \ + const uint32_t ne02 = octx->src[0]->ne[2]; \ + const uint32_t ne03 = octx->src[0]->ne[3]; \ + \ + const uint32_t ne10 = octx->src[1]->ne[0]; \ + const uint32_t ne11 = octx->src[1]->ne[1]; \ + const uint32_t ne12 = octx->src[1]->ne[2]; \ + const uint32_t ne13 = octx->src[1]->ne[3]; \ + \ + const uint32_t ne0 = octx->dst->ne[0]; \ + const uint32_t ne1 = octx->dst->ne[1]; \ + const uint32_t 
ne2 = octx->dst->ne[2]; \ + const uint32_t ne3 = octx->dst->ne[3]; \ + \ + const uint32_t nb01 = octx->src[0]->nb[1]; \ + const uint32_t nb02 = octx->src[0]->nb[2]; \ + const uint32_t nb03 = octx->src[0]->nb[3]; \ + \ + const uint32_t nb10 = octx->src[1]->nb[0]; \ + const uint32_t nb11 = octx->src[1]->nb[1]; \ + const uint32_t nb12 = octx->src[1]->nb[2]; \ + \ + const uint32_t nb1 = octx->dst->nb[1]; \ + const uint32_t nb2 = octx->dst->nb[2]; \ + const uint32_t nb3 = octx->dst->nb[3]; \ + \ const uint32_t nr = ne10 * ne11 * ne12; static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) { @@ -51,12 +57,14 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da struct htp_ops_context * octx = grctx->octx; get_rows_preamble; + uint64_t qt = HAP_perf_get_qtimer_count(); + // parallelize by src1 elements (which correspond to dst rows) const uint32_t dr = grctx->src1_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr; - const bool is_i32 = (octx->src1.type == HTP_TYPE_I32); + const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32); for (uint32_t i = ir0; i < ir1; ++i) { const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11); @@ -64,7 +72,7 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10); const uint32_t i10 = rem - i11 * ne10; - const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; + const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12; uint32_t i01 = is_i32 ? 
*(int32_t *)src1_addr : *(int64_t *)src1_addr; @@ -73,10 +81,14 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da continue; } - const uintptr_t src0_ptr = octx->src0.data + i01*nb01 + i11*nb02 + i12*nb03; - const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3; + const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03; + const uintptr_t dst_ptr = octx->dst->data + i10*nb1 + i11*nb2 + i12*nb3; hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); } + + qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt); + FARF(HIGH, "get-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt); } int op_get_rows(struct htp_ops_context * octx) { @@ -84,15 +96,15 @@ int op_get_rows(struct htp_ops_context * octx) { const uint32_t n_threads = MIN(nr, octx->n_threads); - if (octx->src0.type != HTP_TYPE_F32) { + if (octx->src[0]->type != HTP_TYPE_F32) { return HTP_STATUS_NO_SUPPORT; } - if (octx->dst.type != HTP_TYPE_F32) { + if (octx->dst->type != HTP_TYPE_F32) { return HTP_STATUS_NO_SUPPORT; } - if (octx->src1.type != HTP_TYPE_I32 && octx->src1.type != HTP_TYPE_I64) { + if (octx->src[1]->type != HTP_TYPE_I32 && octx->src[1]->type != HTP_TYPE_I64) { return HTP_STATUS_NO_SUPPORT; } @@ -102,8 +114,8 @@ int op_get_rows(struct htp_ops_context * octx) { struct get_rows_context grctx; grctx.octx = octx; - grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]); - grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]); + grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src[1]->ne[0]); + grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src[1]->ne[0] * octx->src[1]->ne[1]); grctx.src1_nrows_per_thread = (nr + n_threads - 1) / n_threads; diff --git a/ggml/src/ggml-hexagon/htp/hex-utils.h 
b/ggml/src/ggml-hexagon/htp/hex-utils.h index 8ed1456b..fe0b661e 100644 --- a/ggml/src/ggml-hexagon/htp/hex-utils.h +++ b/ggml/src/ggml-hexagon/htp/hex-utils.h @@ -3,8 +3,10 @@ #include #include +#include #include "hexagon_types.h" +#include "hexagon_protos.h" #include "hex-fastdiv.h" #include "hex-dump.h" @@ -68,4 +70,29 @@ static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, Q6_l2fetch_AP((void *) p, control); } +#define HEX_L2_LINE_SIZE 64 +#define HEX_L2_FLUSH_SIZE (128 * 1024) + +// Clean and invalidate the data-cache lines backing [addr, addr + size). +// Sizes above HEX_L2_FLUSH_SIZE are handled with a single whole-cache flush+invalidate. +// The per-line loop is unrolled by four, so up to three lines past the end of the +// range may be cleaned as well. +static inline void hex_l2flush(void * addr, size_t size) +{ + if (size > HEX_L2_FLUSH_SIZE) { + qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE); + } else { + // Start from the line that contains addr: stepping from an unaligned address + // would skip the last line of the range. + const uintptr_t s = (uintptr_t) addr & ~(uintptr_t) (HEX_L2_LINE_SIZE - 1); + const uintptr_t e = (uintptr_t) addr + size; + for (uintptr_t i = s; i < e; i += HEX_L2_LINE_SIZE * 4) { + Q6_dccleaninva_A((void *) (i + HEX_L2_LINE_SIZE * 0)); + Q6_dccleaninva_A((void *) (i + HEX_L2_LINE_SIZE * 1)); + Q6_dccleaninva_A((void *) (i + HEX_L2_LINE_SIZE * 2)); + Q6_dccleaninva_A((void *) (i + HEX_L2_LINE_SIZE * 3)); + } + } +} + #endif /* HEX_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c index 4ff2b36d..ec191c14 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c @@ -20,7 +20,7 @@ #include "hvx-dump.h" #include "worker-pool.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "hmx-utils.h" #include "hmx-ops.h" @@ -821,7 +821,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu // and each q_head is computed individually to avoid tile-major packing // issues. m_chunk_n_rows is always a multiple of 32 (from // hmx_compute_chunks), so per-head tile arrays don't overlap.
- const size_t vtcm_budget = ctx->vtcm_scratch_size; + const size_t vtcm_budget = ctx->vtcm_size; const size_t vec_dot_size = params->k * sizeof(__fp16); // When the activation has a large stride (e.g. permuted Q tensor with @@ -998,7 +998,7 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co } // --- Dynamic VTCM layout --- - const size_t vtcm_budget = ctx->vtcm_scratch_size; + const size_t vtcm_budget = ctx->vtcm_size; const size_t vec_dot_size = k * sizeof(__fp16); // DMA-based activation gather for strided tensors (see batched path comment). @@ -1182,7 +1182,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds FARF(MEDIUM, "hmx_matmul_qk: STANDARD path m=%d k=%d n=%d type=%d", m, k, n, weight_type); // --- Dynamic VTCM layout --- - const size_t vtcm_budget = ctx->vtcm_scratch_size; + const size_t vtcm_budget = ctx->vtcm_size; const size_t vec_dot_size = k * sizeof(__fp16); const bool use_pipeline = (m >= 128) && (k <= n); @@ -1273,9 +1273,6 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds void *buf_curr = vtcm_scratch0; void *buf_next = vtcm_scratch1; - // issue async DDR data transfer for the first weight chunk - // NOTE: use 2D DMA (n_cols rows x row_stride bytes) instead of 1D - // because UDMA roiwidth is 16-bit and total size can exceed 65535. 
{ const size_t n_cols_first = hex_smin(n, n_chunk_n_cols); dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first); @@ -1533,20 +1530,15 @@ void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *dst, co worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, ctx->n_threads); } -int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m, - int k, int n, int weight_type) { - // Runtime check -- k >= 16384 exceeds 2D DMA limit - if (k >= 16384) { - FARF(HIGH, "%s: k=%d exceeds 2D DMA limit", __func__, k); - return -1; - } +int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, + int m, int k, int n, int weight_type) { // assume k % 32 == 0 && n % 32 == 0 const size_t row_stride = get_x4x2_row_stride(weight_type, k); if (row_stride == 0) { return -1; } - const size_t vtcm_budget = ctx->vtcm_scratch_size; + const size_t vtcm_budget = ctx->vtcm_size; const size_t M_BLOCK_SIZE = 512; const size_t N_BLOCK_SIZE = 512; @@ -1576,8 +1568,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget); - FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", - __func__, m, k, n, weight_type, + FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", __func__, m, k, n, weight_type, (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget); // initialize eye tile (32x32 identity matrix) diff --git a/ggml/src/ggml-hexagon/htp/hmx-ops.h b/ggml/src/ggml-hexagon/htp/hmx-ops.h index b36c8d12..fb95d36f 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-ops.h +++ b/ggml/src/ggml-hexagon/htp/hmx-ops.h @@ -7,16 +7,12 @@ #include #include -#ifndef restrict -# define 
restrict __restrict -#endif +#include "htp-ops.h" #ifdef __cplusplus extern "C" { #endif -struct htp_context; // forward declaration - typedef struct { float *dst; const float *activation; diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 6f1917fa..4c36a6ea 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -2,6 +2,7 @@ #define HTP_CTX_H #include "hex-dma.h" +#include "htp-ops.h" #include "worker-pool.h" #include @@ -10,38 +11,85 @@ #include #define HTP_MAX_NTHREADS 10 +#define HTP_MAX_MMAPS 16 + +// Memory mapping +struct htp_mmap { + uint64_t size; + uint64_t base; + uint32_t fd; + uint32_t pinned; +}; + +// Scratchpad state +struct htp_spad { + const struct htp_tensor * src; // original src of the data (for reuse) + uint8_t * data; // pointer to an area in vtcm + uint32_t stride; // stride used inside this spad + uint32_t size; // total size + uint32_t size_per_thread; // size per thread +}; + +// Context while processing an Op +// TODO: fold this into the main context +struct htp_ops_context { + struct htp_context * ctx; + + enum htp_op_code op; // FIXME: rename to opcode + int32_t op_params[HTP_OP_MAX_PARAMS]; + + const struct htp_tensor * src[HTP_OP_MAX_INPUTS]; + const struct htp_tensor * dst; + + // TODO convert these to an array + struct htp_spad src0_spad; + struct htp_spad src1_spad; + struct htp_spad src2_spad; + struct htp_spad src3_spad; + struct htp_spad dst_spad; + + uint32_t n_threads; + uint32_t flags; +}; // Main context for htp DSP backend struct htp_context { - dspqueue_t queue; - dma_queue * dma[HTP_MAX_NTHREADS]; - worker_pool_context_t worker_pool; - uint32_t n_threads; + dspqueue_t queue; + dma_queue * dma[HTP_MAX_NTHREADS]; + struct htp_mmap mmap[HTP_MAX_MMAPS]; + worker_pool_context_t worker_pool; + uint32_t n_threads; - int thread_id; - int thread_prio; + int thread_id; + int thread_prio; - uint8_t * vtcm_base; - size_t vtcm_size; - uint32_t vtcm_rctx; 
+ int hmx_enabled; - atomic_bool vtcm_valid; - atomic_bool vtcm_inuse; - atomic_bool vtcm_needs_release; + uint8_t * vtcm_base; + size_t vtcm_size; + uint32_t vtcm_rctx; + atomic_bool vtcm_valid; + atomic_bool vtcm_needs_release; - uint32_t opmask; - - // Cached src1 spad position from the last quantize pass. - // When SKIP_QUANTIZE is set the Q8 activation data is already in VTCM - // at this address; the matmul must read from here instead of recomputing - // the offset (which depends on the current op's src0 size). - uint8_t * prev_src1_spad; - - // HMX acceleration fields (v73+, enabled by compile-time HTP_HAS_HMX) -#ifdef HTP_HAS_HMX - int hmx_enabled; // Runtime flag: HMX initialisation succeeded - size_t vtcm_scratch_size; // Usable dynamic scratch (vtcm_size minus tail reservation) -#endif + struct htp_ops_context octx; }; +int op_matmul(struct htp_ops_context * octx); +int op_matmul_id(struct htp_ops_context * octx); +int op_binary(struct htp_ops_context * octx); +int op_unary(struct htp_ops_context * octx); +int op_sum_rows(struct htp_ops_context * octx); +int op_activations(struct htp_ops_context * octx); +int op_softmax(struct htp_ops_context * octx); +int op_add_id(struct htp_ops_context * octx); +int op_rope(struct htp_ops_context * octx); +int op_flash_attn_ext(struct htp_ops_context * octx); +int op_set_rows(struct htp_ops_context * octx); +int op_get_rows(struct htp_ops_context * octx); +int op_cpy(struct htp_ops_context * octx); +int op_repeat(struct htp_ops_context * octx); +int op_argsort(struct htp_ops_context * octx); +int op_ssm_conv(struct htp_ops_context * octx); +int op_cumsum(struct htp_ops_context * octx); + #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h deleted file mode 100644 index df0ea7cc..00000000 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ /dev/null @@ -1,166 +0,0 @@ -#ifndef HTP_MSG_H -#define HTP_MSG_H - -#include - -// ggml-common.h must be included prio to this 
header - -// Mask to enable various stages of the Ops. -// Used for debugging and profiling. -enum { - HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP) - HTP_OPMASK_QUANTIZE = (1 << 1), // Enable Quantize - HTP_OPMASK_COMPUTE = (1 << 2), // Enable Compute -}; - -// Op flags -enum { - HTP_OPFLAGS_SKIP_QUANTIZE = (1 << 0), // Skip dynamic quantization (reuse quantized tensors) - HTP_OPFLAGS_SKIP_COMPUTE = (1 << 1), // Skip actual computation (used for profiling) - HTP_OPFLAGS_EARLY_WAKEUP = (1 << 2) // Send early wakeup notification -}; - -enum htp_status { - HTP_STATUS_OK = 1, - HTP_STATUS_INTERNAL_ERR = 2, - HTP_STATUS_NO_SUPPORT = 3, - HTP_STATUS_INVAL_PARAMS = 4, - HTP_STATUS_VTCM_TOO_SMALL = 5, -}; - -// The values must match the ggml_type. -// Duplicated here because we can't include full ggml.h in the htp build. -// We have some static_asserts in the cpp code to ensure things are in sync. -enum htp_data_type { - HTP_TYPE_F32 = 0, - HTP_TYPE_F16 = 1, - HTP_TYPE_Q4_0 = 2, - HTP_TYPE_Q8_0 = 8, - HTP_TYPE_IQ4_NL = 20, - HTP_TYPE_I32 = 26, - HTP_TYPE_I64 = 27, - HTP_TYPE_MXFP4 = 39, - HTP_TYPE_COUNT -}; - -// Do not reorder first 4 (used as an index) -enum htp_op { - HTP_OP_MUL = 0, - HTP_OP_ADD = 1, - HTP_OP_SUB = 2, - HTP_OP_DIV = 3, - HTP_OP_MUL_MAT, - HTP_OP_MUL_MAT_ID, - HTP_OP_RMS_NORM, - HTP_OP_UNARY_SILU, - HTP_OP_UNARY_GELU, - HTP_OP_UNARY_SIGMOID, - HTP_OP_UNARY_EXP, - HTP_OP_UNARY_NEG, - HTP_OP_UNARY_SOFTPLUS, - HTP_OP_GLU_SWIGLU, - HTP_OP_GLU_SWIGLU_OAI, - HTP_OP_GLU_GEGLU, - HTP_OP_SOFTMAX, - HTP_OP_ADD_ID, - HTP_OP_ROPE, - HTP_OP_FLASH_ATTN_EXT, - HTP_OP_SET_ROWS, - HTP_OP_GET_ROWS, - HTP_OP_SCALE, - HTP_OP_CPY, - HTP_OP_ARGSORT, - HTP_OP_SQR, - HTP_OP_SQRT, - HTP_OP_SUM_ROWS, - HTP_OP_SSM_CONV, - HTP_OP_REPEAT, - HTP_OP_CUMSUM, - INVALID -}; - -static inline size_t htp_t_block_size(uint32_t t) { - switch (t) { - case HTP_TYPE_F32: - return 1; - case HTP_TYPE_F16: - return 1; - case HTP_TYPE_Q4_0: - return QK4_0; - case 
HTP_TYPE_Q8_0: - return QK8_0; - case HTP_TYPE_IQ4_NL: - return QK4_NL; - case HTP_TYPE_MXFP4: - return QK_MXFP4; - default: - assert(0 && "unsupported HTP data type"); - } - return 0; -} - -static inline size_t htp_type_nbytes(uint32_t t) { - switch (t) { - case HTP_TYPE_F32: - return 4; - case HTP_TYPE_F16: - return 2; - case HTP_TYPE_Q4_0: - return sizeof(block_q4_0); - case HTP_TYPE_Q8_0: - return sizeof(block_q8_0); - case HTP_TYPE_IQ4_NL: - return sizeof(block_iq4_nl); - case HTP_TYPE_MXFP4: - return sizeof(block_mxfp4); - default: - assert(0 && "unsupported HTP data type"); - } - return 0; -} - -// Internal types -#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128) -#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks -#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks - -#define HTP_MAX_DIMS 4 - -struct htp_tensor { - uint32_t data; // Buffer offset in the messages, and data pointer on the NSP - uint32_t type; // Data type - uint32_t ne[HTP_MAX_DIMS]; // Number of elements - uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) -}; - -#define HTP_MAX_OP_PARAMS 64 - -struct htp_general_req { - uint32_t op; // GGML/HTP Op - int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)]; - // Params for the op, e.g. epsilon of RMS norm - uint32_t flags; // Request flags - - struct htp_tensor src0; // Input0 tensor - struct htp_tensor src1; // Input1 tensor - struct htp_tensor src2; // Input2 tensor - struct htp_tensor src3; // Input3 tensor - struct htp_tensor src4; // Input4 tensor - struct htp_tensor dst; // Output tensor - - // should be multiple of 64 bytes (cacheline) -}; - -struct htp_general_rsp { - uint32_t op; // GGML/HTP Op - uint32_t status; // HTP_STATUS_... 
- uint32_t prof_usecs; // Number of usec per request - uint32_t prof_cycles; // Number of cycles per request - uint32_t prof_pkts; // Number of instruction packets per request - uint8_t unused[44]; // Pad to 64 bytes -}; - -#define HTP_MAX_MESSAGE_SIZE sizeof(struct htp_general_req) -#define HTP_MAX_PACKET_BUFFERS 8 - -#endif /* HTP_MSG_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index d35decaa..44a6ab4f 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -1,65 +1,154 @@ #ifndef HTP_OPS_H #define HTP_OPS_H -#include "htp-ctx.h" -#include "htp-msg.h" -#include "worker-pool.h" - #include -#include -#include +// ggml-common.h must be included prio to this header -// ggml-common.h must be included prior to this header - -struct htp_spad { - uint8_t * data; - size_t stride; - size_t size; - size_t size_per_thread; +enum htp_status { + HTP_STATUS_OK = 1, + HTP_STATUS_INTERNAL_ERR = 2, + HTP_STATUS_NO_SUPPORT = 3, + HTP_STATUS_INVAL_PARAMS = 4, + HTP_STATUS_VTCM_TOO_SMALL = 5, }; -struct htp_ops_context { - struct htp_context * ctx; +// First set of values must match the ggml_type. +// Duplicated here because we can't include full ggml.h in the htp build. +// We have some static_asserts in the cpp code to ensure things are in sync. 
+enum htp_data_type { + HTP_TYPE_F32 = 0, + HTP_TYPE_F16 = 1, + HTP_TYPE_Q4_0 = 2, + HTP_TYPE_Q8_0 = 8, + HTP_TYPE_IQ4_NL = 20, + HTP_TYPE_I32 = 26, + HTP_TYPE_I64 = 27, + HTP_TYPE_MXFP4 = 39, - enum htp_op op; - int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)]; + // types used internally for repack, dyn.quant, etc + HTP_TYPE_Q4_0x4x2 = 200, + HTP_TYPE_Q8_0x4x2, + HTP_TYPE_MXFP4x4x2, - struct htp_tensor src0; - struct htp_tensor src1; - struct htp_tensor src2; - struct htp_tensor src3; - struct htp_tensor src4; - struct htp_tensor dst; - - struct htp_spad src0_spad; - struct htp_spad src1_spad; - struct htp_spad src2_spad; - struct htp_spad src3_spad; - struct htp_spad dst_spad; - - worker_pool_context_t * wpool; // worker pool - uint32_t n_threads; // num threads - - uint32_t flags; + HTP_TYPE_INVALID }; -int op_matmul(struct htp_ops_context * octx); -int op_matmul_id(struct htp_ops_context * octx); -int op_binary(struct htp_ops_context * octx); -int op_unary(struct htp_ops_context * octx); -int op_sum_rows(struct htp_ops_context * octx); -int op_activations(struct htp_ops_context * octx); -int op_softmax(struct htp_ops_context * octx); -int op_add_id(struct htp_ops_context * octx); -int op_rope(struct htp_ops_context * octx); -int op_flash_attn_ext(struct htp_ops_context * octx); -int op_set_rows(struct htp_ops_context * octx); -int op_get_rows(struct htp_ops_context * octx); -int op_cpy(struct htp_ops_context * octx); -int op_repeat(struct htp_ops_context * octx); -int op_argsort(struct htp_ops_context * octx); -int op_ssm_conv(struct htp_ops_context * octx); -int op_cumsum(struct htp_ops_context * octx); +// Constats for internal types +#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128) +#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks +#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks + + +// Mask to enable various stages of the Ops. 
+// Used for debugging and profiling. +enum htp_op_mask { + HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP) + HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute +}; + +// Do not reorder first 4 (used as an index) +enum htp_op_code { + HTP_OP_MUL = 0, + HTP_OP_ADD = 1, + HTP_OP_SUB = 2, + HTP_OP_DIV = 3, + HTP_OP_MUL_MAT, + HTP_OP_MUL_MAT_ID, + HTP_OP_RMS_NORM, + HTP_OP_UNARY_SILU, + HTP_OP_UNARY_GELU, + HTP_OP_UNARY_SIGMOID, + HTP_OP_UNARY_EXP, + HTP_OP_UNARY_NEG, + HTP_OP_UNARY_SOFTPLUS, + HTP_OP_GLU_SWIGLU, + HTP_OP_GLU_SWIGLU_OAI, + HTP_OP_GLU_GEGLU, + HTP_OP_SOFTMAX, + HTP_OP_ADD_ID, + HTP_OP_ROPE, + HTP_OP_FLASH_ATTN_EXT, + HTP_OP_SET_ROWS, + HTP_OP_GET_ROWS, + HTP_OP_SCALE, + HTP_OP_CPY, + HTP_OP_ARGSORT, + HTP_OP_SQR, + HTP_OP_SQRT, + HTP_OP_SUM_ROWS, + HTP_OP_SSM_CONV, + HTP_OP_REPEAT, + HTP_OP_CUMSUM, + + HTP_OP_INVALID +}; + +#define HTP_OP_MAX_DIMS 4 // aka GGML_MAX_DIMS +#define HTP_OP_MAX_INPUTS 6 // aka GGML_MAX_SRCS +#define HTP_OP_MAX_PARAMS 16 // aka GGML_MAX_OP_PARAMS + +#define HTP_OP_MAX_BUFS 8 +#define HTP_OP_MAX_REQS 256 +#define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS) +#define HTP_OP_MAX_VMEM (3221225472u) + +enum htp_tensor_flags { + HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights) + HTP_TENSOR_FLUSHED = (1U << 1) // Tensor buffer has been flushed (set by the NPU) +}; + +// Tensor descriptor +struct htp_tensor { + uint32_t data; // Buffer offset in the messages, and data pointer on the NPU + uint32_t size; // Data size in bytes + uint32_t flags; // Buffer / tensor flags + uint16_t type; // Data type + uint16_t bi; // Buffer index + uint32_t ne[HTP_OP_MAX_DIMS]; // Number of elements + uint32_t nb[HTP_OP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) +}; + +// Buffer descriptor +struct htp_buf_desc { + uint64_t base; // base address + uint64_t size; // total size + uint32_t flags; // buffer flags (unused) + uint32_t fd; // file descriptor +}; 
+ +enum htp_op_flags { + HTP_OPFLAGS_SKIP_COMPUTE = (1U << 0), // Skip actual computation (used for profiling) +}; + +// Op descriptor +struct htp_op_desc { + uint32_t opcode; // GGML/HTP Op + uint32_t flags; // Op flags + int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm + uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices + uint16_t dst; // Output tensor index + + // the rest is filled in-place by the NPU + uint32_t prof_usecs; // Number of usec per request + uint32_t prof_cycles; // Number of cycles per request + uint32_t prof_pkts; // Number of instruction packets per request + uint32_t unused; +}; + +struct htp_opbatch_req { + uint32_t n_bufs; // Number of buffers + uint32_t n_tensors; // Number of tensors + uint32_t n_ops; // Number of ops + uint32_t flags; // unused + // struct htp_buf_desc bufs[]; -- dspqueue buf 0 + // struct htp_tensor tensors[]; -- dspqueue buf 0 + // struct htp_op_desc ops[]; -- dspqueue buf 0 +}; + +struct htp_opbatch_rsp { + uint32_t status; // HTP_STATUS_... 
+ // struct htp_op_req ops[]; -- dspqueue buf 0 +}; #endif /* HTP_OPS_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp_iface.idl b/ggml/src/ggml-hexagon/htp/htp_iface.idl index 2dc716cb..3eb5d5a6 100644 --- a/ggml/src/ggml-hexagon/htp/htp_iface.idl +++ b/ggml/src/ggml-hexagon/htp/htp_iface.idl @@ -9,6 +9,8 @@ interface htp_iface : remote_handle64 { AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx); AEEResult stop(); + AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned); + AEEResult munmap(in uint32 fd); AEEResult enable_etm(); AEEResult disable_etm(); }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 6f37bf9d..8b347039 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -1,5 +1,7 @@ #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" #pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" #include #include @@ -12,6 +14,7 @@ #include #include #include +#include #include #include @@ -21,14 +24,10 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" #include "worker-pool.h" -#ifdef HTP_HAS_HMX -#include "hmx-ops.h" -#endif // HTP_HAS_HMX - AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { struct htp_context * ctx; int err = 0; @@ -38,7 +37,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { return AEE_ENOMEMORY; } - // Use the context structure as a handle + // Use the context structure as the handle *handle = (remote_handle64) ctx; // Enable FARF logs @@ -115,6 +114,16 @@ AEEResult htp_iface_close(remote_handle64 handle) { return AEE_EITEMBUSY; } + // release the mmaps (if any) + for (uint32_t i=0; immap[i].size) { + HAP_munmap2((void *) ctx->mmap[i].base, ctx->mmap[i].size); + 
ctx->mmap[i].size = 0; + ctx->mmap[i].base = NULL; + ctx->mmap[i].fd = -1; + } + } + free(ctx); return AEE_SUCCESS; } @@ -143,66 +152,93 @@ AEEResult htp_iface_disable_etm(remote_handle64 handle) { return err; } -static int vtcm_acquire(struct htp_context * ctx) { - int err; - if (!ctx->vtcm_valid) { - // Temporarily bump thread priority to make sure it's higher than other sessions. - // This way the resource manager will notify the other thread to release VTCM. - // Note that we need to reaquire VTCM at normal priority for this to work next time. - qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10); - err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); - if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); - abort(); - } - HAP_compute_res_release_cached(ctx->vtcm_rctx); - qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio); - - err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); - if (err != 0) { - FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err); - abort(); - } - ctx->vtcm_valid = true; +AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) { + struct htp_context * ctx = (struct htp_context *) handle; + if (!ctx) { + return AEE_EBADPARM; } - ctx->vtcm_inuse = true; + // See if we already have this mapping + for (uint32_t i=0; immap[i]; + if (m->fd == fd) { + m->pinned = pinned; + return AEE_SUCCESS; + } + } + // Add new mapping + for (uint32_t i=0; immap[i]; + if (!m->size) { + FARF(HIGH, "mmap : fd %u size %u pinned %u", fd, size, pinned); + void *va = HAP_mmap2(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0); + if (va == (void*)-1) { + FARF(ERROR, "mmap failed : va %p fd %u size %u", va, fd, (uint32_t) size); + return AEE_EFAILED; + } - return 0; + m->base = (uint64_t) va; + m->fd = fd; + m->size = size; + m->pinned = pinned; + + return AEE_SUCCESS; + } + } + + return AEE_ENOMEMORY; } -static int vtcm_release(struct htp_context * 
ctx) { - ctx->vtcm_inuse = false; +AEEResult htp_iface_munmap(remote_handle64 handle, int fd) { + struct htp_context * ctx = (struct htp_context *) handle; + if (!ctx) { + return AEE_EBADPARM; + } - if (ctx->vtcm_valid && ctx->vtcm_needs_release) { + for (uint32_t i=0; immap[i]; + if (fd < 0 || m->fd == fd) { + FARF(HIGH, "unmmap : base %p fd %u size %u", (void*) m->base, m->fd, (uint32_t) m->size); + HAP_munmap2((void *) m->base, m->size); + m->size = 0; + m->base = NULL; + m->fd = -1; + m->pinned = 0; + } + } + + return AEE_SUCCESS; +} + +static void vtcm_acquire(struct htp_context * ctx) { + if (!ctx->vtcm_valid) { + int err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000u); + if (err != 0) { + FARF(ERROR, "ggml-hex: failed to acquire VTCM: 0x%08x", (unsigned)err); + abort(); + } + + ctx->vtcm_needs_release = false; + ctx->vtcm_valid = true; + + // Drop the priority to make sure we get the release callback from other GGML-HTP and QNN-HTP sessions + HAP_compute_res_update_priority(ctx->vtcm_rctx, ctx->thread_prio + 10); + } +} + +static void vtcm_release(struct htp_context * ctx) { + if (ctx->vtcm_valid) { ctx->vtcm_valid = false; ctx->vtcm_needs_release = false; HAP_compute_res_release_cached(ctx->vtcm_rctx); } - - return 0; } static int vtcm_release_callback(unsigned int rctx, void * state) { struct htp_context * ctx = (struct htp_context *) state; - - if (!ctx || ctx->vtcm_rctx != rctx) { - return AEE_EBADPARM; - } - - // If VTCM is not inuse (not processing Ops) release it right here - // otherwise we'll release it once we're done with the current Op. 
- - if (ctx->vtcm_inuse) { - ctx->vtcm_needs_release = true; - return 0; - } - - ctx->vtcm_valid = false; - HAP_compute_res_release_cached(ctx->vtcm_rctx); - + ctx->vtcm_needs_release = true; return 0; } @@ -236,7 +272,6 @@ static int vtcm_alloc(struct htp_context * ctx) { ctx->vtcm_size = vtcm_size; ctx->vtcm_rctx = rctx; ctx->vtcm_valid = false; - ctx->vtcm_inuse = false; ctx->vtcm_needs_release = false; return 0; @@ -288,18 +323,8 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que } #ifdef HTP_HAS_HMX - if (use_hmx) { - ctx->vtcm_scratch_size = ctx->vtcm_size; - ctx->hmx_enabled = 1; - - FARF(HIGH, "HMX enabled: vtcm-scratch %zu", ctx->vtcm_scratch_size); - } else { - // HMX disabled: skip HMX initialisation so the - // dispatch loop falls through to the HVX compute paths. - ctx->hmx_enabled = 0; - ctx->vtcm_scratch_size = ctx->vtcm_size; - FARF(HIGH, "HMX disabled (use_hmx=0): vtcm-scratch %zu", ctx->vtcm_scratch_size); - } + ctx->hmx_enabled = use_hmx; + FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? 
"enabled" : "disabled", use_hmx); #endif qurt_sysenv_max_hthreads_t hw_threads; @@ -362,12 +387,10 @@ AEEResult htp_iface_stop(remote_handle64 handle) { for (int i = 0; i < ctx->n_threads; i++) { dma_queue_delete(ctx->dma[i]); } -#ifdef HTP_HAS_HMX - if (ctx->hmx_enabled) { - ctx->hmx_enabled = 0; - } -#endif +#ifdef HTP_HAS_HMX + ctx->hmx_enabled = 0; +#endif vtcm_free(ctx); @@ -397,1129 +420,320 @@ static inline void profile_stop(struct profile_data * d) { d->pkts = hex_get_pktcnt() - d->pkts; } -static int send_htp_rsp(struct htp_context * c, - uint32_t op, - uint32_t status, - struct dspqueue_buffer * bufs, - size_t n_bufs, - struct profile_data * prof) { - // Prep response struct (zero-init to clear cmp/unused union) - struct htp_general_rsp rsp; - memset(&rsp, 0, sizeof(rsp)); - rsp.op = op; - rsp.status = status; - rsp.prof_usecs = prof->usecs; - rsp.prof_cycles = prof->cycles; - rsp.prof_pkts = prof->pkts; +static int execute_op(struct htp_ops_context * octx) { + switch (octx->op) { + case HTP_OP_MUL_MAT: + return op_matmul(octx); - int err = dspqueue_write(c->queue, - 0, // Flags - n_bufs, - bufs, // Buffer references - sizeof(rsp), - (const uint8_t *) &rsp, // Message - DSPQUEUE_TIMEOUT_NONE); + case HTP_OP_MUL_MAT_ID: + return op_matmul_id(octx); - if (err != 0) { - FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err); + case HTP_OP_MUL: + case HTP_OP_ADD: + case HTP_OP_SUB: + case HTP_OP_DIV: + case HTP_OP_ADD_ID: + return op_binary(octx); + + case HTP_OP_RMS_NORM: + case HTP_OP_SCALE: + case HTP_OP_SQR: + case HTP_OP_SQRT: + case HTP_OP_UNARY_SOFTPLUS: + case HTP_OP_UNARY_SIGMOID: + case HTP_OP_UNARY_NEG: + case HTP_OP_UNARY_EXP: + return op_unary(octx); + + case HTP_OP_UNARY_SILU: + case HTP_OP_UNARY_GELU: + case HTP_OP_GLU_SWIGLU: + case HTP_OP_GLU_SWIGLU_OAI: + case HTP_OP_GLU_GEGLU: + return op_activations(octx); + + case HTP_OP_SOFTMAX: + return op_softmax(octx); + + case HTP_OP_ROPE: + return op_rope(octx); + + case HTP_OP_FLASH_ATTN_EXT: 
+ return op_flash_attn_ext(octx); + + case HTP_OP_SET_ROWS: + return op_set_rows(octx); + + case HTP_OP_GET_ROWS: + return op_get_rows(octx); + + case HTP_OP_SUM_ROWS: + return op_sum_rows(octx); + + case HTP_OP_CPY: + return op_cpy(octx); + + case HTP_OP_REPEAT: + return op_repeat(octx); + + case HTP_OP_ARGSORT: + return op_argsort(octx); + + case HTP_OP_SSM_CONV: + return op_ssm_conv(octx); + + case HTP_OP_CUMSUM: + return op_cumsum(octx); + + case HTP_OP_INVALID: + break; + + // No default to catch missing cases } - return err; + FARF(ERROR, "Unknown Op %u", octx->op); + return -1; } -static void proc_matmul_req(struct htp_context * ctx, - struct htp_general_req * req, - struct dspqueue_buffer * bufs, - size_t n_bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[2].fd; - rsp_bufs[0].ptr = bufs[2].ptr; - rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.src1 = req->src1; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.dst.data = (uint32_t) bufs[2].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_matmul(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_argsort_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output 
buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[1].fd; - rsp_bufs[0].ptr = bufs[1].ptr; - rsp_bufs[0].offset = bufs[1].offset; - rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.dst.data = (uint32_t) bufs[1].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_argsort(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_cpy_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[1].fd; - rsp_bufs[0].ptr = bufs[1].ptr; - rsp_bufs[0].offset = bufs[1].offset; - rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.dst.data = (uint32_t) bufs[1].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_cpy(&octx); 
- vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_repeat_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[1].fd; - rsp_bufs[0].ptr = bufs[1].ptr; - rsp_bufs[0].offset = bufs[1].offset; - rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.dst.data = (uint32_t) bufs[1].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = op_repeat(&octx); - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[2].fd; - rsp_bufs[0].ptr = bufs[2].ptr; - rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.src1 = req->src1; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.dst.data = (uint32_t) bufs[2].ptr; - 
octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_get_rows(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_matmul_id_req(struct htp_context * ctx, - struct htp_general_req * req, - struct dspqueue_buffer * bufs, - size_t n_bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[3].fd; - rsp_bufs[0].ptr = bufs[3].ptr; - rsp_bufs[0].size = bufs[3].size; - rsp_bufs[0].offset = bufs[3].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.src1 = req->src1; - octx.src2 = req->src2; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.src2.data = (uint32_t) bufs[2].ptr; - octx.dst.data = (uint32_t) bufs[3].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_matmul_id(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[2].fd; - rsp_bufs[0].ptr = bufs[2].ptr; - rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].size = bufs[2].size; - 
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.src1 = req->src1; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.dst.data = (uint32_t) bufs[2].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_binary(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[3].fd; - rsp_bufs[0].ptr = bufs[3].ptr; - rsp_bufs[0].offset = bufs[3].offset; - rsp_bufs[0].size = bufs[3].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.src1 = req->src1; - octx.src2 = req->src2; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.src2.data = (uint32_t) bufs[2].ptr; - octx.dst.data = (uint32_t) bufs[3].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_binary(&octx); - vtcm_release(ctx); - } - - 
profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[1].fd; - rsp_bufs[0].ptr = bufs[1].ptr; - rsp_bufs[0].offset = bufs[1].offset; - rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.dst.data = (uint32_t) bufs[1].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_unary(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_sum_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[1].fd; - rsp_bufs[0].ptr = bufs[1].ptr; - rsp_bufs[0].offset = bufs[1].offset; - rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; 
- - memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.dst.data = (uint32_t) bufs[1].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_sum_rows(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_ssm_conv_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; - - // We've written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[2].fd; - rsp_bufs[0].ptr = bufs[2].ptr; - rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup OP context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.src1 = req->src1; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.dst.data = (uint32_t) bufs[2].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_ssm_conv(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_cumsum_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We've written to the output buffer, 
we'd also need to flush it - rsp_bufs[0].fd = bufs[1].fd; - rsp_bufs[0].ptr = bufs[1].ptr; - rsp_bufs[0].offset = bufs[1].offset; - rsp_bufs[0].size = bufs[1].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.dst.data = (uint32_t) bufs[1].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_cumsum(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_activations_req(struct htp_context * ctx, - struct htp_general_req * req, - struct dspqueue_buffer * bufs, - uint32_t n_bufs) { - struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; - - int write_idx = (n_bufs == 3) ? 
2 : 1; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[write_idx].fd; - rsp_bufs[0].ptr = bufs[write_idx].ptr; - rsp_bufs[0].offset = bufs[write_idx].offset; - rsp_bufs[0].size = bufs[write_idx].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - if (3 == n_bufs) { - octx.src1 = req->src1; - } - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - if (3 == n_bufs) { - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.dst.data = (uint32_t) bufs[2].ptr; - } else { - octx.dst.data = (uint32_t) bufs[1].ptr; - } - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - if (octx.op == HTP_OP_SOFTMAX) { - rsp_status = op_softmax(&octx); - } else { - rsp_status = op_activations(&octx); - } - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_rope_req(struct htp_context * ctx, - struct htp_general_req * req, - struct dspqueue_buffer * bufs, - uint32_t n_bufs) { - struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; - - int write_idx = n_bufs - 1; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[write_idx].fd; - rsp_bufs[0].ptr = bufs[write_idx].ptr; - rsp_bufs[0].offset = bufs[write_idx].offset; - rsp_bufs[0].size = bufs[write_idx].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 
0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.src1 = req->src1; - if (4 == n_bufs) { - octx.src2 = req->src2; - } - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - if (4 == n_bufs) { - octx.src2.data = (uint32_t) bufs[2].ptr; - octx.dst.data = (uint32_t) bufs[3].ptr; - } else { - octx.dst.data = (uint32_t) bufs[2].ptr; - } - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_rope(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { - struct dspqueue_buffer rsp_bufs[1]; - - // We had written to the output buffer, we'd also need to flush it - rsp_bufs[0].fd = bufs[2].fd; - rsp_bufs[0].ptr = bufs[2].ptr; - rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - // Setup Op context - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; - octx.src1 = req->src1; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.dst.data = (uint32_t) bufs[2].ptr; - octx.n_threads = ctx->n_threads; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_set_rows(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - 
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); -} - -static void proc_flash_attn_ext_req(struct htp_context * ctx, - struct htp_general_req * req, - struct dspqueue_buffer * bufs, - uint32_t n_bufs) { - // Setup Op context - struct htp_ops_context octx; - memset(&octx, 0, sizeof(octx)); - - octx.ctx = ctx; - octx.n_threads = ctx->n_threads; - - octx.src0 = req->src0; - octx.src1 = req->src1; - octx.src2 = req->src2; - octx.src3 = req->src3; - octx.src4 = req->src4; - octx.dst = req->dst; - octx.flags = req->flags; - octx.op = req->op; - - memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); - - // Update data pointers - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t) bufs[1].ptr; - octx.src2.data = (uint32_t) bufs[2].ptr; - - int last_buf = 3; - - if (octx.src3.ne[0]) { - octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid - } - - if (octx.src4.ne[0]) { - octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid - } - - octx.dst.data = (uint32_t) bufs[last_buf].ptr; - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - rsp_status = op_flash_attn_ext(&octx); - vtcm_release(ctx); - } - - profile_stop(&prof); - - struct dspqueue_buffer rsp_buf = bufs[last_buf]; - rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU - - send_htp_rsp(ctx, req->op, rsp_status, &bufs[last_buf], 1, &prof); -} - -#ifdef HTP_HAS_HMX -// --------------------------------------------------------------------------- -// HMX operation wrappers — self-contained, bypass htp_ops_context / htp_spad. -// VTCM, DMA and thread dispatch are managed inside the HMX kernels. 
-// --------------------------------------------------------------------------- - -static void proc_hmx_matmul_req(struct htp_context * ctx, - struct htp_general_req * req, - struct dspqueue_buffer * bufs, - size_t n_bufs) { - // HMX weight tile requires N to be 32-aligned. - if (req->src0.ne[1] % 32 != 0) { - proc_matmul_req(ctx, req, bufs, n_bufs); - return; - } - - const bool is_batched = (req->src0.ne[2] * req->src0.ne[3] > 1 || - req->src1.ne[2] * req->src1.ne[3] > 1); - - // Quantised HMX kernels only handle flat 2D matmul (host already rejects - // batched quantised, but guard here too). F16 batched matmul is handled - // by the dedicated wrapper in hmx-matmul-ops.c. - if (is_batched && - req->src0.type != HTP_TYPE_F16) { - proc_matmul_req(ctx, req, bufs, n_bufs); - return; - } - - // HMX assumes contiguous row-major layout. Fall back for permuted - // tensors where strides are non-monotonic (e.g. transposed KV cache). - if (req->src0.nb[0] > req->src0.nb[1] || - req->src1.nb[0] > req->src1.nb[1]) { - proc_matmul_req(ctx, req, bufs, n_bufs); - return; - } - - // M alignment: when M > 32 but not 32-aligned, we split into - // HMX (first m_hmx = M & ~31 rows) + HVX (remaining m_tail rows). - // When M <= 32 and not 32-aligned, fall back entirely to HVX. - const int m_total = (int) req->src1.ne[1]; - const int m_tail = m_total % 32; - const int m_hmx = m_total - m_tail; - - if (m_hmx == 0) { - proc_matmul_req(ctx, req, bufs, n_bufs); - return; - } - - // HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights. - // Other types fall back to HVX. - { - uint32_t wtype = req->src0.type; - if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && - wtype != HTP_TYPE_MXFP4) { - proc_matmul_req(ctx, req, bufs, n_bufs); - return; - } - // Quantised HMX path requires K aligned to 256 (x4x2 super-block). - // F16 HMX path requires K aligned to 32 (tile width). 
- if (wtype != HTP_TYPE_F16 && req->src0.ne[0] % 256 != 0) { - proc_matmul_req(ctx, req, bufs, n_bufs); - return; - } - if (wtype == HTP_TYPE_F16 && req->src0.ne[0] % 32 != 0) { - proc_matmul_req(ctx, req, bufs, n_bufs); - return; +static inline bool reuse_buf(struct htp_context *ctx, uint32_t *m_reuse, struct htp_buf_desc *b) { + b->base = NULL; + + for (uint32_t i=0; immap + i; + if (m->size && m->fd == b->fd) { + b->base = m->base; + *m_reuse |= (1 << i); + return true; } } - (void) n_bufs; + return false; +} - struct dspqueue_buffer rsp_bufs[1]; - rsp_bufs[0].fd = bufs[2].fd; - rsp_bufs[0].ptr = bufs[2].ptr; - rsp_bufs[0].size = bufs[2].size; - rsp_bufs[0].offset = bufs[2].offset; - rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); - - // src0 = weights, src1 = activation, dst = output - void * wgt = (void *) bufs[0].ptr; - float * act = (float *) bufs[1].ptr; - float * dst = (float *) bufs[2].ptr; - - int k = (int) req->src0.ne[0]; // inner dimension - int n = (int) req->src0.ne[1]; // weight columns - - - struct profile_data prof; - profile_start(&prof); - - uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; - - // --- Phase 1: HMX on the first m_hmx (32-aligned) rows --- - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - int ret = -1; - - const int ne02 = (int) req->src0.ne[2]; - const int ne03 = (int) req->src0.ne[3]; - const int ne12 = (int) req->src1.ne[2]; - const int ne13 = (int) req->src1.ne[3]; - // Row strides in elements. For compact tensors these equal k; for - // permuted attention views they can be larger, so pass the real stride. 
- const int act_stride = (int)(req->src1.nb[1] / sizeof(float)); - const int weight_stride = (int)(req->src0.nb[1] / sizeof(__fp16)); - - switch (req->src0.type) { - case HTP_TYPE_F16: - if (is_batched) { - hmx_matmul_w16a32_batched_params_t batch_params = { - .dst = dst, - .activation = act, - .permuted_weight = (const __fp16 *) wgt, - .m = m_hmx, - .k = k, - .n = n, - .act_stride = act_stride, - .weight_stride = weight_stride, - .dst_stride = (int)(req->dst.nb[1] / sizeof(float)), - .ne02 = ne02, - .ne03 = ne03, - .ne12 = ne12, - .ne13 = ne13, - .src0_nb2 = req->src0.nb[2], - .src0_nb3 = req->src0.nb[3], - .src1_nb2 = req->src1.nb[2], - .src1_nb3 = req->src1.nb[3], - .dst_nb2 = req->dst.nb[2], - .dst_nb3 = req->dst.nb[3], - }; - ret = hmx_mat_mul_permuted_w16a32_batched(ctx, &batch_params); - } else { - ret = hmx_mat_mul_permuted_w16a32(ctx, dst, act, - (const __fp16 *) wgt, - m_hmx, k, n, - act_stride, - weight_stride); - } - break; - default: - ret = hmx_mat_mul_permuted_qk_0_d16a32(ctx, dst, act, - (const uint8_t *) wgt, - m_hmx, k, n, (int) req->src0.type); - break; - } - - if (ret == 0) { - rsp_status = HTP_STATUS_OK; - } else { - FARF(HIGH, "HMX matmul failed (ret=%d), falling back to HVX", ret); - vtcm_release(ctx); - req->flags &= ~HTP_OPFLAGS_SKIP_QUANTIZE; - proc_matmul_req(ctx, req, bufs, n_bufs); - return; - } - vtcm_release(ctx); +static inline void drop_mmap(struct htp_context *ctx, struct htp_mmap *m) { + if (m->size && !m->pinned) { + FARF(HIGH, "unmap : fd %u base %p size %u pinned %u", m->fd, (void*) m->base, (uint32_t) m->size, m->pinned); + HAP_munmap2((void *) m->base, m->size); + m->size = 0; + m->base = 0; + m->fd = -1; } +} - // --- Phase 2: HVX on the remaining m_tail rows --- - if (m_tail > 0 && rsp_status == HTP_STATUS_OK) { - struct htp_ops_context octx = { 0 }; - octx.ctx = ctx; - octx.src0 = req->src0; // weights: unchanged - octx.src1 = req->src1; - octx.src1.ne[1] = m_tail; // only tail rows - octx.dst = req->dst; - octx.dst.ne[1] 
= m_tail; // only tail rows - // Always re-quantize tail src1: HMX Phase 1 overwrites VTCM, - // so any previously cached quantized data (SKIP_QUANTIZE pipeline) - // is invalid. - octx.flags = req->flags & ~HTP_OPFLAGS_SKIP_QUANTIZE; - octx.op = req->op; - octx.n_threads = ctx->n_threads; +static inline void mmap_buf(struct htp_context *ctx, struct htp_buf_desc *b) { + if (b->base) return; // already mapped - // Offset activation and dst pointers past the HMX-processed rows. - // Use nb[1] (row stride in bytes) to compute the byte offset. - octx.src0.data = (uint32_t) bufs[0].ptr; - octx.src1.data = (uint32_t)((uint8_t *) bufs[1].ptr + (size_t) m_hmx * req->src1.nb[1]); - octx.dst.data = (uint32_t)((uint8_t *) bufs[2].ptr + (size_t) m_hmx * req->dst.nb[1]); - - FARF(HIGH, "proc_hmx_matmul: HVX tail m_tail=%d act=%p dst=%p", - m_tail, (void *)(uintptr_t) octx.src1.data, (void *)(uintptr_t) octx.dst.data); - - if (vtcm_acquire(ctx) == AEE_SUCCESS) { - uint32_t hvx_ret = op_matmul(&octx); - vtcm_release(ctx); - if (hvx_ret != HTP_STATUS_OK) { - FARF(ERROR, "HVX tail matmul failed (ret=%u)", hvx_ret); - rsp_status = HTP_STATUS_INTERNAL_ERR; + // find unused mapping + for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) { + struct htp_mmap *m = &ctx->mmap[i]; + if (!m->size) { + void *va = HAP_mmap2(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0); + if (va == (void*)-1) { + FARF(ERROR, "mmap failed : va %p fd %u size %u", va, b->fd, (uint32_t) b->size); + abort(); // can't do much else at this point } - } else { - rsp_status = HTP_STATUS_INTERNAL_ERR; + + m->base = b->base = (uint64_t) va; + m->fd = b->fd; + m->size = b->size; + m->pinned = 0; + + FARF(HIGH, "mmap : fd %u base %p size %u pinned %u", m->fd, (void*) m->base, (uint32_t) m->size, m->pinned); + return; + } + } +} + +static void prep_op_bufs(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t n_bufs) { + uint32_t m_reuse = 0; // mmap reuse mask (index from ctx->mmap array) + uint32_t b_reuse = 0; 
// buf reuse count + + size_t m_vmem = 0; // mapped vmem + size_t e_vmem = 0; // extra vmem + + // See what we can reuse + for (uint32_t i=0; i < n_bufs; i++) { + struct htp_buf_desc *b = bufs + i; + if (reuse_buf(ctx, &m_reuse, b)) { b_reuse++; } else { e_vmem += b->size; } + FARF(HIGH, "prep-buf #%u : pass0 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags); + } + + if (b_reuse == n_bufs) return; // all bufs reuse existing mappings + + // See how much vmem we have mmaped right now + for (uint32_t i=0; immap[i].size; } + + FARF(HIGH, "prep-bufs : pass1 mmap-vmem %zu extra-vmem %zu n-bufs %u b-reuse %u", m_vmem, e_vmem, n_bufs, b_reuse); + + if ((m_vmem + e_vmem) > HTP_OP_MAX_VMEM) { + // Drop unused mappings + for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) { + bool used = m_reuse & (1<mmap + i); } } } - profile_stop(&prof); - - send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof); + // Create missing mappings + for (uint32_t i=0; i < n_bufs; i++) { + struct htp_buf_desc *b = bufs + i; + mmap_buf(ctx, b); + FARF(HIGH, "prep-buf #%u : pass1 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags); + } } -#endif // HTP_HAS_HMX +static void prep_tensor(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t idx, struct htp_tensor *t) { + uint32_t offset = t->data; + uint32_t size = t->size; + uint32_t bi = t->bi; + + t->data = bufs[bi].base + offset; // update data to the actual pointer + + FARF(HIGH, "prep-tensor #%u: bi %u offset %u size %u data %p : %u:%u:%u:%u", idx, t->bi, offset, t->size, (void*) t->data, + t->ne[0], t->ne[1], t->ne[3], t->ne[3]); +} + +static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, struct htp_tensor *tens, uint32_t n_tens) { + for (uint32_t i=0; i < n_tens; i++) { + prep_tensor(ctx, bufs, i, tens + i); + } +} + +static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) { + 
memcpy(octx->op_params, op->params, sizeof(octx->op_params)); + octx->flags = op->flags; + octx->op = op->opcode; + + FARF(HIGH, "proc-op #%u: opcode %u flags 0x%x", idx, octx->op, octx->flags); + + // Prep input tensors + for (uint32_t i=0; isrc[i] == 0xffff ? NULL : tens + op->src[i]; + + octx->src[i] = src; + if (!src) continue; + + if (!(src->flags & HTP_TENSOR_FLUSHED) && (src->flags & HTP_TENSOR_COMPUTE)) { + // flush compute buffers on input + hex_l2flush((void *) src->data, src->size); + } + + FARF(HIGH, "prep-src #%u: data %p size %u : %u:%u:%u:%u", op->src[i], (void*) src->data, src->size, + src->ne[0], src->ne[1], src->ne[3], src->ne[3]); + } + + // Prep output tensor + struct htp_tensor *dst = tens + op->dst; + + octx->dst = dst; + + FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size, + dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]); + + (void) execute_op(octx); + + // flush buffers on output + hex_l2flush((void *) dst->data, dst->size); + dst->flags |= HTP_TENSOR_FLUSHED; + + FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size, + dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]); +} + +#define DSPQUEUE_POLL_TIMEOUT_USEC 100 +#define DSPQUEUE_POLL_COUNT 100 static void htp_packet_callback(dspqueue_t queue, int error, void * context) { struct htp_context * ctx = (struct htp_context *) context; - // Repeatedly read packets from the queue until it's empty. We don't - // necessarily get a separate callback for each packet, and new packets - // may arrive while we're processing the previous one. This ensures we - // keep the DSP busy as much as possible and avoid waiting for the CPU. 
+ int err; - while (1) { - struct htp_general_req req; - uint32_t req_size; + uint32_t poll_count = DSPQUEUE_POLL_COUNT; - struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS]; - uint32_t n_bufs; - uint32_t flags; + vtcm_acquire(ctx); - // Read packet from queue - int err = dspqueue_read_noblock(queue, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(req), // Max message length - &req_size, // Message length - (uint8_t *) &req); // Message + while (!ctx->vtcm_needs_release) { + struct htp_opbatch_req req; + uint32_t r_size = sizeof(req); + struct dspqueue_buffer dbuf; + uint32_t n_dbufs = 1; + uint32_t flags = 0; + + err = dspqueue_read_noblock(queue, &flags, n_dbufs, &n_dbufs, &dbuf, r_size, &r_size, (uint8_t *) &req); if (err == AEE_EWOULDBLOCK) { - // Consumed all packets available for now - return; + if (--poll_count) { + qurt_sleep(DSPQUEUE_POLL_TIMEOUT_USEC); + continue; + } + break; } if (err != 0) { FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err); - return; + break; } - if (req_size != sizeof(req)) { - FARF(ERROR, "Invalid request size"); + if (r_size < sizeof(req) || n_dbufs != 1) { + FARF(ERROR, "invalid request : size %u n-dbufs %u", r_size, n_dbufs); continue; } - if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) { - // Host wants early notification - dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0); + const uint32_t n_bufs = req.n_bufs; + const uint32_t n_tens = req.n_tensors; + const uint32_t n_ops = req.n_ops; + + const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs; + const uint32_t t_size = sizeof(struct htp_tensor) * n_tens; + const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops; + + if (dbuf.size < b_size + t_size + o_size) { + FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size); + break; } - // Process packet based on its message type - switch (req.op) { - case HTP_OP_MUL_MAT: - if (n_bufs != 3) { - 
FARF(ERROR, "Bad matmul-req buffer list"); - continue; - } -#ifdef HTP_HAS_HMX - if (ctx->hmx_enabled) { - proc_hmx_matmul_req(ctx, &req, bufs, n_bufs); - } else -#endif - { - proc_matmul_req(ctx, &req, bufs, n_bufs); - } - break; + // Reset poll count for valid requests + poll_count = DSPQUEUE_POLL_COUNT; - case HTP_OP_MUL_MAT_ID: - if (n_bufs != 4) { - FARF(ERROR, "Bad matmul-id-req buffer list"); - continue; - } - proc_matmul_id_req(ctx, &req, bufs, n_bufs); - break; + uint8_t * m_ptr = dbuf.ptr; + struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size; + struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size; + struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; - case HTP_OP_MUL: - case HTP_OP_ADD: - case HTP_OP_SUB: - case HTP_OP_DIV: - if (n_bufs != 3) { - FARF(ERROR, "Bad binary-req buffer list"); - continue; - } - proc_binary_req(ctx, &req, bufs); - break; + FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", + n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size); - case HTP_OP_RMS_NORM: - case HTP_OP_SCALE: - if (n_bufs != 2) { - FARF(ERROR, "Bad unary-req buffer list"); - continue; - } + prep_op_bufs(ctx, bufs, n_bufs); + prep_tensors(ctx, bufs, tens, n_tens); - proc_unary_req(ctx, &req, bufs); - break; + struct htp_ops_context *octx = &ctx->octx; + memset(octx, 0, sizeof(*octx)); + octx->n_threads = ctx->n_threads; + octx->ctx = ctx; - case HTP_OP_SQR: - case HTP_OP_SQRT: - case HTP_OP_UNARY_NEG: - case HTP_OP_UNARY_EXP: - case HTP_OP_UNARY_SIGMOID: - case HTP_OP_UNARY_SOFTPLUS: - if (n_bufs != 2) { - FARF(ERROR, "Bad unary-req buffer list"); - continue; - } + for (uint32_t i=0; i < n_ops; i++) { + struct profile_data prof; + profile_start(&prof); - proc_unary_req(ctx, &req, bufs); - break; + proc_op_req(octx, tens, i, &ops[i]); - case HTP_OP_SUM_ROWS: - if (n_bufs != 2) { - FARF(ERROR, "Bad unary-req buffer list"); - continue; - } + profile_stop(&prof); + 
ops[i].prof_usecs = prof.usecs; + ops[i].prof_cycles = prof.cycles; + ops[i].prof_pkts = prof.pkts; + } - proc_sum_rows_req(ctx, &req, bufs); - break; + // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0); - case HTP_OP_UNARY_SILU: - case HTP_OP_UNARY_GELU: - if (n_bufs != 2) { - FARF(ERROR, "Bad act-req buffer list"); - continue; - } - proc_activations_req(ctx, &req, bufs, n_bufs); - break; + struct htp_opbatch_rsp rsp; + rsp.status = HTP_STATUS_OK; // FIXME - case HTP_OP_GLU_SWIGLU: - case HTP_OP_GLU_SWIGLU_OAI: - case HTP_OP_SOFTMAX: - case HTP_OP_GLU_GEGLU: - if ((n_bufs != 2) && (n_bufs != 3)) { - FARF(ERROR, "Bad act-req buffer list"); - continue; - } - proc_activations_req(ctx, &req, bufs, n_bufs); - break; - - case HTP_OP_ADD_ID: - if (n_bufs != 4) { - FARF(ERROR, "Bad add-id-req buffer list"); - continue; - } - proc_add_id_req(ctx, &req, bufs); - break; - - case HTP_OP_ROPE: - if ((n_bufs != 3) && (n_bufs != 4)) { - FARF(ERROR, "Bad rope-req buffer list"); - continue; - } - proc_rope_req(ctx, &req, bufs, n_bufs); - break; - - case HTP_OP_FLASH_ATTN_EXT: - if (!(n_bufs >= 4 && n_bufs <= 6)) { - FARF(ERROR, "Bad flash-attn-ext-req buffer list"); - continue; - } - proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs); - break; - - case HTP_OP_SET_ROWS: - if (n_bufs != 3) { - FARF(ERROR, "Bad set-rows-req buffer list"); - continue; - } - proc_set_rows_req(ctx, &req, bufs); - break; - - case HTP_OP_GET_ROWS: - if (n_bufs != 3) { - FARF(ERROR, "Bad get-rows-req buffer list"); - continue; - } - proc_get_rows_req(ctx, &req, bufs); - break; - - case HTP_OP_CPY: - if (n_bufs != 2) { - FARF(ERROR, "Bad cpy-req buffer list"); - continue; - } - proc_cpy_req(ctx, &req, bufs); - break; - - case HTP_OP_REPEAT: - if (n_bufs != 2) { - FARF(ERROR, "Bad repeat-req buffer list"); - continue; - } - proc_repeat_req(ctx, &req, bufs); - break; - - case HTP_OP_ARGSORT: - if (n_bufs != 2) { - FARF(ERROR, "Bad argsort-req buffer list"); - continue; - } - proc_argsort_req(ctx, &req, 
bufs); - break; - - case HTP_OP_SSM_CONV: - if (n_bufs != 3) { - FARF(ERROR, "Bad ssm-conv-req buffer list"); - continue; - } - proc_ssm_conv_req(ctx, &req, bufs); - break; - - case HTP_OP_CUMSUM: - if (n_bufs != 2) { - FARF(ERROR, "Bad cumsum-req buffer list"); - continue; - } - proc_cumsum_req(ctx, &req, bufs); - break; - - default: - FARF(ERROR, "Unknown Op %u", req.op); - break; + dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; + err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE); + if (err != 0) { + FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err); + break; } } + + vtcm_release(ctx); } diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 24b7bad6..bac06693 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -16,8 +16,9 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" #include "htp-ops.h" +#include "htp-ops.h" +#include "hmx-ops.h" #define MM_SPAD_SRC0_NROWS 16 #define MM_SPAD_SRC1_NROWS 16 @@ -1897,11 +1898,11 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void * hvx_vec_store_u(&s[0], 4, rsum); } -#define htp_matmul_tensors_preamble \ - struct htp_tensor * restrict src0 = &octx->src0; \ - struct htp_tensor * restrict src1 = &octx->src1; \ - struct htp_tensor * restrict src2 = &octx->src2; \ - struct htp_tensor * restrict dst = &octx->dst; \ +#define htp_matmul_tensors_preamble \ + const struct htp_tensor * restrict src0 = octx->src[0]; \ + const struct htp_tensor * restrict src1 = octx->src[1]; \ + const struct htp_tensor * restrict src2 = octx->src[2]; \ + const struct htp_tensor * restrict dst = octx->dst; \ struct htp_spad * restrict src0_spad = &octx->src0_spad; \ struct htp_spad * restrict src1_spad = &octx->src1_spad; \ struct htp_spad * restrict dst_spad = &octx->dst_spad; \ @@ 
-2223,8 +2224,8 @@ struct mmid_row_mapping { static void matmul_id(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; - struct htp_tensor * restrict ids = &octx->src2; - struct htp_spad * restrict src2_spad = &octx->src2_spad; + const struct htp_tensor * restrict ids = octx->src[2]; + struct htp_spad * restrict src2_spad = &octx->src2_spad; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); @@ -2342,8 +2343,8 @@ static void matmul_id(unsigned int nth, unsigned int ith, void * data) { static void matvec_id(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; - struct htp_tensor * restrict ids = &octx->src2; - struct htp_spad * restrict src2_spad = &octx->src2_spad; + const struct htp_tensor * restrict ids = octx->src[2]; + struct htp_spad * restrict src2_spad = &octx->src2_spad; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); @@ -2612,7 +2613,7 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; - const struct htp_tensor * src = &octx->src1; + const struct htp_tensor * src = octx->src[1]; uint8_t * restrict dst = octx->src1_spad.data; struct htp_spad * spad = &octx->src0_spad; uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; @@ -2659,7 +2660,7 @@ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) { struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; - const struct htp_tensor * src = &octx->src1; + const struct htp_tensor * src = octx->src[1]; uint8_t * restrict dst = octx->src1_spad.data; uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; uint32_t dst_stride = octx->src1_spad.stride; @@ -2701,7 +2702,7 @@ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) { struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; - const struct htp_tensor * src = &octx->src1; + const struct 
htp_tensor * src = octx->src[1]; uint8_t * restrict dst = octx->src1_spad.data; uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; uint32_t dst_stride = octx->src1_spad.stride; @@ -2800,7 +2801,7 @@ static void htp_mminit_spad(struct htp_ops_context * octx, octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; } -int op_matmul(struct htp_ops_context * octx) { +static int op_matmul_hvx(struct htp_ops_context * octx) { htp_matmul_tensors_preamble; struct htp_matmul_context mmctx_struct = {0}; @@ -2824,7 +2825,7 @@ int op_matmul(struct htp_ops_context * octx) { worker_callback_t quant_job_func; worker_callback_t matmul_job_func = src1_nrows > 1 ? matmul_2d : matvec_2d; - bool need_quant = !(octx->flags & HTP_OPFLAGS_SKIP_QUANTIZE); + bool need_quant = true; if (src0->type == HTP_TYPE_F16) { // Try optimized f16-f16 path first (src1 in VTCM) @@ -2838,7 +2839,7 @@ int op_matmul(struct htp_ops_context * octx) { // Default matmul implementation does not support multi-batch src0 (N-vs-N broadcasting). // It only supports 1-vs-N broadcasting (src0 is 2D) or standard 2D matmul. const bool is_batched = (ne02 > 1) || (ne03 > 1); - const bool is_permuted = htp_is_permuted(&octx->src0) || htp_is_permuted(&octx->src1); + const bool is_permuted = htp_is_permuted(octx->src[0]) || htp_is_permuted(octx->src[1]); if (!is_batched && !is_permuted && f16_total_size <= octx->ctx->vtcm_size) { // Optimized path @@ -2915,34 +2916,172 @@ int op_matmul(struct htp_ops_context * octx) { return HTP_STATUS_VTCM_TOO_SMALL; } - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + // Place src1 spad first. 
We use it for dyn.quant and may reuse between ops + octx->src1_spad.data = octx->ctx->vtcm_base; + octx->src0_spad.data = octx->src1_spad.data + octx->src1_spad.size; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + + octx->src1_spad.src = (src1 == octx->src1_spad.src) ? src1 : NULL; + octx->src0_spad.src = NULL; + octx->dst_spad.src = NULL; octx->src0_spad.stride = src0_row_size_padded; octx->src1_spad.stride = src1_row_size; - if (need_quant) { + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) + return HTP_STATUS_OK; + + if (need_quant && !octx->src1_spad.src) { const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); - // Cache where src1 was written so subsequent SKIP_QUANTIZE ops can find it - octx->ctx->prev_src1_spad = octx->src1_spad.data; - } else { - // SKIP_QUANTIZE: Q8 data lives at the address written by the previous - // quantize pass. The current op may have a different src0 size (e.g. - // IQ4_NL vs MXFP4), so src1_spad.data computed above could be wrong. - octx->src1_spad.data = octx->ctx->prev_src1_spad; + octx->src1_spad.src = src1; } - if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { - const uint32_t n_matmul_jobs = octx->n_threads; - worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs); - } + const uint32_t n_matmul_jobs = octx->n_threads; + worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs); return HTP_STATUS_OK; } +int op_matmul(struct htp_ops_context * octx) { + htp_matmul_tensors_preamble; + +#ifndef HTP_HAS_HMX + return op_matmul_hvx(octx); +#else + if (!octx->ctx->hmx_enabled) { + return op_matmul_hvx(octx); + } + + // HMX weight tile requires N to be 32-aligned. + if (src0->ne[1] % 32 != 0) { + return op_matmul_hvx(octx); + } + + // HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights. 
+ // Other types fall back to HVX. + uint32_t wtype = src0->type; + if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) { + return op_matmul_hvx(octx); + } + + // Quantised HMX path requires K aligned to 256 (x4x2 super-block). + // F16 HMX path requires K aligned to 32 (tile width). + if (wtype != HTP_TYPE_F16 && src0->ne[0] % 256 != 0) { + return op_matmul_hvx(octx); + } + + if (wtype == HTP_TYPE_F16 && src0->ne[0] % 32 != 0) { + return op_matmul_hvx(octx); + } + + const bool is_batched = (src0->ne[2] * src0->ne[3] > 1 || src1->ne[2] * src1->ne[3] > 1); + + // Quantised HMX kernels only handle flat 2D matmul (host already rejects + // batched quantised, but guard here too). F16 batched matmul is handled + // by the dedicated wrapper in hmx-matmul-ops.c. + if (is_batched && src0->type != HTP_TYPE_F16) { + return op_matmul_hvx(octx); + } + + // HMX assumes contiguous row-major layout. Fall back for permuted + // tensors where strides are non-monotonic (e.g. transposed KV cache). + if (src0->nb[0] > src0->nb[1] || src1->nb[0] > src1->nb[1]) { + return op_matmul_hvx(octx); + } + + // M alignment: when M > 32 but not 32-aligned, we split into + // HMX (first m_hmx = M & ~31 rows) + HVX (remaining m_tail rows). + // When M <= 32 and not 32-aligned, fall back entirely to HVX. + const int m_total = (int) src1->ne[1]; + const int m_tail = m_total % 32; + const int m_hmx = m_total - m_tail; + + if (m_hmx == 0) { + return op_matmul_hvx(octx); + } + + // Always re-quantize src1 since HMX kernel overwrites vtcm/spad, + // so any previously cached quantized data is invalid. + octx->src1_spad.src = NULL; + + int k = (int) src0->ne[0]; // inner dimension + int n = (int) src0->ne[1]; // weight columns + + // --- Phase 1: HMX on the first m_hmx (32-aligned) rows --- + int ret = -1; + + // Row strides in elements. 
For compact tensors these equal k; for + // permuted attention views they can be larger, so pass the real stride. + const int act_stride = (int)(src1->nb[1] / sizeof(float)); + const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16)); + + if (src0->type == HTP_TYPE_F16) { + if (is_batched) { + hmx_matmul_w16a32_batched_params_t batch_params = { + .dst = (float *) dst->data, + .activation = (float *) src1->data, + .permuted_weight = (const __fp16 *) src0->data, + .m = m_hmx, + .k = k, + .n = n, + .act_stride = act_stride, + .weight_stride = wgt_stride, + .dst_stride = (int) (dst->nb[1] / sizeof(float)), + .ne02 = ne02, + .ne03 = ne03, + .ne12 = ne12, + .ne13 = ne13, + .src0_nb2 = src0->nb[2], + .src0_nb3 = src0->nb[3], + .src1_nb2 = src1->nb[2], + .src1_nb3 = src1->nb[3], + .dst_nb2 = dst->nb[2], + .dst_nb3 = dst->nb[3], + }; + ret = hmx_mat_mul_permuted_w16a32_batched(octx->ctx, &batch_params); + } else { + ret = hmx_mat_mul_permuted_w16a32(octx->ctx, + (float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data, + m_hmx, k, n, act_stride, wgt_stride); + } + } else { + ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx, + (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data, + m_hmx, k, n, (int) src0->type); + } + + if (ret != 0) { + FARF(HIGH, "HMX matmul failed (ret=%d), falling back to HVX", ret); + return op_matmul_hvx(octx); // call HVX directly: re-entering op_matmul() would retry the same HMX call and recurse + } + + // --- Phase 2: HVX on the remaining m_tail rows --- + if (m_tail > 0) { + // copy of src1 and dst + struct htp_tensor src1_tail = *src1; + struct htp_tensor dst_tail = *dst; + + src1_tail.ne[1] = m_tail; // only tail rows + dst_tail.ne[1] = m_tail; // only tail rows + + // Offset activation and dst pointers past the HMX-processed rows. + // Use nb[1] (row stride in bytes) to compute the byte offset.
+ src1_tail.data += (uint32_t) m_hmx * src1->nb[1]; + dst_tail.data += (uint32_t) m_hmx * dst->nb[1]; + + octx->src[1] = &src1_tail; + octx->dst = &dst_tail; + + FARF(HIGH, "hmx-matmul: HVX tail m_tail %d src1 %p dst %p", m_tail, (void *) src1_tail.data, (void *) dst_tail.data); + return op_matmul_hvx(octx); + } + + return 0; +#endif // HTP_HAS_HMX +} + int op_matmul_id(struct htp_ops_context * octx) { htp_matmul_tensors_preamble; @@ -2950,7 +3089,7 @@ int op_matmul_id(struct htp_ops_context * octx) { struct htp_matmul_context * mmctx = &mmctx_struct; mmctx->octx = octx; - struct htp_tensor * restrict ids = &octx->src2; + const struct htp_tensor * restrict ids = octx->src[2]; const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; @@ -3003,11 +3142,17 @@ int op_matmul_id(struct htp_ops_context * octx) { return HTP_STATUS_VTCM_TOO_SMALL; } - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size; + // Place src1 spad first. We use it for dyn.quant and may reuse in subseq ops. + octx->src1_spad.data = octx->ctx->vtcm_base; + octx->src0_spad.data = octx->src1_spad.data + octx->src1_spad.size; + octx->src2_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.data = octx->src2_spad.data + octx->src2_spad.size; + octx->src1_spad.src = (src1 == octx->src1_spad.src) ? 
src1 : NULL; + octx->src0_spad.src = NULL; + octx->src2_spad.src = NULL; + octx->dst_spad.src = NULL; + octx->src0_spad.stride = src0_row_size_padded; octx->src1_spad.stride = src1_row_size; @@ -3031,20 +3176,18 @@ int op_matmul_id(struct htp_ops_context * octx) { } } - // Setup worker pool callbacks - if (!(octx->flags & HTP_OPFLAGS_SKIP_QUANTIZE)) { + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) + return HTP_STATUS_OK; + + if (octx->src1_spad.src != src1) { const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); - octx->ctx->prev_src1_spad = octx->src1_spad.data; - } else { - octx->src1_spad.data = octx->ctx->prev_src1_spad; + octx->src1_spad.src = src1; } - if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { - const uint32_t n_matmul_jobs = octx->n_threads; - worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs); - } + const uint32_t n_matmul_jobs = octx->n_threads; + worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs); return HTP_STATUS_OK; } diff --git a/ggml/src/ggml-hexagon/htp/repeat-ops.c b/ggml/src/ggml-hexagon/htp/repeat-ops.c index 5db06c92..a6f2f0ed 100644 --- a/ggml/src/ggml-hexagon/htp/repeat-ops.c +++ b/ggml/src/ggml-hexagon/htp/repeat-ops.c @@ -12,7 +12,7 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" struct htp_repeat_context { @@ -32,8 +32,8 @@ struct htp_repeat_context { static void repeat_job_per_thread(unsigned int nth, unsigned int ith, void * data) { const struct htp_repeat_context * rctx = (const struct htp_repeat_context *) data; struct htp_ops_context * octx = rctx->octx; - const struct htp_tensor * src = &octx->src0; - const struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src = octx->src[0]; + 
const struct htp_tensor * dst = octx->dst; const uint32_t ne00 = src->ne[0]; const uint32_t ne01 = src->ne[1]; @@ -98,8 +98,8 @@ static void repeat_job_per_thread(unsigned int nth, unsigned int ith, void * dat } int op_repeat(struct htp_ops_context * octx) { - const struct htp_tensor * src0 = &octx->src0; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * dst = octx->dst; // Validate that dst dims are multiples of src dims if (dst->ne[0] % src0->ne[0] != 0 || diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ecedadb0..1d8b0796 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -15,7 +15,7 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" // Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h @@ -253,10 +253,10 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { struct htp_rope_context * rctx = (struct htp_rope_context *) data; struct htp_ops_context * octx = rctx->octx; - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - const struct htp_tensor * src2 = &octx->src2; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * src2 = octx->src[2]; + const struct htp_tensor * dst = octx->dst; htp_rope_preamble; @@ -284,7 +284,7 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { dma_queue * dma_queue = octx->ctx->dma[ith]; const int32_t * pos = (const int32_t *) src1->data; - const float * freq_factors = src2->data ? (const float *) src2->data : NULL; + const float * freq_factors = src2 ? 
(const float *) src2->data : NULL; uint32_t ir = 0; uint32_t prev_i2 = (uint32_t) -1; @@ -384,10 +384,10 @@ done: static int execute_op_rope_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - const struct htp_tensor * src2 = &octx->src2; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * src2 = octx->src[2]; + const struct htp_tensor * dst = octx->dst; const char * op_type = "rope-f32"; @@ -424,19 +424,16 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { return HTP_STATUS_VTCM_TOO_SMALL; } - // Assign sizes octx->src0_spad.size_per_thread = src0_spad_per_thread; octx->dst_spad.size_per_thread = dst_spad_per_thread; octx->src0_spad.size = n_threads * src0_spad_per_thread; octx->dst_spad.size = n_threads * dst_spad_per_thread; octx->src1_spad.size = 0; - // Assign pointers - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = NULL; - octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL; + octx->src1_spad.data = NULL; octx->src1_spad.src = NULL; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->dst_spad.src = NULL; - // Fill context struct htp_rope_context rctx; memset(&rctx, 0, sizeof(struct htp_rope_context)); @@ -483,7 +480,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { int op_rope(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; - switch (octx->src0.type) { + switch (octx->src[0]->type) { case HTP_TYPE_F32: err = execute_op_rope_f32(octx); break; diff --git a/ggml/src/ggml-hexagon/htp/set-rows-ops.c b/ggml/src/ggml-hexagon/htp/set-rows-ops.c index 4b696774..0def7b40 100644 --- a/ggml/src/ggml-hexagon/htp/set-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/set-rows-ops.c @@ 
-14,33 +14,37 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" -#define set_rows_preamble \ - const uint32_t ne00 = octx->src0.ne[0]; \ - const uint32_t ne01 = octx->src0.ne[1]; \ - const uint32_t ne02 = octx->src0.ne[2]; \ - const uint32_t ne03 = octx->src0.ne[3]; \ - \ - const uint32_t ne10 = octx->src1.ne[0]; \ - const uint32_t ne11 = octx->src1.ne[1]; \ - const uint32_t ne12 = octx->src1.ne[2]; \ - \ - const uint32_t nb01 = octx->src0.nb[1]; \ - const uint32_t nb02 = octx->src0.nb[2]; \ - const uint32_t nb03 = octx->src0.nb[3]; \ - \ - const uint32_t nb10 = octx->src1.nb[0]; \ - const uint32_t nb11 = octx->src1.nb[1]; \ - const uint32_t nb12 = octx->src1.nb[2]; \ - \ - const uint32_t nb1 = octx->dst.nb[1]; \ - const uint32_t nb2 = octx->dst.nb[2]; \ - const uint32_t nb3 = octx->dst.nb[3]; \ - \ - const uint32_t ne1 = octx->dst.ne[1]; \ - \ +#define set_rows_preamble \ + const uint32_t ne00 = octx->src[0]->ne[0]; \ + const uint32_t ne01 = octx->src[0]->ne[1]; \ + const uint32_t ne02 = octx->src[0]->ne[2]; \ + const uint32_t ne03 = octx->src[0]->ne[3]; \ + \ + const uint32_t ne10 = octx->src[1]->ne[0]; \ + const uint32_t ne11 = octx->src[1]->ne[1]; \ + const uint32_t ne12 = octx->src[1]->ne[2]; \ + const uint32_t ne13 = octx->src[1]->ne[3]; \ + \ + const uint32_t nb01 = octx->src[0]->nb[1]; \ + const uint32_t nb02 = octx->src[0]->nb[2]; \ + const uint32_t nb03 = octx->src[0]->nb[3]; \ + \ + const uint32_t nb10 = octx->src[1]->nb[0]; \ + const uint32_t nb11 = octx->src[1]->nb[1]; \ + const uint32_t nb12 = octx->src[1]->nb[2]; \ + \ + const uint32_t nb1 = octx->dst->nb[1]; \ + const uint32_t nb2 = octx->dst->nb[2]; \ + const uint32_t nb3 = octx->dst->nb[3]; \ + \ + const uint32_t ne0 = octx->dst->ne[0]; \ + const uint32_t ne1 = octx->dst->ne[1]; \ + const uint32_t ne2 = octx->dst->ne[2]; \ + const uint32_t ne3 = octx->dst->ne[3]; \ + \ const uint32_t nr = ne01; struct 
htp_set_rows_context { @@ -56,12 +60,14 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da set_rows_preamble; + uint64_t qt = HAP_perf_get_qtimer_count(); + // parallelize by rows of src0 const uint32_t dr = srctx->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr; - const bool is_i32 = (octx->src1.type == HTP_TYPE_I32); + const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32); for (uint32_t i03 = 0; i03 < ne03; ++i03) { for (uint32_t i02 = 0; i02 < ne02; ++i02) { @@ -70,7 +76,7 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11); const uint32_t i10 = i; - const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; + const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12; uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr; if (i1 >= ne1) { @@ -78,14 +84,18 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da continue; } - const uintptr_t src0_ptr = octx->src0.data + i*nb01 + i02*nb02 + i03*nb03; - const uintptr_t dst_ptr = octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3; + const uintptr_t src0_ptr = octx->src[0]->data + i*nb01 + i02*nb02 + i03*nb03; + const uintptr_t dst_ptr = octx->dst->data + i1*nb1 + i02*nb2 + i03*nb3; // copy row hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); } } } + + qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt); + FARF(HIGH, "set-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt); } static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *data) { @@ -94,12 +104,14 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da set_rows_preamble; + uint64_t 
qt = HAP_perf_get_qtimer_count(); + // parallelize by rows of src0 const uint32_t dr = srctx->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr; - const bool is_i32 = (octx->src1.type == HTP_TYPE_I32); + const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32); for (uint32_t i03 = 0; i03 < ne03; ++i03) { for (uint32_t i02 = 0; i02 < ne02; ++i02) { @@ -108,7 +120,7 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11); const uint32_t i10 = i; - const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; + const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12; uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr; if (i1 >= ne1) { @@ -116,13 +128,17 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da continue; } - const uint8_t* src0_ptr = (const uint8_t *) octx->src0.data + i*nb01 + i02*nb02 + i03*nb03; - uint8_t* dst_ptr = (uint8_t *) octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3; + const uint8_t* src0_ptr = (const uint8_t *) octx->src[0]->data + i*nb01 + i02*nb02 + i03*nb03; + uint8_t* dst_ptr = (uint8_t *) octx->dst->data + i1*nb1 + i02*nb2 + i03*nb3; hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00); } } } + + qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt); + FARF(HIGH, "set-rows-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt); } int op_set_rows(struct htp_ops_context * octx) { @@ -130,15 +146,15 @@ int op_set_rows(struct htp_ops_context * octx) { const uint32_t n_threads = MIN(nr, octx->n_threads); - if (octx->src0.type != HTP_TYPE_F32) { + if (octx->src[0]->type != HTP_TYPE_F32) { return HTP_STATUS_NO_SUPPORT; } - if (octx->dst.type != HTP_TYPE_F32 && octx->dst.type != 
HTP_TYPE_F16) { + if (octx->dst->type != HTP_TYPE_F32 && octx->dst->type != HTP_TYPE_F16) { return HTP_STATUS_NO_SUPPORT; } - if (octx->src1.type != HTP_TYPE_I32 && octx->src1.type != HTP_TYPE_I64) { + if (octx->src[1]->type != HTP_TYPE_I32 && octx->src[1]->type != HTP_TYPE_I64) { return HTP_STATUS_NO_SUPPORT; } @@ -153,7 +169,7 @@ int op_set_rows(struct htp_ops_context * octx) { srctx.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads; - switch(octx->dst.type) { + switch(octx->dst->type) { case HTP_TYPE_F32: worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f32_f32, &srctx, n_threads); break; diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c index d6356b95..d78bcc0e 100644 --- a/ggml/src/ggml-hexagon/htp/softmax-ops.c +++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c @@ -15,68 +15,89 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" -#define htp_softmax_preamble3 \ - const uint32_t ne00 = src0->ne[0]; \ - const uint32_t ne01 = src0->ne[1]; \ - const uint32_t ne02 = src0->ne[2]; \ - const uint32_t ne03 = src0->ne[3]; \ - \ - const uint32_t nb00 = src0->nb[0]; \ - const uint32_t nb01 = src0->nb[1]; \ - const uint32_t nb02 = src0->nb[2]; \ - const uint32_t nb03 = src0->nb[3]; \ - \ - const uint32_t ne10 = (src1->ne[0]) ? src1->ne[0] : 1; \ - const uint32_t ne11 = (src1->ne[0]) ? src1->ne[1] : 1; \ - const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; \ - const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; \ - \ - const uint32_t nb10 = (src1->ne[0]) ? src1->nb[0] : 1; \ - const uint32_t nb11 = (src1->ne[0]) ? src1->nb[1] : 1; \ - const uint32_t nb12 = (src1->ne[0]) ? src1->nb[2] : 1; \ - const uint32_t nb13 = (src1->ne[0]) ? 
src1->nb[3] : 1; \ - \ - const uint32_t ne0 = dst->ne[0]; \ - const uint32_t ne1 = dst->ne[1]; \ - const uint32_t ne2 = dst->ne[2]; \ - const uint32_t ne3 = dst->ne[3]; \ - \ - const uint32_t nb0 = dst->nb[0]; \ - const uint32_t nb1 = dst->nb[1]; \ - const uint32_t nb2 = dst->nb[2]; \ +#define htp_softmax_preamble3 \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t ne10 = src1 ? src1->ne[0] : 1; \ + const uint32_t ne11 = src1 ? src1->ne[1] : 1; \ + const uint32_t ne12 = src1 ? src1->ne[2] : 1; \ + const uint32_t ne13 = src1 ? src1->ne[3] : 1; \ + \ + const uint32_t nb10 = src1 ? src1->nb[0] : 1; \ + const uint32_t nb11 = src1 ? src1->nb[1] : 1; \ + const uint32_t nb12 = src1 ? src1->nb[2] : 1; \ + const uint32_t nb13 = src1 ? 
src1->nb[3] : 1; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; struct htp_softmax_context { + struct htp_ops_context * octx; + bool use_f16; bool use_src1; + uint32_t n_head; uint32_t n_head_log2; - float scale; - float max_bias; - float m0; - float m1; + float scale; + float max_bias; + float m0; + float m1; - uint32_t src0_nrows_per_thread; struct fastdiv_values fastdiv_ne01; struct fastdiv_values fastdiv_ne02; struct fastdiv_values fastdiv_ne12; // For mask broadcasting struct fastdiv_values fastdiv_ne13; // For mask broadcasting - size_t spad_stride; - struct htp_ops_context * octx; + uint32_t src0_nrows_per_thread; }; +static void apply_mask(float * restrict wp0, + const float * restrict mp_f32, + const __fp16 * restrict mp_f16, + uint32_t ne00, + float slope, + bool use_f16) { + if (!mp_f32) { + return; + } + if (use_f16) { + for (uint32_t i = 0; i < ne00; ++i) { + wp0[i] += slope * (float) mp_f16[i]; + } + } else { + for (uint32_t i = 0; i < ne00; ++i) { + wp0[i] += slope * mp_f32[i]; + } + } +} + static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_context * octx) { - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; memset(smctx, 0, sizeof(struct htp_softmax_context)); - memcpy(&smctx->scale, (float *) octx->op_params, sizeof(float)); + memcpy(&smctx->scale, (float *) octx->op_params, sizeof(float)); memcpy(&smctx->max_bias, (float *) octx->op_params + 1, sizeof(float)); smctx->n_head = src0->ne[2]; @@ -85,8 +106,8 @@ static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_ smctx->m0 = powf(2.0f, -(smctx->max_bias) 
/ smctx->n_head_log2); smctx->m1 = powf(2.0f, -(smctx->max_bias / 2.0f) / smctx->n_head_log2); - smctx->use_src1 = (src1->ne[0] != 0); - smctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16); + smctx->use_src1 = (src1 != 0); + smctx->use_f16 = (src1 != 0) && (src1->type == HTP_TYPE_F16); smctx->octx = octx; @@ -97,8 +118,8 @@ static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_ if (ne01 > 0) smctx->fastdiv_ne01 = init_fastdiv_values(ne01); if (ne02 > 0) smctx->fastdiv_ne02 = init_fastdiv_values(ne02); - const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; - const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; + const uint32_t ne12 = src1 ? src1->ne[2] : 1; + const uint32_t ne13 = src1 ? src1->ne[3] : 1; if (ne12 > 0) smctx->fastdiv_ne12 = init_fastdiv_values(ne12); if (ne13 > 0) smctx->fastdiv_ne13 = init_fastdiv_values(ne13); @@ -139,10 +160,7 @@ static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src, } } -static void hvx_fast_softmax_f32(const uint8_t * restrict src, - uint8_t * restrict dst, - uint8_t * restrict pad, - const int num_elems) { +static void hvx_fast_softmax_f32(const uint8_t * restrict src, uint8_t * restrict dst, uint8_t * restrict pad, const int num_elems) { const HVX_Vector * restrict v_src = (HVX_Vector *) src; HVX_Vector * restrict v_pad = (HVX_Vector *) pad; HVX_Vector * restrict v_dst = (HVX_Vector *) dst; @@ -188,27 +206,20 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, } } -static float hvx_softmax_f32(const uint8_t * restrict src, - uint8_t * restrict dst, - uint8_t * restrict spad, - const int num_elems, - const float max) { +static float hvx_softmax_f32(const uint8_t * restrict src, uint8_t * restrict dst, uint8_t * restrict spad, const int num_elems, const float max) { hvx_sub_scalar_f32(spad, src, max, num_elems); hvx_exp_f32(dst, spad, num_elems, false); - - float sum = hvx_reduce_sum_f32(dst, num_elems); - - return sum; + return hvx_reduce_sum_f32(dst, 
num_elems); } static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { struct htp_softmax_context * smctx = (struct htp_softmax_context *) data; struct htp_ops_context * octx = smctx->octx; - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * dst = octx->dst; htp_softmax_preamble3; @@ -223,22 +234,26 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { return; } - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + uint64_t qt = HAP_perf_get_qtimer_count(); int is_aligned = 1; int opt_path = 0; + if (!hex_is_aligned((void *) src0->data, VLEN) || !hex_is_aligned((void *) dst->data, VLEN)) { is_aligned = 0; FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n"); } + + // Only use the fast path when aligned AND row size is multiple of VLEN (128 bytes) + // The fast path (hvx_fast_softmax_f32) doesn't handle tail elements + // The non-opt path uses hvx_softmax_f32 which properly handles all sizes via its helper functions if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { opt_path = 1; } - uint8_t * src0_spad_data = octx->src0_spad.data + (ith * smctx->spad_stride); - uint8_t * src1_spad_data = octx->src1_spad.data + (ith * smctx->spad_stride); - uint8_t * dst_spad_data = octx->dst_spad.data + (ith * smctx->spad_stride); + uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); + uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread); + uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); float * wp0 = (float *) src0_spad_data; float * wp1 = (float *) src1_spad_data; @@ -278,47 +293,29 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { // 
ALiBi if (i2 != prev_i2) { const uint32_t h = i2; // head - - slope = (smctx->max_bias > 0.0f) ? - h < smctx->n_head_log2 ? - powf(smctx->m0, h + 1) : - powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) : - 1.0f; + slope = (smctx->max_bias > 0.0f) ? h < smctx->n_head_log2 ? powf(smctx->m0, h + 1) : powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) : 1.0f; prev_i2 = i2; } - float * sp = (float *) ((char *) octx->src0.data + i1 * nb01 + i2 * nb02 + i3 * nb03); - float * dp = (float *) ((char *) octx->dst.data + i1 * nb1 + i2 * nb2 + i3 * nb3); + float * sp = (float *) ((char *) src0->data + i1 * nb01 + i2 * nb02 + i3 * nb03); + float * dp = (float *) ((char *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3); // broadcast the mask across rows - __fp16 * mp_f16 = (smctx->use_src1) ? - (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : - NULL; - float * mp_f32 = (smctx->use_src1) ? - (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : - NULL; + __fp16 * mp_f16 = (smctx->use_src1) ? (__fp16 *) ((char *) src1->data + i11 * nb11 + i12 * nb12 + i13 * nb13) : NULL; + float * mp_f32 = (smctx->use_src1) ? 
(float *) ((char *) src1->data + i11 * nb11 + i12 * nb12 + i13 * nb13) : NULL; if ((1 == opt_path) && (mp_f32) && !(smctx->use_f16)) { - hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale, - (const uint8_t *) mp_f32, slope); - } else { + hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale, (const uint8_t *) mp_f32, slope); + hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); + } else if (1 == opt_path) { hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale); - if (mp_f32) { - if (smctx->use_f16) { - for (int i = 0; i < ne00; ++i) { - wp0[i] += slope * (float) mp_f16[i]; - } - } else { - for (int i = 0; i < ne00; ++i) { - wp0[i] += slope * mp_f32[i]; - } - } - } - } - - if (1 == opt_path) { + apply_mask(wp0, mp_f32, mp_f16, ne00, slope, smctx->use_f16); hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); } else { + // Non-optimized path: uses HVX helper functions that properly handle all tensor sizes + // including non-multiples of 32 (the HVX vector lane count for f32) + hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale); + apply_mask(wp0, mp_f32, mp_f16, ne00, slope, smctx->use_f16); float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00); float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); sum = sum > 0.0 ? 
(1.0 / sum) : 1; @@ -326,54 +323,47 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - smctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, - ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt); + FARF(HIGH, "softmax-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u : opt %u f16 %u usec %u\n", ith, nth, + ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, + ne0, ne1, ne2, ne3, opt_path, smctx->use_f16, (unsigned) qt); } static int execute_op_softmax_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * dst = octx->dst; struct htp_softmax_context smctx; const char * op_type = "softmax-f32"; - switch (octx->op) { - case HTP_OP_SOFTMAX: - init_softmax_ctx(&smctx, octx); - break; - - default: - FARF(ERROR, "Unsupported Op %u\n", octx->op); - return HTP_STATUS_NO_SUPPORT; - } + init_softmax_ctx(&smctx, octx); const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; const uint32_t n_threads = MIN(octx->n_threads, src0_nrows); + smctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads; + const size_t src0_row_size = src0->nb[1]; const size_t src1_row_size = src0_row_size; const size_t dst_row_size = dst->nb[1]; // VTCM scratchpads for all tensors - // N rows per thread, padded to HVX vector size - octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = hex_round_up(src0_row_size, 128) * 
n_threads; - octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; + // 4 rows per thread, padded to HVX vector size + octx->src0_spad.size_per_thread = hex_round_up(4 * src0_row_size, 128); + octx->src1_spad.size_per_thread = hex_round_up(4 * src1_row_size, 128); + octx->dst_spad.size_per_thread = hex_round_up(4 * dst_row_size, 128); - // Use stride for calculating offset - smctx.spad_stride = hex_round_up(src0_row_size, 128); + octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads; + octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * n_threads; size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; - if (src1->ne[0]) { - FARF(HIGH, - "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", + if (src1) { + FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); @@ -385,19 +375,17 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { // Make sure the reserved vtcm size is sufficient if (octx->ctx->vtcm_size < spad_size) { - FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, - spad_size); + FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, spad_size); return HTP_STATUS_VTCM_TOO_SMALL; } - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + octx->src0_spad.data = octx->ctx->vtcm_base; octx->src0_spad.src = NULL; + octx->src1_spad.data = 
octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL; - if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { - smctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads; - worker_pool_run_func(octx->ctx->worker_pool, softmax_job_f32, &smctx, n_threads); - } + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) return err; + + worker_pool_run_func(octx->ctx->worker_pool, softmax_job_f32, &smctx, n_threads); return err; } @@ -405,7 +393,7 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { int op_softmax(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; - switch (octx->src0.type) { + switch (octx->src[0]->type) { case HTP_TYPE_F32: err = execute_op_softmax_f32(octx); break; diff --git a/ggml/src/ggml-hexagon/htp/ssm-conv.c b/ggml/src/ggml-hexagon/htp/ssm-conv.c index 6b035810..a28fd03e 100644 --- a/ggml/src/ggml-hexagon/htp/ssm-conv.c +++ b/ggml/src/ggml-hexagon/htp/ssm-conv.c @@ -16,14 +16,14 @@ #include "ggml-common.h" #include "htp-ctx.h" #include "hex-dma.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" #include "hvx-utils.h" -#define htp_ssm_conv_tensors_preamble \ - struct htp_tensor * restrict src0 = &octx->src0; \ - struct htp_tensor * restrict src1 = &octx->src1; \ - struct htp_tensor * restrict dst = &octx->dst; \ +#define htp_ssm_conv_tensors_preamble \ + const struct htp_tensor * restrict src0 = octx->src[0]; \ + const struct htp_tensor * restrict src1 = octx->src[1]; \ + const struct htp_tensor * restrict dst = octx->dst; \ struct htp_spad * restrict src0_spad = &octx->src0_spad; \ struct htp_spad * restrict src1_spad = &octx->src1_spad; \ struct htp_spad * restrict dst_spad = &octx->dst_spad; \ @@ -289,9 +289,9 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) { // Compute gather scratchpad size for src0 and src1 const size_t gather_spad_size = n_threads * VLEN * 2; - octx->src0_spad.data = 
octx->ctx->vtcm_base + gather_spad_size; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + octx->src0_spad.data = octx->ctx->vtcm_base + gather_spad_size; octx->src0_spad.src = NULL; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL; FARF(HIGH, "ssm_conv-f32: gather-spad:%zu spad-per-thread:(%u:%u:%u) spad-sizes:(%u:%u:%u) spad-data:(%p:%p:%p)\n", gather_spad_size, octx->src0_spad.size_per_thread, octx->src1_spad.size_per_thread, @@ -323,8 +323,9 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) { } int op_ssm_conv(struct htp_ops_context * octx) { - int err = HTP_STATUS_OK; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * dst = octx->dst; + + int err = HTP_STATUS_OK; switch (dst->type) { case HTP_TYPE_F32: diff --git a/ggml/src/ggml-hexagon/htp/sum-rows-ops.c b/ggml/src/ggml-hexagon/htp/sum-rows-ops.c index 352650b6..874c41ab 100644 --- a/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/sum-rows-ops.c @@ -14,13 +14,13 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" -#define sum_rows_preamble \ - struct htp_tensor *src0 = &octx->src0;\ - struct htp_tensor *dst = &octx->dst; \ - \ +#define sum_rows_preamble \ + const struct htp_tensor *src0 = octx->src[0]; \ + const struct htp_tensor *dst = octx->dst; \ + \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ const uint32_t ne02 = src0->ne[2]; \ @@ -94,7 +94,7 @@ static void sum_rows_thread_f32(unsigned int nth, unsigned int ith, void *data) int op_sum_rows(struct htp_ops_context * octx) { sum_rows_preamble; - if (octx->src0.type != HTP_TYPE_F32) { + if (octx->src[0]->type != HTP_TYPE_F32) { return HTP_STATUS_NO_SUPPORT; } diff 
--git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index 13d28317..03eccfd5 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -16,7 +16,7 @@ #define GGML_COMMON_DECL_C #include "ggml-common.h" #include "htp-ctx.h" -#include "htp-msg.h" +#include "htp-ops.h" #include "htp-ops.h" struct htp_unary_context { @@ -267,8 +267,8 @@ static void softplus_f32(const float * restrict src, static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { const struct htp_unary_context * uctx = (const struct htp_unary_context *) data; struct htp_ops_context * octx = uctx->octx; - const struct htp_tensor * src = &octx->src0; - const struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src = octx->src[0]; + const struct htp_tensor * dst = octx->dst; htp_unary_preamble; @@ -387,8 +387,8 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * static int execute_op_unary_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; - const struct htp_tensor * src0 = &octx->src0; - struct htp_tensor * dst = &octx->dst; + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * dst = octx->dst; const char * op_type = NULL; @@ -490,7 +490,7 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { int op_unary(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; - switch (octx->src0.type) { + switch (octx->src[0]->type) { case HTP_TYPE_F32: err = execute_op_unary_f32(octx); break;