forked from wylab/llama.cpp
9aa2807769
* hexagon: introduce op request batching and rewrite buffer management The host now prepares batches of requests and dispatches them via a single dspqueue message. Buffers are mapped explicitly by NPU while processing batches. * hex-dma: disable l2 bypass to work around a new issue due to no flushes between Ops * hex-utils: add explicit l2flush and l2clear helpers * hex-opreq: use fine-grain per tensor l2 management * hex-opreq: avoid redundant invalidates for tensors we already flushed * hex-opreq: update debug messages * htp-opreq: reuse ops_context * hex-opreq: do not flush or invalidate cache lines beyond buffer boundary * hex-opreq: fix errors in log message * Revert "hex-opreq: do not flush or invalidate cache lines beyond buffer boundary" This reverts commit 8b7f0a55a750a6430ce4eb1874c7feb3d720056d. * hexagon: limit l2 flushes to 1MB which covers l2 cache * hex-opreq: limit cache flush to 4MB Looks like 4MB of contiguous virtual space should cover the 1MB cache. * hexagon: drop cache flush size to 2MB * hex-opreq: start reworking opreq packing * hex-opreq: introduce new way of packing opbatch where tensors are stored separately * hex-opreq: add a simple fastrpc call to force unmap all buffers * hex-l2flush: somehow 2MB does not seem robust, also cleanup step size to use line-size * hex-opreq: bump opreq batch size to 256 * hex-mm: place src1 spad at the top of vtcm for easy reuse * hex-ops: introduce internal types and disable src1 reuse for now Nothing new just formalizing the repack / qyn.quant types we've been using. * htp-opreq: use tensor pointers instead of copies * hex-opreq: introduce more robust way for tracking vtcm/spad reuse This removes the SKIP_QUANTIZE flag that became fragile with the addition of HMX and other ops. * hex-cumsum: fix error post opreq merge * hex-opreq: move request batch handling into the session Prepping everything for using dspqueue buffers and doing that inside the session is much cleaner. 
* hex-mm: yet another fix for src1 reuse when we're mixing hmx/hvx * hex-bufs: introduce pinned mappings and use non-pinned ones for model buffers * hex-buf: add support for allocating shared/pinned buffer for opreqs * hex-opbatch: make opbatches configurable * hex-naming: better name for ggml_hexagon_shared_buffer * hex-naming: add session->c_name() helper * hex-opbatch: start using shm but still copy for now * hex-opbatch: use shared buffer for packing opbatch * hex-opbatch: better naming for opbatch related classes and code * hex-opbatch: reuse batched tensors with same data/dims/strides * hex-opbatch: update logging * hex-opbatch: add support for vmem limit for op batching * hex-opbatch: update htp side to properly support dynamic mmap/unmap * hex-opbatch: add OB and OQ params for run-completion script and fix the asserts in batch processing * hex-opbatch: fixed src1 handling in act ops * hex-act: fix empty src1 handling in swiglu and friends Simplify preamble macro while at it * hex-mm: minor fix vtcm and dma handling in matmul cleaning up some left-overs from merges * hex-opbatch: allocate extra 1KB for dspqueue overhead * hexagon: fix softmax for non-aligned tensors and cleanup vtcm alloc * hex-mm: properly handle hmx_disabled flag * hex-ops: update comments * hex-ops: add debug output for get/set-rows * hex-mmap: optimize un/mapping of buffers * hex-opreq: global cache flush and invalidate beyond 128KB threshold * hex-ops: add super simple opfilter regex for debugging If an Op matches the regex hex backend will reject it. * hex-opbatch: wireup newer ops missed in merge and update main switch to detect this in future * hexagon: improved vtcm acquisition to remove inter-op overhead Fully compatible with QNN-HTP coex * hex-mm: fixed hvx fallback path * hex-mm: lower the vmem threshold a bit further to ~3GB * hexagon: update debug & error logs This also fixes an issue with newer llvm merging repack and non-repack functions. 
We use those pointers to distinguish between buffer types. * hexagon: move ops context into main context Just a cleanup. We don't need separate contexts at this point. * hex-opbatch: cleanup naming and headers for opbatch and related descriptors * hex-fa: it's now better to enable FA during TG to reduce graph splits * hexagon: remove GGML_HEXAGON_EXPERIMENTAL env var It's no longer useful. Please use more flexible GGML_HEXAGON_OPFILTER to disable Ops if needed for debugging or validation. * hexagon: fixed editorconfig check * Update ggml/src/ggml-hexagon/ggml-hexagon.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Trivikram Reddy <tamarnat@qti.qualcomm.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
93 lines
2.4 KiB
C
93 lines
2.4 KiB
C
#ifndef HEX_UTILS_H
|
|
#define HEX_UTILS_H
|
|
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <qurt_memory.h>
|
|
|
|
#include "hexagon_types.h"
|
|
#include "hexagon_protos.h"
|
|
|
|
#include "hex-fastdiv.h"
|
|
#include "hex-dump.h"
|
|
|
|
/*
 * MAX(x, y): the larger of the two operands (y when they compare equal).
 * Function-like macro: whichever operand is selected is evaluated twice,
 * so do not pass expressions with side effects.
 * Only defined here when no other header has provided MAX already.
 */
#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif
|
|
|
|
/*
 * MIN(x, y): the smaller of the two operands (y when they compare equal).
 * Function-like macro: whichever operand is selected is evaluated twice,
 * so do not pass expressions with side effects.
 * Only defined here when no other header has provided MIN already.
 */
#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
|
|
|
|
static inline uint64_t hex_get_cycles() {
|
|
uint64_t cycles = 0;
|
|
asm volatile(" %0 = c15:14\n" : "=r"(cycles));
|
|
return cycles;
|
|
}
|
|
|
|
static inline uint64_t hex_get_pktcnt() {
|
|
uint64_t pktcnt;
|
|
asm volatile(" %0 = c19:18\n" : "=r"(pktcnt));
|
|
return pktcnt;
|
|
}
|
|
|
|
/*
 * Divide num by den, rounding the quotient up.
 *
 * Written as (num - 1) / den + 1 for non-zero num so that no intermediate
 * value exceeds num. The more common (num + den - 1) / den form wraps around
 * when num is close to SIZE_MAX and then yields a quotient that is too small.
 *
 * num: dividend.
 * den: divisor; must be non-zero when num is non-zero (division by zero is undefined).
 * Returns the smallest q such that q * den >= num.
 */
static inline size_t hmx_ceil_div(size_t num, size_t den) {
    if (num == 0) {
        return 0;
    }
    return (num - 1) / den + 1;
}
|
|
|
|
/*
 * Test whether addr has none of the bits in (align - 1) set.
 *
 * addr:  address to test (never dereferenced).
 * align: required alignment in bytes; the mask trick is only a true
 *        alignment test when align is a power of two.
 * Returns 1 when the masked low bits are all zero, 0 otherwise.
 */
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
    const size_t low_bits = (size_t) addr & (align - 1);
    return low_bits == 0;
}
|
|
|
|
/*
 * Round v up to the nearest multiple of align.
 *
 * Works for any non-zero align (not only powers of two) because it is
 * expressed through hmx_ceil_div rather than bit masking.
 */
static inline size_t hex_align_up(size_t v, size_t align) {
    const size_t units = hmx_ceil_div(v, align);
    return units * align;
}
|
|
|
|
/*
 * Round v down to the nearest multiple of align.
 *
 * align may be any non-zero value (not only powers of two); a zero align
 * divides by zero.
 */
static inline size_t hex_align_down(size_t v, size_t align) {
    const size_t excess = v % align;
    return v - excess;
}
|
|
|
|
/*
 * Test whether the n bytes starting at addr stay inside a single
 * chunk_size-sized window.
 *
 * addr:       start address (never dereferenced).
 * n:          length of the range in bytes.
 * chunk_size: window size in bytes; the offset is extracted with a
 *             (chunk_size - 1) mask, so it is only a real in-chunk offset
 *             when chunk_size is a power of two.
 * Returns 1 when offset + n does not exceed chunk_size, 0 otherwise.
 */
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
    const uint32_t start_off = (size_t) addr & (chunk_size - 1);
    /* Sum in 64 bits: a 32-bit sum wraps for very large n and would
     * wrongly report that the range fits. */
    const uint64_t end_off = (uint64_t) start_off + n;
    return end_off <= chunk_size;
}
|
|
|
|
/*
 * Round n up to the nearest multiple of m (32-bit variant).
 *
 * m may be any non-zero value; a zero m divides by zero. The arithmetic is
 * plain uint32_t, so n + m - 1 wraps if it exceeds UINT32_MAX.
 */
static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
    const uint32_t bumped = n + m - 1;
    return bumped - (bumped % m);
}
|
|
|
|
/* Smaller of two size_t values; a typed, single-evaluation alternative to MIN(). */
static inline size_t hex_smin(size_t a, size_t b) {
    if (a < b) {
        return a;
    }
    return b;
}
|
|
|
|
/* Larger of two size_t values; a typed, single-evaluation alternative to MAX(). */
static inline size_t hex_smax(size_t a, size_t b) {
    if (a > b) {
        return a;
    }
    return b;
}
|
|
|
|
/*
 * Issue an l2fetch for the region described by (p, width, stride, height).
 *
 * The three geometry values are packed into one 64-bit control word:
 * width and height are combined into a 32-bit word first, then stride is
 * combined with that word.
 * NOTE(review): going by the intrinsic name, Q6_R_combine_RlRl keeps only
 * the low 16 bits of width and height -- confirm, and confirm the units the
 * instruction expects, against the Hexagon instruction reference.
 *
 * p is only passed through as the fetch address; the cast drops const
 * because the intrinsic is called with a non-const pointer.
 */
static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
    Q6_l2fetch_AP((void *) p, Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)));
}
|
|
|
|
#define HEX_L2_LINE_SIZE 64
|
|
#define HEX_L2_FLUSH_SIZE (128 * 1024)
|
|
|
|
static inline void hex_l2flush(void * addr, size_t size)
|
|
{
|
|
if (size > HEX_L2_FLUSH_SIZE) {
|
|
qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
|
|
} else {
|
|
const uint32_t s = (uint32_t) addr;
|
|
const uint32_t e = s + size;
|
|
for (uint32_t i = s; i < e; i += HEX_L2_LINE_SIZE * 4) {
|
|
Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 0);
|
|
Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 1);
|
|
Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 2);
|
|
Q6_dccleaninva_A((void *) i + HEX_L2_LINE_SIZE * 3);
|
|
}
|
|
}
|
|
}
|
|
|
|
#endif /* HEX_UTILS_H */
|