forked from wylab/llama.cpp
9aa2807769
* hexagon: introduce op request batching and rewrite buffer managment The host now prepares batches of requests and dispatches them via a single dspqueue message. Buffers are mapped explicitly by NPU while processing batches. * hex-dma: disable l2 bypass since to work around new issue due to no flushes between Ops * hex-utils: add explicit l2flush and l2clear helpers * hex-opreq: use fine-grain per tensor l2 management * hex-opreq: avoid redundant invalidates for tensors we already flushed * hex-opreq: update debug messages * htp-opreq: reuse ops_context * hex-opreq: do not flush or invalidate cache lines beyond buffer boundry * hex-opreq: fix errors in log message * Revert "hex-opreq: do not flush or invalidate cache lines beyond buffer boundry" This reverts commit 8b7f0a55a750a6430ce4eb1874c7feb3d720056d. * hexagon: limit l2 flushes to 1MB which covers l2 cache * hex-opreq: limit cache flush to 4MB Looks like 4MB cont. vitual space should cover the 1MB cache. * hexagon: drop cache flush size to 2MB * hex-opreq: start reworking opreq packing * hex-opreq: introduce new way of packing opbatch where tensors are stored separately * hex-opreq: add a simple fastrpc call to force unmap all buffers * hex-l2flush: somehow 2MB does not seem robust, also cleanup step size to use line-size * hex-opreq: bump opreq batch size to 256 * hex-mm: place src1 spad at the top of vtcm for easy reuse * hex-ops: introduce internal types and disable src1 reuse for now Nothing new just formalizing the repack / qyn.quant types we've been using. * htp-opreq: use tensor pointers instead of copies * hex-opreq: introduce more robust way for tracking vtcm/spad reuse This removes the SKIP_QUANTIZE flag that became fragile with the addition of HMX and other ops. * hex-cumsum: fix error post opreq merge * hex-opreq: move request batch handling into the session Prepping everything for using dspqueue buffers and doing that inside the session is much cleaner. 
* hex-mm: yet another fix for src1 reuse when we're mixing hmx/hvx * hex-bufs: introduce pinned mmapings and use non-pinned ones for model buffers * hex-buf: add support for allocating shared/pinned buffer for opreqs * hex-opbatch: make opbatches configurable * hex-naming: better name for ggml_hexagon_shared_buffer * hex-naming: add session->c_name() helper * hex-opbatch: start using shm but still copy for now * hex-opbatch: use shared buffer for packing opbatch * hex-opbatch: beter naming for opbatch related classes and code * hex-opbatch: reuse batched tensors with same data/dims/strides * hex-opbatch: update logging * hex-opbatch: add support for vmem limit for op batching * hex-opbatch: update htp side to properly support dynamic mmap/unmap * hex-opbatch: add OB and OQ params for run-completion script and fix the asserts in batch processing * hex-opbatch: fixed src1 handling in act ops * hex-act: fix empty src1 handling in swiglu and friends Simplify preamble macro while at it * hex-mm: minor fix vtcm and dma handling in matmul cleaning up some left-overs from merges * hex-opbatch: allocate extra 1KB for dspqueue overhead * hexagon: fix softmax for non-aligned tensors and cleanup vtcm alloc * hex-mm: properly handle hmx_disabled flag * hex-ops: update comments * hex-ops: add debug output for get/set-rows * hex-mmap: optimize un/mapping of buffers * hex-opreq: global cache flush and invalidate beyond 128KB threshold * hex-ops: add super simple opfilter regex for debugging If an Op matches the regex hex backend will reject it. * hex-opbatch: wireup newer ops missed in merge and update main switch to detect this in future * hexagon: improved vtcm acquision to remove inter-op overhead Fully compatible with QNN-HTP coex * hex-mm: fixed hvx fallback path * hex-mm: lower the vmem threshold a bit further to ~3GB * hexagon: update debug & error logs This also fixes an issue with newer llvm merging repack and non-repack functions. 
We use those pointers to distinguish between buffer types. * hexagon: move ops context into main context Just a cleanup. We don't need separate contexts at this point. * hex-opbatch: cleanup naming and headers for opbatch and related descriptors * hex-fa: it's now better to enable FA during TG to reduce graph splits * hexagon: remove GGML_HEXAGON_EXPERIMENTAL env var It's no longer useful. Please use the more flexible GGML_HEXAGON_OPFILTER to disable Ops if needed for debugging or validation. * hexagon: fixed editorconfig check * Update ggml/src/ggml-hexagon/ggml-hexagon.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Trivikram Reddy <tamarnat@qti.qualcomm.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
185 lines
6.6 KiB
C
185 lines
6.6 KiB
C
#pragma clang diagnostic ignored "-Wunused-variable"
|
|
#pragma clang diagnostic ignored "-Wunused-function"
|
|
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
|
|
|
#include <HAP_farf.h>
|
|
#include <HAP_perf.h>
|
|
|
|
#include <math.h>
|
|
#include <string.h>
|
|
|
|
#include "hex-dma.h"
|
|
#include "hvx-utils.h"
|
|
|
|
#define GGML_COMMON_DECL_C
|
|
#include "ggml-common.h"
|
|
#include "htp-ctx.h"
|
|
#include "htp-ops.h"
|
|
#include "htp-ops.h"
|
|
|
|
// Common prologue for the set-rows kernels and entry point.
//
// Expands to a run of const uint32_t locals holding the ne[] and nb[] fields of
// src[0], src[1] and dst, read through a `struct htp_ops_context * octx` that
// must already be in scope where the macro is used:
//   ne0X / nb0X : src[0]     ne1X / nb1X : src[1]     neX / nbX : dst
// `nr` is src[0]->ne[1], i.e. the row count that the kernels split across threads.
// Not every user reads every local (the file silences unused-variable warnings).
#define set_rows_preamble                        \
    /* src0 */                                   \
    const uint32_t ne00 = octx->src[0]->ne[0];   \
    const uint32_t ne01 = octx->src[0]->ne[1];   \
    const uint32_t ne02 = octx->src[0]->ne[2];   \
    const uint32_t ne03 = octx->src[0]->ne[3];   \
    const uint32_t nb01 = octx->src[0]->nb[1];   \
    const uint32_t nb02 = octx->src[0]->nb[2];   \
    const uint32_t nb03 = octx->src[0]->nb[3];   \
                                                 \
    /* src1 */                                   \
    const uint32_t ne10 = octx->src[1]->ne[0];   \
    const uint32_t ne11 = octx->src[1]->ne[1];   \
    const uint32_t ne12 = octx->src[1]->ne[2];   \
    const uint32_t ne13 = octx->src[1]->ne[3];   \
    const uint32_t nb10 = octx->src[1]->nb[0];   \
    const uint32_t nb11 = octx->src[1]->nb[1];   \
    const uint32_t nb12 = octx->src[1]->nb[2];   \
                                                 \
    /* dst */                                    \
    const uint32_t ne0 = octx->dst->ne[0];       \
    const uint32_t ne1 = octx->dst->ne[1];       \
    const uint32_t ne2 = octx->dst->ne[2];       \
    const uint32_t ne3 = octx->dst->ne[3];       \
    const uint32_t nb1 = octx->dst->nb[1];       \
    const uint32_t nb2 = octx->dst->nb[2];       \
    const uint32_t nb3 = octx->dst->nb[3];       \
                                                 \
    /* rows of src0 to process */                \
    const uint32_t nr = ne01;
|
|
|
|
struct htp_set_rows_context {
|
|
struct htp_ops_context * octx;
|
|
struct fastdiv_values div_ne12;
|
|
struct fastdiv_values div_ne11;
|
|
uint32_t src0_nrows_per_thread;
|
|
};
|
|
|
|
static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
|
|
struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data;
|
|
struct htp_ops_context * octx = srctx->octx;
|
|
|
|
set_rows_preamble;
|
|
|
|
uint64_t qt = HAP_perf_get_qtimer_count();
|
|
|
|
// parallelize by rows of src0
|
|
const uint32_t dr = srctx->src0_nrows_per_thread;
|
|
const uint32_t ir0 = dr * ith;
|
|
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
|
|
|
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
|
|
|
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
|
|
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
|
|
for (uint32_t i = ir0; i < ir1; ++i) {
|
|
const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12);
|
|
const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11);
|
|
const uint32_t i10 = i;
|
|
|
|
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
|
|
|
|
uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
|
if (i1 >= ne1) {
|
|
// ignore invalid indices
|
|
continue;
|
|
}
|
|
|
|
const uintptr_t src0_ptr = octx->src[0]->data + i*nb01 + i02*nb02 + i03*nb03;
|
|
const uintptr_t dst_ptr = octx->dst->data + i1*nb1 + i02*nb2 + i03*nb3;
|
|
|
|
// copy row
|
|
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
|
|
}
|
|
}
|
|
}
|
|
|
|
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
|
|
FARF(HIGH, "set-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
|
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
|
|
}
|
|
|
|
static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *data) {
|
|
struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data;
|
|
struct htp_ops_context * octx = srctx->octx;
|
|
|
|
set_rows_preamble;
|
|
|
|
uint64_t qt = HAP_perf_get_qtimer_count();
|
|
|
|
// parallelize by rows of src0
|
|
const uint32_t dr = srctx->src0_nrows_per_thread;
|
|
const uint32_t ir0 = dr * ith;
|
|
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
|
|
|
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
|
|
|
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
|
|
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
|
|
for (uint32_t i = ir0; i < ir1; ++i) {
|
|
const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12);
|
|
const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11);
|
|
const uint32_t i10 = i;
|
|
|
|
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
|
|
|
|
uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
|
if (i1 >= ne1) {
|
|
// ignore invalid indices
|
|
continue;
|
|
}
|
|
|
|
const uint8_t* src0_ptr = (const uint8_t *) octx->src[0]->data + i*nb01 + i02*nb02 + i03*nb03;
|
|
uint8_t* dst_ptr = (uint8_t *) octx->dst->data + i1*nb1 + i02*nb2 + i03*nb3;
|
|
|
|
hvx_copy_f16_f32_uu(dst_ptr, src0_ptr, ne00);
|
|
}
|
|
}
|
|
}
|
|
|
|
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
|
|
FARF(HIGH, "set-rows-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
|
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
|
|
}
|
|
|
|
int op_set_rows(struct htp_ops_context * octx) {
|
|
set_rows_preamble;
|
|
|
|
const uint32_t n_threads = MIN(nr, octx->n_threads);
|
|
|
|
if (octx->src[0]->type != HTP_TYPE_F32) {
|
|
return HTP_STATUS_NO_SUPPORT;
|
|
}
|
|
|
|
if (octx->dst->type != HTP_TYPE_F32 && octx->dst->type != HTP_TYPE_F16) {
|
|
return HTP_STATUS_NO_SUPPORT;
|
|
}
|
|
|
|
if (octx->src[1]->type != HTP_TYPE_I32 && octx->src[1]->type != HTP_TYPE_I64) {
|
|
return HTP_STATUS_NO_SUPPORT;
|
|
}
|
|
|
|
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
|
return HTP_STATUS_OK;
|
|
}
|
|
|
|
struct htp_set_rows_context srctx;
|
|
srctx.octx = octx;
|
|
srctx.div_ne12 = init_fastdiv_values(ne12);
|
|
srctx.div_ne11 = init_fastdiv_values(ne11);
|
|
|
|
srctx.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads;
|
|
|
|
switch(octx->dst->type) {
|
|
case HTP_TYPE_F32:
|
|
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f32_f32, &srctx, n_threads);
|
|
break;
|
|
case HTP_TYPE_F16:
|
|
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f16_f32, &srctx, n_threads);
|
|
break;
|
|
default:
|
|
return HTP_STATUS_NO_SUPPORT;
|
|
}
|
|
|
|
return HTP_STATUS_OK;
|
|
}
|