mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-03 02:57:42 +02:00
Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| cd8370b408 | |||
| d21a76ac38 | |||
| 4fcd87cf7c | |||
| b78db3bd50 | |||
| 142df17c9c | |||
| e509411cf1 | |||
| 7cba58bbea | |||
| 5449367b21 | |||
| 1d594c295c | |||
| eec1e33a9e | |||
| 879d673759 | |||
| 6ab4e50d9c | |||
| 2336cc4784 | |||
| e6923caaec | |||
| 3e18dba9fd |
@@ -50,6 +50,7 @@ WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
build-essential \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
|
||||
+14
-1
@@ -694,6 +694,12 @@ static bool is_autoy(const std::string & value) {
|
||||
}
|
||||
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
// default values specific to example
|
||||
// note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
|
||||
if (ex == LLAMA_EXAMPLE_SERVER) {
|
||||
params.use_jinja = true;
|
||||
}
|
||||
|
||||
// load dynamic backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
@@ -2488,11 +2494,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--jinja"},
|
||||
"use jinja template for chat (default: disabled)",
|
||||
string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
|
||||
[](common_params & params) {
|
||||
params.use_jinja = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
|
||||
add_opt(common_arg(
|
||||
{"--no-jinja"},
|
||||
string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
|
||||
[](common_params & params) {
|
||||
params.use_jinja = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
|
||||
add_opt(common_arg(
|
||||
{"--reasoning-format"}, "FORMAT",
|
||||
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
|
||||
|
||||
@@ -224,7 +224,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
include(CheckCXXSourceCompiles)
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}")
|
||||
string(REPLACE ";" " " ARCH_FLAGS_STR "${ARCH_FLAGS}")
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS_STR}")
|
||||
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
|
||||
set(ARM_FEATURE "HAVE_${feature}")
|
||||
check_cxx_source_compiles(
|
||||
|
||||
@@ -33,10 +33,12 @@
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
@@ -44,12 +46,14 @@
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
@@ -58,11 +62,14 @@
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__POWERPC__) || defined(__powerpc__)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
||||
@@ -74,10 +81,12 @@
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
@@ -85,6 +94,7 @@
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
@@ -99,10 +109,12 @@
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
@@ -110,6 +122,7 @@
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
@@ -132,15 +145,18 @@
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
@@ -161,10 +177,12 @@
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
@@ -172,6 +190,7 @@
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
@@ -194,10 +213,12 @@
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
@@ -205,6 +226,7 @@
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
|
||||
@@ -497,6 +497,140 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
constexpr int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
|
||||
constexpr int ncols_interleaved = 8;
|
||||
constexpr int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
constexpr int col_groups = ncols_interleaved / 4; // 0123 and 4567
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
|
||||
// 1x8 tile = 2 x 4
|
||||
float32x4_t acc_f32[col_groups];
|
||||
|
||||
const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
|
||||
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
||||
|
||||
for (int i = 0; i < col_groups; i++) {
|
||||
acc_f32[i] = vdupq_n_f32(0);
|
||||
}
|
||||
|
||||
for (int b = 0; b < nb; b++) {
|
||||
float32x4_t q4_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d)); // d0 d1 d2 d3
|
||||
float32x4_t q4_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4)); // d4 d5 d6 d7
|
||||
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
|
||||
float32x4_t sb_scale_0123 = vmulq_f32(q4_d_0, q8_d);
|
||||
float32x4_t sb_scale_4567 = vmulq_f32(q4_d_1, q8_d);
|
||||
float32x4_t q4_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin)); // dmin 0..3
|
||||
float32x4_t q4_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4)); // dmin 4..7
|
||||
float32x4_t sb_min_0123 = vmulq_f32(q4_dmin_0, q8_d);
|
||||
float32x4_t sb_min_4567 = vmulq_f32(q4_dmin_1, q8_d);
|
||||
|
||||
// interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
|
||||
int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
|
||||
int32x4_t acc_lo[col_groups];
|
||||
int32x4_t acc_hi[col_groups];
|
||||
|
||||
// Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
|
||||
const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
|
||||
int16_t bsums_arr[8];
|
||||
vst1q_s16(bsums_arr, bsums);
|
||||
for (int sb = 0; sb < QK_K / 64; sb++) {
|
||||
for (int i = 0; i < col_groups; i++) {
|
||||
acc_lo[i] = vdupq_n_s32(0);
|
||||
acc_hi[i] = vdupq_n_s32(0);
|
||||
}
|
||||
// Need scales for the low and high nibbles
|
||||
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
|
||||
int16x8_t q4sb_mins[2];
|
||||
int16x8_t q4sb_scales[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int8_t aux_q4sb[8];
|
||||
const int offset = sb * 24 + i * 12;
|
||||
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
|
||||
q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
|
||||
}
|
||||
|
||||
int8x16_t q8_qs[64 / 16];
|
||||
for (int i = 0; i < 64 / 16; i++) {
|
||||
q8_qs[i] = vld1q_s8(q8_ptr[b].qs + sb * 64 + i * 16);
|
||||
}
|
||||
|
||||
for (int c = 0; c < col_groups; c++) {
|
||||
uint8x16_t q4_cols[8];
|
||||
for (int i = 0; i < 8; i++) {
|
||||
q4_cols[i] = vld1q_u8(q4_ptr[b].qs + sb * QK_K + i * 32 + 16 * c);
|
||||
}
|
||||
|
||||
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[0], m4b)), q8_qs[0], 0);
|
||||
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[1], m4b)), q8_qs[0], 1);
|
||||
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[2], m4b)), q8_qs[0], 2);
|
||||
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[3], m4b)), q8_qs[0], 3);
|
||||
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[4], m4b)), q8_qs[1], 0);
|
||||
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[5], m4b)), q8_qs[1], 1);
|
||||
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[6], m4b)), q8_qs[1], 2);
|
||||
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[7], m4b)), q8_qs[1], 3);
|
||||
|
||||
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[0], 4)), q8_qs[2], 0);
|
||||
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[1], 4)), q8_qs[2], 1);
|
||||
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[2], 4)), q8_qs[2], 2);
|
||||
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[3], 4)), q8_qs[2], 3);
|
||||
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[4], 4)), q8_qs[3], 0);
|
||||
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[5], 4)), q8_qs[3], 1);
|
||||
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[6], 4)), q8_qs[3], 2);
|
||||
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[7], 4)), q8_qs[3], 3);
|
||||
}
|
||||
|
||||
// Scales
|
||||
// row c0123 blk0 and blk1
|
||||
const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
|
||||
const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
|
||||
const float32x4_t sumf_0123 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[0]),
|
||||
vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[0])));
|
||||
acc_f32[0] = vfmaq_f32(acc_f32[0], sb_scale_0123, sumf_0123);
|
||||
// row c4567 blk0 and blk1
|
||||
const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
|
||||
const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
|
||||
const float32x4_t sumf_4567 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[1]),
|
||||
vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[1])));
|
||||
acc_f32[1] = vfmaq_f32(acc_f32[1], sb_scale_4567, sumf_4567);
|
||||
|
||||
// Bias Correction
|
||||
const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
|
||||
const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
|
||||
|
||||
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
|
||||
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
|
||||
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
|
||||
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
|
||||
} // for sb
|
||||
|
||||
acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0123);
|
||||
acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_4567);
|
||||
} // for b
|
||||
|
||||
int base = x * ncols_interleaved;
|
||||
vst1q_f32(s + base, acc_f32[0]);
|
||||
vst1q_f32(s + base + 4, acc_f32[1]);
|
||||
} // for x
|
||||
return;
|
||||
#endif // #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
ggml_gemv_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q4_K_8x8_q8_K(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
@@ -518,7 +652,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON)
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
constexpr int col_pairs = ncols_interleaved / 2;
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
|
||||
@@ -615,7 +749,6 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
|
||||
float32x4_t sb_scale = p == 0 ? sb_scale_0 : sb_scale_1;
|
||||
|
||||
// 0123 or 4567
|
||||
// TODO: Single superblock mul at the end of the superblock
|
||||
float32x4_t sumf_0 =
|
||||
vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_lo), vpaddq_s32(acc_lo[p], acc_lo[p + 1])));
|
||||
acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_0);
|
||||
@@ -649,7 +782,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
|
||||
vst1q_f32(s + base + 4, acc_f32[1]);
|
||||
} // for x
|
||||
return;
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON)
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
@@ -2069,6 +2202,206 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
constexpr int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
|
||||
constexpr int ncols_interleaved = 8;
|
||||
constexpr int blocklen = 4;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
constexpr int q8_k_blocklen = 4;
|
||||
constexpr int acc_size = 2 * 4; // 2 row pairs × 4 col pairs
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
|
||||
// 8 accumulators: 2 row pairs × 4 col pairs
|
||||
float32x4_t acc_f32[acc_size];
|
||||
|
||||
for (int y = 0; y < nr / q8_k_blocklen; y++) {
|
||||
const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
||||
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
||||
|
||||
for (int i = 0; i < acc_size; i++) {
|
||||
acc_f32[i] = vdupq_n_f32(0);
|
||||
}
|
||||
|
||||
for (int b = 0; b < nb; b++) {
|
||||
// d4 0 1 2 3, 4 5 6 7
|
||||
float32x4_t q4_d_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d));
|
||||
float32x4_t q4_d_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4));
|
||||
// d8 0 1 2 3
|
||||
float32x4_t q8_d_0123 = vld1q_f32(q8_ptr[b].d);
|
||||
// mins
|
||||
float32x4_t q4_dmin_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin));
|
||||
float32x4_t q4_dmin_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4));
|
||||
|
||||
// Precomputation of scales and mins
|
||||
float32x4_t sbd_scale_0123[q8_k_blocklen];
|
||||
float32x4_t sbd_scale_4567[q8_k_blocklen];
|
||||
float32x4_t sbd_min_0123[q8_k_blocklen];
|
||||
float32x4_t sbd_min_4567[q8_k_blocklen];
|
||||
|
||||
sbd_scale_0123[0] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 0);
|
||||
sbd_scale_4567[0] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 0);
|
||||
sbd_min_0123[0] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 0);
|
||||
sbd_min_4567[0] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 0);
|
||||
|
||||
sbd_scale_0123[1] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 1);
|
||||
sbd_scale_4567[1] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 1);
|
||||
sbd_min_0123[1] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 1);
|
||||
sbd_min_4567[1] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 1);
|
||||
|
||||
sbd_scale_0123[2] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 2);
|
||||
sbd_scale_4567[2] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 2);
|
||||
sbd_min_0123[2] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 2);
|
||||
sbd_min_4567[2] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 2);
|
||||
|
||||
sbd_scale_0123[3] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 3);
|
||||
sbd_scale_4567[3] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 3);
|
||||
sbd_min_0123[3] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 3);
|
||||
sbd_min_4567[3] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 3);
|
||||
|
||||
// Precomputation of bsums, each vpaddq calcs all the bsums for each row
|
||||
const int16x8_t bsums[q8_k_blocklen] = {
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
|
||||
};
|
||||
int16_t bsums_arr[QK_K / 64][8];
|
||||
for (int q8_row = 0; q8_row < 4; q8_row++) {
|
||||
vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
|
||||
}
|
||||
|
||||
// interleaved bias_acc: [0]->r0 0123, [1]->r1 0123, .., [4]->r0 4567, [5]->r1 4567 ..
|
||||
int32x4_t bias_acc[acc_size];
|
||||
for (int i = 0; i < acc_size; i++) {
|
||||
bias_acc[i] = vdupq_n_s32(0);
|
||||
}
|
||||
|
||||
for (int sb = 0; sb < QK_K / 64; sb++) {
|
||||
// Int accumulators for qs vecdot (4 row x 2 col quartets)
|
||||
int32x4_t acc_lo[acc_size];
|
||||
int32x4_t acc_hi[acc_size];
|
||||
for (int i = 0; i < acc_size; i++) {
|
||||
acc_lo[i] = vdupq_n_s32(0);
|
||||
acc_hi[i] = vdupq_n_s32(0);
|
||||
}
|
||||
// Need scales for the low and high nibbles
|
||||
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
|
||||
int16x8_t q4sb_scales[2];
|
||||
int16x8_t q4sb_mins[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int8_t aux_q4sb[8];
|
||||
const int offset = sb * 24 + i * 12;
|
||||
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
|
||||
q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
|
||||
}
|
||||
|
||||
constexpr int reads_per_sb = 8; // 8 * 16 bytes each => 32 qs * 4 rows
|
||||
for (int k = 0; k < reads_per_sb; k++) {
|
||||
const int8x16_t q8_blk0 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k);
|
||||
const int8x16_t q8_blk1 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k + 128);
|
||||
|
||||
// 0..3 & 32..35
|
||||
const uint8x16_t q4_0123 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k);
|
||||
const uint8x16_t q4_4567 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k + 16);
|
||||
|
||||
const int8x16_t q4_0123_lo = vreinterpretq_s8_u8(vandq_u8(q4_0123, m4b));
|
||||
const int8x16_t q4_0123_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_0123, 4));
|
||||
|
||||
acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q4_0123_lo, q8_blk0, 0); // 0..3 r0 c0123
|
||||
acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q4_0123_lo, q8_blk0, 1); // 0..3 r1 c0123
|
||||
acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q4_0123_lo, q8_blk0, 2); // 0..3 r2 c0123
|
||||
acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q4_0123_lo, q8_blk0, 3); // 0..3 r3 c0123
|
||||
|
||||
acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q4_0123_hi, q8_blk1, 0); // 32..35 r0 c0123
|
||||
acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q4_0123_hi, q8_blk1, 1); // 32..35 r1 c0123
|
||||
acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q4_0123_hi, q8_blk1, 2); // 32..35 r2 c0123
|
||||
acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q4_0123_hi, q8_blk1, 3); // 32..35 r3 c0123
|
||||
|
||||
const int8x16_t q4_4567_lo = vreinterpretq_s8_u8(vandq_u8(q4_4567, m4b));
|
||||
const int8x16_t q4_4567_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_4567, 4));
|
||||
|
||||
acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q4_4567_lo, q8_blk0, 0); // 0..3 r0 c4567
|
||||
acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q4_4567_lo, q8_blk0, 1); // 0..3 r1 c4567
|
||||
acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q4_4567_lo, q8_blk0, 2); // 0..3 r2 c4567
|
||||
acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q4_4567_lo, q8_blk0, 3); // 0..3 r3 c4567
|
||||
|
||||
acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q4_4567_hi, q8_blk1, 0); // 32..35 r0 c4567
|
||||
acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q4_4567_hi, q8_blk1, 1); // 32..35 r1 c4567
|
||||
acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q4_4567_hi, q8_blk1, 2); // 32..35 r2 c4567
|
||||
acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q4_4567_hi, q8_blk1, 3); // 32..35 r3 c4567
|
||||
}
|
||||
|
||||
// Scale and bias application
|
||||
// acc is stored interleaved to match output layout
|
||||
const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
|
||||
const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
|
||||
const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
|
||||
const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
|
||||
for (int row = 0; row < q8_k_blocklen; row++) {
|
||||
// Bias correction
|
||||
// row c0123 blk0 and blk1
|
||||
const float32x4_t sumf_0123 =
|
||||
vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[row]),
|
||||
vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[row])));
|
||||
acc_f32[2 * row] = vfmaq_f32(acc_f32[2 * row], sbd_scale_0123[row], sumf_0123);
|
||||
|
||||
// row c4567 blk0 and blk1
|
||||
const float32x4_t sumf_4567 =
|
||||
vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[row + 4]),
|
||||
vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[row + 4])));
|
||||
acc_f32[2 * row + 1] = vfmaq_f32(acc_f32[2 * row + 1], sbd_scale_4567[row], sumf_4567);
|
||||
|
||||
// Bias
|
||||
const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][row * 2]);
|
||||
const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][row * 2 + 1]);
|
||||
|
||||
// row c0123 blk0 and blk1
|
||||
bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
|
||||
bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
|
||||
|
||||
// row c4567 blk0 and blk1
|
||||
bias_acc[2 * row + 1] =
|
||||
vmlal_s16(bias_acc[2 * row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
|
||||
bias_acc[2 * row + 1] =
|
||||
vmlal_s16(bias_acc[2 * row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
|
||||
}
|
||||
} // for sb
|
||||
|
||||
for (int row = 0; row < q8_k_blocklen; row++) {
|
||||
acc_f32[2 * row] = vmlsq_f32(acc_f32[2 * row], vcvtq_f32_s32(bias_acc[2 * row]), sbd_min_0123[row]);
|
||||
acc_f32[2 * row + 1] =
|
||||
vmlsq_f32(acc_f32[2 * row + 1], vcvtq_f32_s32(bias_acc[2 * row + 1]), sbd_min_4567[row]);
|
||||
}
|
||||
} // for b
|
||||
|
||||
for (int i = 0; i < q8_k_blocklen; i++) {
|
||||
int row = y * q8_k_blocklen + i;
|
||||
for (int j = 0; j < 2; j++) {
|
||||
int col = x * ncols_interleaved + j * 4;
|
||||
int offset = row * bs + col;
|
||||
vst1q_f32(s + offset, acc_f32[2 * i + j]);
|
||||
}
|
||||
}
|
||||
} // for x
|
||||
} // for y
|
||||
return;
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_K_8x8_q8_K(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
|
||||
@@ -124,6 +124,58 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
||||
assert(QK_K == 256);
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
|
||||
|
||||
// scalar
|
||||
const int blck_size_interleave = 4;
|
||||
float srcv[4][QK_K];
|
||||
float iscale[4];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0;
|
||||
|
||||
for (int j = 0; j < QK_K; j++) {
|
||||
srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
|
||||
// Update the maximum value of the corresponding super block
|
||||
if(amax < fabsf(srcv[row_iter][j])) {
|
||||
amax = fabsf(srcv[row_iter][j]);
|
||||
max = srcv[row_iter][j];
|
||||
}
|
||||
}
|
||||
|
||||
iscale[row_iter] = amax ? -127.f/max : 0;
|
||||
|
||||
y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK_K / 4; j++) {
|
||||
y[i].bsums[j] = 0;
|
||||
}
|
||||
|
||||
// Quants values are interleaved in sequence of four bytes from corresponding super blocks
|
||||
// Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
|
||||
// i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
|
||||
for (int j = 0; j < QK_K * 4; j++) {
|
||||
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
||||
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
||||
src_offset += (j % blck_size_interleave);
|
||||
int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
|
||||
|
||||
float x0 = srcv[src_id][src_offset] * iscale[src_id];
|
||||
y[i].qs[j] = nearest_int(x0);
|
||||
y[i].bsums[index] += y[i].qs[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
||||
assert(QK_K == 256);
|
||||
assert(k % QK_K == 0);
|
||||
@@ -192,6 +244,12 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTR
|
||||
ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
|
||||
}
|
||||
|
||||
template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
||||
assert(nrow == 4);
|
||||
UNUSED(nrow);
|
||||
ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row);
|
||||
}
|
||||
|
||||
template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
||||
assert(nrow == 4);
|
||||
UNUSED(nrow);
|
||||
@@ -333,6 +391,77 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 4;
|
||||
static const uint32_t kmask1 = 0x3f3f3f3f;
|
||||
static const uint32_t kmask2 = 0x0f0f0f0f;
|
||||
static const uint32_t kmask3 = 0x03030303;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(bs);
|
||||
UNUSED(nr);
|
||||
|
||||
float sumf[8];
|
||||
float sum_minf[8];
|
||||
uint32_t utmp[32];
|
||||
int sumi1;
|
||||
int sumi2;
|
||||
int sumi;
|
||||
|
||||
const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumf[j] = 0.0;
|
||||
sum_minf[j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int sb = 0; sb < 8; sb++) {
|
||||
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
||||
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
||||
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
||||
utmp[sb * 4 + 2] = uaux_0;
|
||||
utmp[sb * 4 + 0] &= kmask1;
|
||||
}
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
||||
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi1 = 0;
|
||||
sumi2 = 0;
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
||||
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
|
||||
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
|
||||
sumi1 = sumi1 * scales_0[j];
|
||||
sumi2 = sumi2 * scales_1[j];
|
||||
sumi += sumi1 + sumi2;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
||||
}
|
||||
}
|
||||
for (int sb = 0; sb < 8; sb++) {
|
||||
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
@@ -727,6 +856,89 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 4;
|
||||
static const uint32_t kmask1 = 0x3f3f3f3f;
|
||||
static const uint32_t kmask2 = 0x0f0f0f0f;
|
||||
static const uint32_t kmask3 = 0x03030303;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nr % 4 == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
float sumf[4][8];
|
||||
float sum_minf[4][8];
|
||||
uint32_t utmp[32];
|
||||
int sumi1;
|
||||
int sumi2;
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumf[m][j] = 0.0;
|
||||
sum_minf[m][j] = 0.0;
|
||||
}
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int sb = 0; sb < 8; sb++) {
|
||||
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
||||
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
||||
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
||||
utmp[sb * 4 + 2] = uaux_0;
|
||||
utmp[sb * 4 + 0] &= kmask1;
|
||||
}
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
||||
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi1 = 0;
|
||||
sumi2 = 0;
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
||||
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
|
||||
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
|
||||
sumi1 = sumi1 * scales_0[j];
|
||||
sumi2 = sumi2 * scales_1[j];
|
||||
sumi += sumi1 + sumi2;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int sb = 0; sb < 8; sb++) {
|
||||
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
||||
for(int m = 0; m < 4; m++) {
|
||||
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
||||
for(int j = 0; j < ncols_interleaved; j++) {
|
||||
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
@@ -1228,9 +1440,10 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
|
||||
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||
GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
|
||||
GGML_ASSERT(interleave_block == 8);
|
||||
GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
|
||||
constexpr int nrows_interleaved = 8;
|
||||
|
||||
block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
|
||||
@@ -1468,6 +1681,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
|
||||
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
template <> int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size);
|
||||
}
|
||||
|
||||
template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
@@ -1501,6 +1718,10 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
||||
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
@@ -1529,6 +1750,10 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
|
||||
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
@@ -1731,12 +1956,13 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
|
||||
nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
|
||||
}
|
||||
|
||||
if (nth == 1 || nchunk0 < nth || disable_chunking) {
|
||||
int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
||||
// Only increase nchunk0 to nth if it won't make chunks too small
|
||||
if (nth == 1 || ((nchunk0 < nth || disable_chunking) && (nr0 + nth - 1) / nth >= min_chunk_size)) {
|
||||
nchunk0 = nth;
|
||||
dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
||||
}
|
||||
|
||||
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
||||
|
||||
// Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
|
||||
// This prevents creating too many tiny chunks that could overlap after alignment
|
||||
const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
|
||||
@@ -1930,6 +2156,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
|
||||
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
|
||||
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
|
||||
|
||||
// instance for Q4_K
|
||||
static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
|
||||
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
|
||||
|
||||
// instance for Q2
|
||||
@@ -1966,6 +2195,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
return &q4_K_8x8_q8_K;
|
||||
}
|
||||
}
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
||||
if (cur->ne[1] % 8 == 0) {
|
||||
return &q4_K_8x4_q8_K;
|
||||
}
|
||||
}
|
||||
} else if (cur->type == GGML_TYPE_Q2_K) {
|
||||
if (ggml_cpu_has_avx512()) {
|
||||
if (cur->ne[1] % 8 == 0) {
|
||||
|
||||
@@ -80,10 +80,12 @@ extern "C" {
|
||||
|
||||
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
@@ -91,6 +93,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
@@ -99,10 +102,12 @@ void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
// Native implementations
|
||||
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
@@ -110,6 +115,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
+90
-91
@@ -397,119 +397,118 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
|
||||
}
|
||||
|
||||
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16;
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr;
|
||||
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16;
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr;
|
||||
|
||||
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
|
||||
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
|
||||
|
||||
const int np= (n & ~(ggml_f16_step - 1));
|
||||
int np = (n & ~(ggml_f16_step - 1));
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
|
||||
GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
|
||||
GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
|
||||
GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
|
||||
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
|
||||
GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
|
||||
GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
|
||||
GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
|
||||
GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
|
||||
}
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1));
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
ry = GGML_F16x_VEC_FMA(ry, rx, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + k, ry, 0);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
hy = svmad_f16_x(pg, hx, vx, hy);
|
||||
svst1_f16(pg, (__fp16 *)(y + np2), hy);
|
||||
}
|
||||
np = n;
|
||||
#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
|
||||
const int np = n;
|
||||
_Float16 hv = (_Float16)v;
|
||||
for (int i = 0, avl; i < n; i += avl) {
|
||||
avl = __riscv_vsetvl_e16m8(n - i);
|
||||
vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
|
||||
vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
|
||||
vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
|
||||
__riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
|
||||
}
|
||||
#elif defined(GGML_SIMD)
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1));
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
ry = GGML_F16x_VEC_FMA(ry, rx, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + k, ry, 0);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
hy = svmad_f16_x(pg, hx, vx, hy);
|
||||
svst1_f16(pg, (__fp16 *)(y + np2), hy);
|
||||
}
|
||||
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
const int np = 0;
|
||||
#endif
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// xs and vs are byte strides of x and v
|
||||
|
||||
+20
-11
@@ -437,18 +437,27 @@ namespace ggml_cuda_mma {
|
||||
xi[0] = xs[0];
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
}else if constexpr (I == 16 && J == 8) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
|
||||
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
|
||||
xi[1] = xs1[0];
|
||||
}else{
|
||||
} else if constexpr (std::is_same_v<T, int>) {
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
|
||||
}else if constexpr (I == 16 && J == 8) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
|
||||
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
|
||||
xi[1] = xs1[0];
|
||||
|
||||
}else{
|
||||
NO_DEVICE_CODE;
|
||||
}
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
}
|
||||
#else
|
||||
|
||||
@@ -3701,7 +3701,7 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
|
||||
const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y);
|
||||
const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
|
||||
const size_t nbs_ids = mmq_x*sizeof(int);
|
||||
const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
|
||||
const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
|
||||
const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq);
|
||||
return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ set(GGML_OPENCL_KERNELS
|
||||
group_norm
|
||||
im2col_f32
|
||||
im2col_f16
|
||||
mean
|
||||
mul_mat_Ab_Bi_8x4
|
||||
mul_mv_f16_f16
|
||||
mul_mv_f16_f32_1row
|
||||
@@ -109,6 +110,9 @@ set(GGML_OPENCL_KERNELS
|
||||
softmax_4_f16
|
||||
softmax_f32
|
||||
softmax_f16
|
||||
sqr
|
||||
sqrt
|
||||
ssm_conv
|
||||
sub
|
||||
sum_rows
|
||||
transpose
|
||||
|
||||
@@ -449,6 +449,9 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
|
||||
cl_kernel kernel_add_id;
|
||||
cl_kernel kernel_scale;
|
||||
cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
|
||||
cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
|
||||
cl_kernel kernel_mean_f32;
|
||||
cl_kernel kernel_silu, kernel_silu_4;
|
||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||
cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
|
||||
@@ -509,6 +512,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_conv_2d_f16;
|
||||
cl_kernel kernel_conv_2d_f32;
|
||||
cl_kernel kernel_conv_2d_f16_f32;
|
||||
cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
|
||||
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
@@ -1552,6 +1556,66 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// sqr
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "sqr.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("sqr.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_sqr_cont_f32 = clCreateKernel(prog, "kernel_sqr_cont_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sqr_cont_f32_4 = clCreateKernel(prog, "kernel_sqr_cont_f32_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sqr_cont_f16 = clCreateKernel(prog, "kernel_sqr_cont_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sqr_cont_f16_4 = clCreateKernel(prog, "kernel_sqr_cont_f16_4", &err), err));
|
||||
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// sqrt
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "sqrt.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("sqrt.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_sqrt_cont_f32 = clCreateKernel(prog, "kernel_sqrt_cont_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sqrt_cont_f32_4 = clCreateKernel(prog, "kernel_sqrt_cont_f32_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sqrt_cont_f16 = clCreateKernel(prog, "kernel_sqrt_cont_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sqrt_cont_f16_4 = clCreateKernel(prog, "kernel_sqrt_cont_f16_4", &err), err));
|
||||
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mean
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mean.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mean.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
|
||||
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// sub
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -1825,6 +1889,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
}
|
||||
}
|
||||
|
||||
// ssm_conv
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "ssm_conv.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("ssm_conv.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_ssm_conv_f32_f32 = clCreateKernel(prog, "kernel_ssm_conv_f32_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_ssm_conv_f32_f32_4 = clCreateKernel(prog, "kernel_ssm_conv_f32_f32_4", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_id_q4_0_f32_8x_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -2959,6 +3041,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
||||
case GGML_OP_ADD_ID:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
@@ -3007,6 +3093,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||
|
||||
(op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
|
||||
(op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
|
||||
case GGML_OP_SSM_CONV:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
|
||||
case GGML_OP_CONCAT:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
@@ -3075,6 +3163,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
|
||||
}
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
@@ -5193,6 +5282,224 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_sqr(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
// Currently assumes src0 is contiguous
|
||||
int n = ggml_nelements(dst);
|
||||
if (n % 4 == 0) {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_sqr_cont_f32_4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sqr_cont_f16_4;
|
||||
}
|
||||
n /= 4;
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_sqr_cont_f32;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sqr_cont_f16;
|
||||
}
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_sqrt(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
// Currently assumes src0 is contiguous
|
||||
int n = ggml_nelements(dst);
|
||||
if (n % 4 == 0) {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_sqrt_cont_f32_4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sqrt_cont_f16_4;
|
||||
}
|
||||
n /= 4;
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_sqrt_cont_f32;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sqrt_cont_f16;
|
||||
}
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_UNUSED(src1);
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_mean_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)64, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_ssm_conv(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
int ne01 = src0->ne[1];
|
||||
cl_ulong nb00 = src0->nb[0];
|
||||
cl_ulong nb01 = src0->nb[1];
|
||||
cl_ulong nb02 = src0->nb[2];
|
||||
|
||||
int ne10 = src1->ne[0];
|
||||
cl_ulong nb11 = src1->nb[1];
|
||||
|
||||
int ne1 = dst->ne[1];
|
||||
int ne2 = dst->ne[2];
|
||||
cl_ulong nb0 = dst->nb[0];
|
||||
cl_ulong nb1 = dst->nb[1];
|
||||
cl_ulong nb2 = dst->nb[2];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_ssm_conv_f32_f32;
|
||||
|
||||
if (ne10 % 4 == 0) {
|
||||
kernel = backend_ctx->kernel_ssm_conv_f32_f32_4;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01, (size_t)ne1, (size_t)ne2};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (ne01 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -9091,6 +9398,24 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_sub;
|
||||
break;
|
||||
case GGML_OP_SQR:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_sqr;
|
||||
break;
|
||||
case GGML_OP_SQRT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_sqrt;
|
||||
break;
|
||||
case GGML_OP_MEAN:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_mean;
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
@@ -9192,6 +9517,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_conv_2d;
|
||||
break;
|
||||
case GGML_OP_SSM_CONV:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_ssm_conv;
|
||||
break;
|
||||
case GGML_OP_CONCAT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
|
||||
kernel void kernel_mean_f32(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global float *)((global char *)src0 + offset0);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
|
||||
int i3 = get_global_id(2);
|
||||
int i2 = get_global_id(1);
|
||||
int i1 = get_global_id(0);
|
||||
|
||||
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
float row_sum = 0;
|
||||
|
||||
for (int i0 = 0; i0 < ne00; i0++) {
|
||||
row_sum += src_row[i0];
|
||||
}
|
||||
|
||||
dst_row[0] = row_sum / ne00;
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
kernel void kernel_sqr_cont_f32(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
dst[gid] = src0[gid] * src0[gid];
|
||||
}
|
||||
|
||||
kernel void kernel_sqr_cont_f32_4(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
dst[gid] = src0[gid] * src0[gid];
|
||||
}
|
||||
|
||||
kernel void kernel_sqr_cont_f16(
|
||||
global half * src0,
|
||||
ulong offset0,
|
||||
global half * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half*)((global char*)src0 + offset0);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
dst[gid] = src0[gid] * src0[gid];
|
||||
}
|
||||
|
||||
kernel void kernel_sqr_cont_f16_4(
|
||||
global half4 * src0,
|
||||
ulong offset0,
|
||||
global half4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
dst[gid] = src0[gid] * src0[gid];
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
kernel void kernel_sqrt_cont_f32(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
dst[gid] = sqrt(src0[gid]);
|
||||
}
|
||||
|
||||
kernel void kernel_sqrt_cont_f32_4(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
dst[gid] = sqrt(src0[gid]);
|
||||
}
|
||||
|
||||
kernel void kernel_sqrt_cont_f16(
|
||||
global half * src0,
|
||||
ulong offset0,
|
||||
global half * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half*)((global char*)src0 + offset0);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
dst[gid] = convert_half(sqrt(convert_float(src0[gid])));
|
||||
}
|
||||
|
||||
kernel void kernel_sqrt_cont_f16_4(
|
||||
global half4 * src0,
|
||||
ulong offset0,
|
||||
global half4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
uint gid = get_global_id(0);
|
||||
dst[gid] = convert_half4(sqrt(convert_float4(src0[gid])));
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
kernel void kernel_ssm_conv_f32_f32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
int ne10,
|
||||
ulong nb11,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2
|
||||
){
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int ir = get_global_id(0);
|
||||
int i2 = get_global_id(1);
|
||||
int i3 = get_global_id(2);
|
||||
|
||||
int nc = ne10;
|
||||
|
||||
global float * s = (global float *) (src0 + ir*nb01 + i2*nb00 + i3*nb02);
|
||||
global float * c = (global float *) (src1 + ir*nb11);
|
||||
global float * d = (global float *) (dst + ir*nb0 + i2*nb1 + i3*nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i0 = 0; i0 < nc; ++i0) {
|
||||
sumf += s[i0] * c[i0];
|
||||
}
|
||||
|
||||
d[0] = sumf;
|
||||
}
|
||||
|
||||
kernel void kernel_ssm_conv_f32_f32_4(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
int ne10,
|
||||
ulong nb11,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int ir = get_global_id(0);
|
||||
int i2 = get_global_id(1);
|
||||
int i3 = get_global_id(2);
|
||||
|
||||
int nc = ne10;
|
||||
|
||||
global float4 * s = (global float4 *) (src0 + ir*nb01 + i2*nb00 + i3*nb02);
|
||||
global float4 * c = (global float4 *) (src1 + ir*nb11);
|
||||
global float * d = (global float *) (dst + ir*nb0 + i2*nb1 + i3*nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i0 = 0; i0 < nc/4; ++i0) {
|
||||
sumf += dot(s[i0], c[i0]);
|
||||
}
|
||||
|
||||
d[0] = sumf;
|
||||
}
|
||||
@@ -409,6 +409,7 @@ enum shader_reduction_mode {
|
||||
// argsort pipelines for up to 1<<10 invocations per workgroup
|
||||
static constexpr uint32_t num_argsort_pipelines = 11;
|
||||
static constexpr uint32_t num_topk_moe_pipelines = 10;
|
||||
static constexpr uint32_t num_topk_pipelines = 11;
|
||||
|
||||
static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
@@ -515,6 +516,7 @@ struct vk_device_struct {
|
||||
bool single_queue;
|
||||
bool support_async;
|
||||
uint32_t subgroup_size;
|
||||
uint32_t subgroup_size_log2;
|
||||
uint32_t shader_core_count;
|
||||
bool uma;
|
||||
bool prefer_host_memory;
|
||||
@@ -704,6 +706,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16;
|
||||
vk_pipeline pipeline_argsort_f32[num_argsort_pipelines];
|
||||
vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines];
|
||||
vk_pipeline pipeline_topk_f32[num_topk_pipelines];
|
||||
vk_pipeline pipeline_sum_rows_f32;
|
||||
vk_pipeline pipeline_cumsum_f32;
|
||||
vk_pipeline pipeline_argmax_f32;
|
||||
@@ -1205,6 +1208,15 @@ struct vk_op_argsort_push_constants {
|
||||
uint32_t inner_end;
|
||||
};
|
||||
|
||||
struct vk_op_topk_push_constants {
|
||||
uint32_t orig_ncols;
|
||||
uint32_t ncols_input;
|
||||
uint32_t ncols_output;
|
||||
uint32_t nrows;
|
||||
uint32_t first_pass;
|
||||
uint32_t last_pass;
|
||||
};
|
||||
|
||||
struct vk_op_im2col_push_constants {
|
||||
uint64_t dst_addr;
|
||||
uint32_t batch_offset; uint32_t offset_delta;
|
||||
@@ -3965,6 +3977,23 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_argsort_large_f32[i], "argsort_large_f32_"+std::to_string(i), argsort_large_f32_len, argsort_large_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE * WG_UNROLL_FACTOR, 1, 1}, {BLOCK_SIZE, WG_UNROLL_FACTOR}, 1, true);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < num_topk_pipelines; ++i) {
|
||||
const uint32_t BLOCK_SIZE = 1u << i;
|
||||
const uint32_t NCOLS_PADDED_LOG2 = i;
|
||||
if (i <= device->max_workgroup_size_log2) {
|
||||
uint32_t nary_shmem = 2 * sizeof(int) * BLOCK_SIZE +
|
||||
sizeof(int) * device->subgroup_size +
|
||||
2 * sizeof(int) +
|
||||
(BLOCK_SIZE / device->subgroup_size) * sizeof(int);
|
||||
if (device->subgroup_arithmetic && device->subgroup_require_full_support && device->subgroup_shuffle && device->subgroup_ballot &&
|
||||
nary_shmem <= device->properties.limits.maxComputeSharedMemorySize) {
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_nary_search_f32_len, topk_nary_search_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, device->subgroup_size, device->subgroup_size_log2}, 1, true, true, device->subgroup_size);
|
||||
} else if (2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) {
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_argsort_f32_len, topk_argsort_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
@@ -4336,6 +4365,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
|
||||
|
||||
device->subgroup_size = subgroup_props.subgroupSize;
|
||||
device->subgroup_size_log2 = uint32_t(log2f(float(device->subgroup_size)));
|
||||
device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
if (sm_builtins) {
|
||||
device->shader_core_count = sm_props.shaderSMCount;
|
||||
@@ -5259,7 +5289,8 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||
ctx->prealloc_size_x = 0;
|
||||
ctx->prealloc_size_y = 0;
|
||||
ctx->prealloc_size_split_k = 0;
|
||||
ctx->prealloc_size_add_rms_partials = 0;
|
||||
// Fixed size of 1KB, for deterministic behavior
|
||||
ctx->prealloc_size_add_rms_partials = 1024;
|
||||
|
||||
ctx->fence = ctx->device->device.createFence({});
|
||||
ctx->almost_ready_fence = ctx->device->device.createFence({});
|
||||
@@ -8656,41 +8687,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
GGML_UNUSED(src2);
|
||||
}
|
||||
|
||||
static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_ADD_ID:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_3D:
|
||||
case GGML_OP_SET_ROWS:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
@@ -8775,7 +8771,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
|
||||
std::cerr << "), " << ggml_op_name(op) << ")");
|
||||
GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
||||
GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
|
||||
GGML_ASSERT(dst->buffer != nullptr);
|
||||
const uint64_t ne00 = src0->ne[0];
|
||||
const uint64_t ne01 = src0->ne[1];
|
||||
@@ -8806,22 +8801,17 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
|
||||
const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
|
||||
|
||||
vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0, op_supports_incontiguous);
|
||||
vk_subbuffer src1_buf = use_src1 ? ggml_vk_tensor_subbuffer(ctx, src1, op_supports_incontiguous) : vk_subbuffer{};
|
||||
vk_subbuffer src2_buf = use_src2 ? ggml_vk_tensor_subbuffer(ctx, src2, op_supports_incontiguous) : vk_subbuffer{};
|
||||
vk_subbuffer src3_buf = use_src3 ? ggml_vk_tensor_subbuffer(ctx, src3, op_supports_incontiguous) : vk_subbuffer{};
|
||||
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, op_supports_incontiguous);
|
||||
vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0, true);
|
||||
vk_subbuffer src1_buf = use_src1 ? ggml_vk_tensor_subbuffer(ctx, src1, true) : vk_subbuffer{};
|
||||
vk_subbuffer src2_buf = use_src2 ? ggml_vk_tensor_subbuffer(ctx, src2, true) : vk_subbuffer{};
|
||||
vk_subbuffer src3_buf = use_src3 ? ggml_vk_tensor_subbuffer(ctx, src3, true) : vk_subbuffer{};
|
||||
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true);
|
||||
|
||||
// Compute misalignment offset for descriptors and store it in in push constants.
|
||||
init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, src3, dst);
|
||||
|
||||
std::array<uint32_t, 3> elements;
|
||||
|
||||
// Single call if dimension 2 is contiguous
|
||||
GGML_ASSERT(op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1))));
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
@@ -10143,6 +10133,104 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
uint32_t ncols = src0->ne[0];
|
||||
uint32_t nrows = ggml_nrows(src0);
|
||||
uint32_t k = dst->ne[0];
|
||||
|
||||
vk_op_topk_push_constants pc { ncols, ncols, k, nrows, 0, 0 };
|
||||
|
||||
// Reserve space for ivec2 per element, double buffered
|
||||
const size_t dbl_buf_size = size_t{ncols} * nrows * 2 * sizeof(int);
|
||||
const size_t x_sz = dbl_buf_size * 2;
|
||||
uint32_t dbl_buf_index = 0;
|
||||
|
||||
if (ctx->prealloc_size_x < x_sz) {
|
||||
ctx->prealloc_size_x = x_sz;
|
||||
ggml_vk_preallocate_buffers(ctx, subctx);
|
||||
}
|
||||
if (ctx->prealloc_x_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
std::array<uint32_t, 3> elements;
|
||||
elements[1] = std::min(nrows, ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
||||
elements[2] = 1;
|
||||
|
||||
uint32_t num_elements = ncols;
|
||||
|
||||
// Each iteration reduces a workgroup's worth of elements down to the K
|
||||
// largest elements. Repeat until we have the top K elements.
|
||||
// Need to do at least one iteration to write out the results.
|
||||
bool done_one_iter = false;
|
||||
while (num_elements > k || !done_one_iter) {
|
||||
done_one_iter = true;
|
||||
|
||||
// Prefer going as small as num_topk_pipelines - 3 for perf reasons.
|
||||
// But if K is larger, then we need a larger workgroup
|
||||
uint32_t max_pipeline = num_topk_pipelines - 3;
|
||||
uint32_t min_pipeline = (uint32_t)log2f(float(k)) + 1;
|
||||
// require full subgroup
|
||||
min_pipeline = std::max(min_pipeline, ctx->device->subgroup_size_log2);
|
||||
|
||||
uint32_t pipeline_idx = (uint32_t)ceilf(log2f(float(num_elements)));
|
||||
pipeline_idx = std::min(pipeline_idx, max_pipeline);
|
||||
pipeline_idx = std::max(pipeline_idx, min_pipeline);
|
||||
|
||||
if (num_elements > (1u << pipeline_idx)) {
|
||||
// If we could finish on this loop iteration (i.e. a single workgroup)
|
||||
// then do so. It's better than the overhead of another pass.
|
||||
for (uint32_t i = pipeline_idx; i < num_topk_pipelines; ++i) {
|
||||
if (num_elements <= (1u << i)) {
|
||||
pipeline_idx = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vk_pipeline pipeline = ctx->device->pipeline_topk_f32[pipeline_idx];
|
||||
// If the device doesn't support a pipeline this large, use smaller
|
||||
while (!pipeline) {
|
||||
pipeline_idx--;
|
||||
GGML_ASSERT(pipeline_idx >= min_pipeline);
|
||||
pipeline = ctx->device->pipeline_topk_f32[pipeline_idx];
|
||||
}
|
||||
|
||||
vk_op_topk_push_constants pc2 = pc;
|
||||
pc2.ncols_input = num_elements;
|
||||
|
||||
// Number of elements remaining after this pass
|
||||
uint32_t num_dst_elements = (num_elements / pipeline->wg_denoms[0]) * k + std::min(k, num_elements % pipeline->wg_denoms[0]);
|
||||
|
||||
vk_subbuffer src_buf;
|
||||
vk_subbuffer dst_buf;
|
||||
|
||||
if (num_elements == ncols) {
|
||||
pc2.first_pass = 1;
|
||||
src_buf = ggml_vk_tensor_subbuffer(ctx, src0);
|
||||
} else {
|
||||
src_buf = { ctx->prealloc_x, dbl_buf_index * dbl_buf_size, dbl_buf_size };
|
||||
}
|
||||
if (num_dst_elements == k) {
|
||||
pc2.last_pass = 1;
|
||||
dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
|
||||
} else {
|
||||
dst_buf = { ctx->prealloc_x, (dbl_buf_index ^ 1) * dbl_buf_size, dbl_buf_size };
|
||||
}
|
||||
|
||||
elements[0] = num_elements;
|
||||
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc2, elements);
|
||||
num_elements = num_dst_elements;
|
||||
dbl_buf_index ^= 1;
|
||||
if (num_elements > k) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
}
|
||||
ctx->prealloc_x_need_sync = true;
|
||||
}
|
||||
|
||||
static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0));
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p);
|
||||
@@ -11755,6 +11843,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
ggml_vk_argsort(ctx, compute_ctx, src0, node);
|
||||
}
|
||||
|
||||
break;
|
||||
case GGML_OP_TOP_K:
|
||||
ggml_vk_topk(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_SUM:
|
||||
ggml_vk_sum(ctx, compute_ctx, src0, node);
|
||||
@@ -12963,7 +13055,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->fused_ops_write_mask = 0;
|
||||
}
|
||||
|
||||
ctx->prealloc_size_add_rms_partials = std::max(ctx->prealloc_size_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset);
|
||||
ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
@@ -13026,24 +13117,6 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
return false;
|
||||
};
|
||||
|
||||
// This function tries to reorder the graph to allow nodes to run in parallel.
|
||||
// This helps with small batches, but for large batches its a slowdown, probably
|
||||
// due to cache contention. So only reorder if the majority of nodes have few rows.
|
||||
int num_small_nodes = 0;
|
||||
int num_counted_nodes = 0;
|
||||
for (int i = 0; i < graph->n_nodes; ++i) {
|
||||
if (!is_empty(graph->nodes[i]) &&
|
||||
graph->nodes[i]->op != GGML_OP_SET_ROWS) {
|
||||
if (ggml_nrows(graph->nodes[i]) <= 8) {
|
||||
num_small_nodes++;
|
||||
}
|
||||
num_counted_nodes++;
|
||||
}
|
||||
}
|
||||
if (num_small_nodes < num_counted_nodes / 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<ggml_tensor *> new_order;
|
||||
std::vector<bool> used(graph->n_nodes, false);
|
||||
std::set<ggml_tensor *> used_node_set;
|
||||
@@ -13762,15 +13835,17 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SILU_BACK:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_LOG:
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_ARGSORT:
|
||||
@@ -13787,19 +13862,47 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return op->ne[0] <= (1 << device->max_workgroup_size_log2);
|
||||
}
|
||||
}
|
||||
case GGML_OP_TOP_K:
|
||||
{
|
||||
if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) {
|
||||
return false;
|
||||
}
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
auto device = ggml_vk_get_device(ctx->device);
|
||||
// We could potentially support larger, using argsort to sort the
|
||||
// whole thing. Not clear if this is needed.
|
||||
uint32_t min_pipeline = (uint32_t)log2f(float(op->ne[0])) + 1;
|
||||
if (min_pipeline >= num_topk_pipelines ||
|
||||
!device->pipeline_topk_f32[min_pipeline]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_ACC:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONCAT:
|
||||
return ggml_type_size(op->src[0]->type) == ggml_type_size(GGML_TYPE_F32);
|
||||
case GGML_OP_ADD1:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32)
|
||||
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32)
|
||||
|| (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16);
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_FILL:
|
||||
return op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SCALE:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ROLL:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32
|
||||
&& (!op->src[1] || (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16));
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
return true;
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32
|
||||
&& ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
@@ -13814,15 +13917,27 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_ARGMAX:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_I32
|
||||
&& ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_I32;
|
||||
case GGML_OP_IM2COL:
|
||||
return ggml_is_contiguous(op->src[1])
|
||||
&& op->src[1]->type == GGML_TYPE_F32
|
||||
&& (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
||||
case GGML_OP_IM2COL_3D:
|
||||
return op->src[1]->type == GGML_TYPE_F32
|
||||
&& (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16)
|
||||
&& op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_POOL_2D:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
return true;
|
||||
return true; // all inputs are contiguous, see ggml.c
|
||||
case GGML_OP_SSM_SCAN:
|
||||
{
|
||||
for (int i = 0; i < 6; i++) {
|
||||
@@ -13863,7 +13978,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_SSM_CONV:
|
||||
return true;
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_2D:
|
||||
@@ -14459,6 +14574,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_ARGSORT) {
|
||||
tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_TOP_K) {
|
||||
tensor_clone = ggml_top_k(ggml_ctx, src_clone[0], tensor->ne[0]);
|
||||
} else if (tensor->op == GGML_OP_SUM) {
|
||||
tensor_clone = ggml_sum(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_SUM_ROWS) {
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
#version 450
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
layout(constant_id = 0) const int BLOCK_SIZE = 1024;
|
||||
layout(constant_id = 1) const int NCOLS_PADDED_LOG2 = 10;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
// Input can either be the source (A) or intermediate values (S).
|
||||
// Similarly, output can be either destination (D) or intermediate values (S).
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 0) readonly buffer S {ivec2 data_s[];};
|
||||
layout (binding = 1) writeonly buffer D {int data_d[];};
|
||||
layout (binding = 1) writeonly buffer T {ivec2 data_t[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint orig_ncols;
|
||||
uint ncols_input;
|
||||
uint ncols_output;
|
||||
uint nrows;
|
||||
uint first_pass;
|
||||
uint last_pass;
|
||||
} p;
|
||||
|
||||
// pairs of (gid, value)
|
||||
shared ivec2 dst_row[BLOCK_SIZE];
|
||||
|
||||
void topk(bool needs_bounds_check, const uint row) {
|
||||
const int col = int(gl_LocalInvocationID.x);
|
||||
|
||||
// initialize indices
|
||||
if (gl_GlobalInvocationID.x < p.ncols_input) {
|
||||
if (p.first_pass != 0) {
|
||||
const uint row_offset = row * p.ncols_input;
|
||||
dst_row[col] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
|
||||
} else {
|
||||
const uint row_offset = row * p.orig_ncols;
|
||||
dst_row[col] = data_s[row_offset + gl_GlobalInvocationID.x];
|
||||
}
|
||||
} else {
|
||||
dst_row[col] = ivec2(p.orig_ncols, 0);
|
||||
}
|
||||
barrier();
|
||||
|
||||
if (p.ncols_output == 1) {
|
||||
// Fast path for single output - just do a max reduction
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
|
||||
if (col < s) {
|
||||
ivec2 a = dst_row[col];
|
||||
ivec2 b = dst_row[col + s];
|
||||
if (a.x >= p.orig_ncols ||
|
||||
b.x < p.orig_ncols && b.y > a.y) {
|
||||
dst_row[col] = b;
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
} else {
|
||||
// bitonic sort on this group of elements
|
||||
uint num_outer_loop_iters = NCOLS_PADDED_LOG2;
|
||||
for (uint k = 2, outer_idx = 0; outer_idx < num_outer_loop_iters; k *= 2, outer_idx++) {
|
||||
uint num_inner_loop_iters = outer_idx + 1;
|
||||
for (uint j = k / 2, inner_idx = 0; inner_idx < num_inner_loop_iters; j /= 2, inner_idx++) {
|
||||
const int ixj = int(col ^ j);
|
||||
|
||||
int idx_0 = (col & k) == 0 ? col : ixj;
|
||||
int idx_1 = (col & k) == 0 ? ixj : col;
|
||||
|
||||
ivec2 sh_idx_0 = dst_row[idx_0];
|
||||
ivec2 sh_idx_1 = dst_row[idx_1];
|
||||
bool idx_0_oob = needs_bounds_check ? sh_idx_0.x >= p.orig_ncols : false;
|
||||
bool idx_1_oob = needs_bounds_check ? sh_idx_1.x >= p.orig_ncols : false;
|
||||
|
||||
if ((idx_0_oob ||
|
||||
(!idx_1_oob && intBitsToFloat(sh_idx_0.y) < intBitsToFloat(sh_idx_1.y))) && (ixj > col)) {
|
||||
dst_row[idx_0] = sh_idx_1;
|
||||
dst_row[idx_1] = sh_idx_0;
|
||||
}
|
||||
|
||||
barrier();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (col < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
|
||||
if (p.last_pass != 0) {
|
||||
const uint row_offset = row * p.ncols_output;
|
||||
data_d[row_offset + col] = dst_row[col].x;
|
||||
} else {
|
||||
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
|
||||
data_t[row_offset + col] = dst_row[col];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
// Fast path for fully occupied workgroups
|
||||
if ((p.ncols_input % BLOCK_SIZE) == 0) {
|
||||
uint row = gl_WorkGroupID.y;
|
||||
while (row < p.nrows) {
|
||||
topk(false, row);
|
||||
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
|
||||
}
|
||||
} else {
|
||||
uint row = gl_WorkGroupID.y;
|
||||
while (row < p.nrows) {
|
||||
topk(true, row);
|
||||
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,199 @@
|
||||
#version 450
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#extension GL_KHR_shader_subgroup_ballot : enable
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : enable
|
||||
#extension GL_KHR_shader_subgroup_shuffle : enable
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
layout(constant_id = 0) const int BLOCK_SIZE = 1024;
|
||||
layout(constant_id = 1) const int SUBGROUP_SIZE = 32;
|
||||
layout(constant_id = 2) const int SUBGROUP_SIZE_LOG2 = 5;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
// Input can either be the source (A) or intermediate values (S).
|
||||
// Similarly, output can be either destination (D) or intermediate values (S).
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 0) readonly buffer S {ivec2 data_s[];};
|
||||
layout (binding = 1) writeonly buffer D {int data_d[];};
|
||||
layout (binding = 1) writeonly buffer T {ivec2 data_t[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint orig_ncols;
|
||||
uint ncols_input;
|
||||
uint ncols_output;
|
||||
uint nrows;
|
||||
uint first_pass;
|
||||
uint last_pass;
|
||||
} p;
|
||||
|
||||
// pairs of (gid, value)
|
||||
shared ivec2 dst_row[BLOCK_SIZE];
|
||||
|
||||
shared int counts[SUBGROUP_SIZE];
|
||||
shared int sh_min_idx;
|
||||
shared uint sh_total;
|
||||
shared uint offset_partials[BLOCK_SIZE / SUBGROUP_SIZE];
|
||||
|
||||
// Map float values to uint such that comparisons still work.
|
||||
// Positive values set the high bit, negative values are inverted.
|
||||
// +0.0 -> 0x80000000, -0.0 -> 0x7FFFFFFF are in the correct places.
|
||||
uint f2ui(float x) {
|
||||
uint y = floatBitsToUint(x);
|
||||
if ((y & 0x80000000) != 0) {
|
||||
y ^= ~0;
|
||||
} else {
|
||||
y |= 0x80000000;
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
void topk(const uint row) {
|
||||
const int tid = int(gl_LocalInvocationID.x);
|
||||
|
||||
// initialize indices
|
||||
if (gl_GlobalInvocationID.x < p.ncols_input) {
|
||||
if (p.first_pass != 0) {
|
||||
const uint row_offset = row * p.ncols_input;
|
||||
dst_row[tid] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
|
||||
} else {
|
||||
const uint row_offset = row * p.orig_ncols;
|
||||
dst_row[tid] = data_s[row_offset + gl_GlobalInvocationID.x];
|
||||
}
|
||||
} else {
|
||||
dst_row[tid] = ivec2(p.orig_ncols, 0xFF800000); // -inf
|
||||
}
|
||||
barrier();
|
||||
|
||||
if (p.ncols_output == 1) {
|
||||
// Fast path for single output - just do a max reduction
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
|
||||
if (tid < s) {
|
||||
ivec2 a = dst_row[tid];
|
||||
ivec2 b = dst_row[tid + s];
|
||||
if (a.x >= p.orig_ncols ||
|
||||
b.x < p.orig_ncols && b.y > a.y) {
|
||||
dst_row[tid] = b;
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
} else {
|
||||
// Do an N-ary search to find the K-th largest value.
|
||||
// We remap the float values to be comparable as unsigned integers,
|
||||
// and split the range into 2^N smaller ranges where N is the
|
||||
// subgroup size. Count how many values are in each range, if the K-th
|
||||
// largest value is in the middle of one of thee ranges then repeat
|
||||
// and split again.
|
||||
|
||||
// Mask is the current set of bits we're searching. Shift is the LSB index.
|
||||
int shift = 32 - SUBGROUP_SIZE_LOG2;
|
||||
uint mask = ((1 << SUBGROUP_SIZE_LOG2) - 1) << shift;
|
||||
|
||||
// The current range.
|
||||
uint range_min = 0;
|
||||
uint range_max = 0xFF800000;
|
||||
// How many are above the current range, and how many we need to find.
|
||||
uint total = 0;
|
||||
uint limit = min(p.ncols_output, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);
|
||||
|
||||
while (mask != 0) {
|
||||
barrier();
|
||||
// Initialize bucket counts to zero.
|
||||
if (tid < SUBGROUP_SIZE) {
|
||||
counts[tid] = 0;
|
||||
}
|
||||
barrier();
|
||||
// Count how many values are in each bucket.
|
||||
if (tid < p.ncols_input) {
|
||||
float y = intBitsToFloat(dst_row[tid].y);
|
||||
uint fy = f2ui(y);
|
||||
if (fy >= range_min && fy < range_max) {
|
||||
uint bucket = (fy & mask) >> shift;
|
||||
atomicAdd(counts[bucket], 1);
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
|
||||
// On the first subgroup, do a scan to count (from the top down) how
|
||||
// many elements are in the top N buckets. Find the index of the first
|
||||
// that is over the limit. Copy it to the other invocations through
|
||||
// shared memory.
|
||||
if (tid < SUBGROUP_SIZE) {
|
||||
uint partial_sum = counts[SUBGROUP_SIZE - 1 - tid];
|
||||
partial_sum = subgroupInclusiveAdd(partial_sum) + total;
|
||||
uint t = subgroupBallotFindLSB(subgroupBallot(partial_sum >= limit));
|
||||
if (tid == t) {
|
||||
sh_min_idx = int(SUBGROUP_SIZE - 1 - t);
|
||||
sh_total = partial_sum;
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
int min_idx = sh_min_idx;
|
||||
total = sh_total;
|
||||
|
||||
// Update the range, and break if we've found the K-th largest.
|
||||
range_max = range_min + ((min_idx + 1) << shift);
|
||||
range_min = range_min + (min_idx << shift);
|
||||
|
||||
if (total == p.ncols_output) {
|
||||
break;
|
||||
}
|
||||
total -= counts[min_idx];
|
||||
mask >>= SUBGROUP_SIZE_LOG2;
|
||||
shift -= SUBGROUP_SIZE_LOG2;
|
||||
if (shift < 0) {
|
||||
shift = 0;
|
||||
}
|
||||
}
|
||||
|
||||
ivec2 v = dst_row[tid];
|
||||
|
||||
// We need to compact these values to the start of the dst_row array.
|
||||
// Have each subgroup count how many items it'll store, so other
|
||||
// subgroups can compute their base offset.
|
||||
bool top = f2ui(intBitsToFloat(v.y)) >= range_min;
|
||||
uvec4 b = subgroupBallot(top);
|
||||
uint bit_count = subgroupBallotBitCount(b);
|
||||
if ((tid % SUBGROUP_SIZE) == 0) {
|
||||
offset_partials[tid / SUBGROUP_SIZE] = bit_count;
|
||||
}
|
||||
barrier();
|
||||
|
||||
uint out_idx = 0;
|
||||
[[unroll]] for (int i = 0; i < BLOCK_SIZE / SUBGROUP_SIZE; ++i) {
|
||||
if (i < tid / SUBGROUP_SIZE) {
|
||||
out_idx += offset_partials[i];
|
||||
}
|
||||
}
|
||||
|
||||
uint bit_count_ex = subgroupBallotExclusiveBitCount(b);
|
||||
if (top) {
|
||||
// TODO: Copy directly to the output?
|
||||
dst_row[out_idx + bit_count_ex] = v;
|
||||
}
|
||||
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (tid < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
|
||||
if (p.last_pass != 0) {
|
||||
const uint row_offset = row * p.ncols_output;
|
||||
data_d[row_offset + tid] = dst_row[tid].x;
|
||||
} else {
|
||||
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
|
||||
data_t[row_offset + tid] = dst_row[tid];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
uint row = gl_WorkGroupID.y;
|
||||
while (row < p.nrows) {
|
||||
topk(row);
|
||||
row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
|
||||
}
|
||||
}
|
||||
@@ -913,6 +913,9 @@ void process_shaders() {
|
||||
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
||||
string_to_spv("argsort_large_f32", "argsort_large.comp", {{"A_TYPE", "float"}});
|
||||
|
||||
string_to_spv("topk_argsort_f32", "topk_argsort.comp", {{"A_TYPE", "float"}});
|
||||
string_to_spv("topk_nary_search_f32", "topk_nary_search.comp", {{"A_TYPE", "float"}});
|
||||
|
||||
string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}}));
|
||||
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
|
||||
|
||||
@@ -19,6 +19,11 @@ import gguf
|
||||
logger = logging.getLogger("gguf-convert-endian")
|
||||
|
||||
|
||||
def byteswap_noop(tensor, block_offs):
|
||||
# this function is used when byteswapping is not needed
|
||||
pass
|
||||
|
||||
|
||||
def byteswap_q4_0(tensor, block_offs):
|
||||
# Each block_q4_0 consists of an f16 delta (scaling factor) followed by 16 int8 quantizations.
|
||||
|
||||
@@ -55,22 +60,11 @@ def byteswap_q6_k(tensor, block_offs):
|
||||
|
||||
|
||||
byteswap_tensors = {
|
||||
gguf.GGMLQuantizationType.Q4_0: {
|
||||
"block_size": 18, # 18 bytes = <f16 delta scaling factor> + 16 * <int8 quant>
|
||||
"byteswap_func": byteswap_q4_0,
|
||||
},
|
||||
gguf.GGMLQuantizationType.Q8_0: {
|
||||
"block_size": 34, # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
|
||||
"byteswap_func": byteswap_q8_0,
|
||||
},
|
||||
gguf.GGMLQuantizationType.Q4_K: {
|
||||
"block_size": 144, # 144 bytes = 2 * <f16 delta scaling factor> + 140 * <int8 quant>
|
||||
"byteswap_func": byteswap_q4_k,
|
||||
},
|
||||
gguf.GGMLQuantizationType.Q6_K: {
|
||||
"block_size": 210, # 210 bytes = <f16 delta scaling factor> + 208 * <int8 quant>
|
||||
"byteswap_func": byteswap_q6_k,
|
||||
},
|
||||
gguf.GGMLQuantizationType.Q4_0: byteswap_q4_0,
|
||||
gguf.GGMLQuantizationType.Q8_0: byteswap_q8_0,
|
||||
gguf.GGMLQuantizationType.Q4_K: byteswap_q4_k,
|
||||
gguf.GGMLQuantizationType.Q6_K: byteswap_q6_k,
|
||||
gguf.GGMLQuantizationType.MXFP4: byteswap_noop,
|
||||
}
|
||||
|
||||
|
||||
@@ -135,8 +129,8 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
|
||||
|
||||
tensor.data.resize(newshape)
|
||||
|
||||
block_size = byteswap_tensors[tensor.tensor_type]["block_size"]
|
||||
byteswap_func = byteswap_tensors[tensor.tensor_type]["byteswap_func"]
|
||||
block_size = gguf.constants.GGML_QUANT_SIZES[tensor.tensor_type][1]
|
||||
byteswap_func = byteswap_tensors[tensor.tensor_type]
|
||||
|
||||
n_blocks = len(tensor.data) // block_size
|
||||
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
|
||||
|
||||
@@ -7635,6 +7635,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2, 8, 8192, 1}, order)); // bailingmoe2 (group selection)
|
||||
}
|
||||
|
||||
for (int i = 0; i < 20; ++i) {
|
||||
for (int k : {1, 2, 3, 7, 15, 100, 500, 1023, 9999}) {
|
||||
if (k <= 1<<i) {
|
||||
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {(1<<i), 1, 1, 1}, k));
|
||||
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {(1<<i) + 11, 1, 2, 1}, k));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int k : {1, 2, 3, 7, 15}) {
|
||||
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {16, 10, 10, 10}, k));
|
||||
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {60, 10, 10, 10}, k));
|
||||
@@ -8032,7 +8040,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {65000, 16, 1, 1}));
|
||||
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {65000, 16, 1, 1}, 40));
|
||||
for (auto k : {1, 10, 40}) {
|
||||
for (auto nrows : {1, 16}) {
|
||||
for (auto cols : {k, 1000, 65000, 200000}) {
|
||||
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {cols, nrows, 1, 1}, k));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
+2
-1
@@ -1175,10 +1175,11 @@ struct clip_graph {
|
||||
cb(K, "resampler_K", -1);
|
||||
cb(V, "resampler_V", -1);
|
||||
|
||||
float resampler_kq_scale = 1.0f/ sqrtf(float(d_head));
|
||||
embeddings = build_attn(
|
||||
model.mm_model_attn_o_w,
|
||||
model.mm_model_attn_o_b,
|
||||
Q, K, V, nullptr, kq_scale, -1);
|
||||
Q, K, V, nullptr, resampler_kq_scale, -1);
|
||||
cb(embeddings, "resampler_attn_out", -1);
|
||||
}
|
||||
// layernorm
|
||||
|
||||
+25
-16
@@ -30,9 +30,10 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| -------- | ----------- |
|
||||
| `-h, --help, --usage` | print usage and exit |
|
||||
| `--version` | show version and build info |
|
||||
| `-cl, --cache-list` | show list of models in cache |
|
||||
| `--completion-bash` | print source-able bash completion script for llama.cpp |
|
||||
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
|
||||
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||
| `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
||||
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
||||
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
|
||||
@@ -51,7 +52,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
|
||||
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_SPLIT) |
|
||||
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
|
||||
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||
| `--no-escape` | do not process escape sequences |
|
||||
@@ -61,11 +62,12 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
|
||||
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
|
||||
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
|
||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
|
||||
| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) |
|
||||
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
|
||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
@@ -78,7 +80,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--override-tensor, -ot <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
|
||||
| `--cpu-moe, -cmoe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
|
||||
| `--n-cpu-moe, -ncmoe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
|
||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
|
||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
|
||||
@@ -92,6 +94,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
|
||||
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
|
||||
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
|
||||
@@ -100,7 +103,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||
| `--log-disable` | Log disable |
|
||||
| `--log-file FNAME` | Log to file |
|
||||
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
|
||||
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
|
||||
@@ -151,7 +154,8 @@ The project is under active development, and we are [looking for feedback and co
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `--swa-checkpoints N` | max number of SWA checkpoints per slot to create (default: 3)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_SWA_CHECKPOINTS) |
|
||||
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
|
||||
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
||||
| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
||||
| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
||||
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
|
||||
@@ -165,6 +169,8 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
|
||||
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
|
||||
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
||||
| `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
|
||||
| `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
|
||||
@@ -189,13 +195,14 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
|
||||
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
|
||||
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
|
||||
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
|
||||
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
|
||||
| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
|
||||
| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
|
||||
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
|
||||
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
|
||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
|
||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
|
||||
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
|
||||
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
|
||||
| `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
|
||||
@@ -209,15 +216,17 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
|
||||
| `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
|
||||
| `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
|
||||
| `--embd-bge-small-en-default` | use default bge-small-en-v1.5 model (note: can download weights from the internet) |
|
||||
| `--embd-e5-small-en-default` | use default e5-small-v2 model (note: can download weights from the internet) |
|
||||
| `--embd-gte-small-default` | use default gte-small model (note: can download weights from the internet) |
|
||||
| `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) |
|
||||
| `--fim-qwen-1.5b-default` | use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet) |
|
||||
| `--fim-qwen-3b-default` | use default Qwen 2.5 Coder 3B (note: can download weights from the internet) |
|
||||
| `--fim-qwen-7b-default` | use default Qwen 2.5 Coder 7B (note: can download weights from the internet) |
|
||||
| `--fim-qwen-7b-spec` | use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet) |
|
||||
| `--fim-qwen-14b-spec` | use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet) |
|
||||
| `--fim-qwen-30b-default` | use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet) |
|
||||
| `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) |
|
||||
| `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) |
|
||||
| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) |
|
||||
| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) |
|
||||
|
||||
|
||||
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
|
||||
|
||||
@@ -205,6 +205,8 @@ class ServerProcess:
|
||||
server_args.append("--no-webui")
|
||||
if self.jinja:
|
||||
server_args.append("--jinja")
|
||||
else:
|
||||
server_args.append("--no-jinja")
|
||||
if self.reasoning_format is not None:
|
||||
server_args.extend(("--reasoning-format", self.reasoning_format))
|
||||
if self.reasoning_budget is not None:
|
||||
|
||||
Vendored
+16
-5
@@ -31,13 +31,16 @@ if (LLAMA_BUILD_BORINGSSL)
|
||||
|
||||
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
boringssl
|
||||
set(BORINGSSL_ARGS
|
||||
GIT_REPOSITORY ${BORINGSSL_GIT}
|
||||
GIT_TAG ${BORINGSSL_VERSION}
|
||||
PATCH_COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/patch-boringssl.cmake"
|
||||
)
|
||||
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.28)
|
||||
list(APPEND BORINGSSL_ARGS EXCLUDE_FROM_ALL)
|
||||
endif()
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(boringssl ${BORINGSSL_ARGS})
|
||||
|
||||
set(SAVED_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
|
||||
set(SAVED_BUILD_TESTING ${BUILD_TESTING})
|
||||
@@ -45,7 +48,15 @@ if (LLAMA_BUILD_BORINGSSL)
|
||||
set(BUILD_SHARED_LIBS OFF)
|
||||
set(BUILD_TESTING OFF)
|
||||
|
||||
FetchContent_MakeAvailable(boringssl)
|
||||
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.28)
|
||||
FetchContent_MakeAvailable(boringssl)
|
||||
else()
|
||||
FetchContent_GetProperties(boringssl)
|
||||
if(NOT boringssl_POPULATED)
|
||||
FetchContent_Populate(boringssl)
|
||||
add_subdirectory(${boringssl_SOURCE_DIR} ${boringssl_BINARY_DIR} EXCLUDE_FROM_ALL)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(BUILD_SHARED_LIBS ${SAVED_BUILD_SHARED_LIBS})
|
||||
set(BUILD_TESTING ${SAVED_BUILD_TESTING})
|
||||
|
||||
-6
@@ -1,6 +0,0 @@
|
||||
# Remove bssl
|
||||
file(READ "CMakeLists.txt" content)
|
||||
string(REPLACE "add_executable(bssl" "#add_executable(bssl" content "${content}")
|
||||
string(REPLACE "target_link_libraries(bssl" "#target_link_libraries(bssl" content "${content}")
|
||||
string(REPLACE "install(TARGETS bssl" "#install(TARGETS bssl" content "${content}")
|
||||
file(WRITE "CMakeLists.txt" "${content}")
|
||||
Reference in New Issue
Block a user