mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-11 08:16:44 +02:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a84ab1da8d | |||
| 5743ca8092 | |||
| 412c60e473 | |||
| 6769e944c7 | |||
| cbebf61ca7 |
+9
-5
@@ -75,6 +75,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
@@ -225,6 +226,14 @@ if (LLAMA_BLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
cmake_minimum_required(VERSION 3.17)
|
||||
|
||||
@@ -289,11 +298,6 @@ if (LLAMA_METAL)
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
|
||||
@@ -43,20 +43,15 @@ endif
|
||||
|
||||
# keep standard at C11 and C++11
|
||||
# -Ofast tends to produce faster code, but may not be available for some compilers.
|
||||
#OPT = -Ofast
|
||||
ifdef LLAMA_FAST
|
||||
OPT = -Ofast
|
||||
else
|
||||
OPT = -O3
|
||||
endif
|
||||
CFLAGS = -I. $(OPT) -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
|
||||
# clock_gettime came in POSIX.1b (1993)
|
||||
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||
# posix_memalign came in POSIX.1-2001 / SUSv3
|
||||
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/pull/1027
|
||||
CFLAGS += -D_XOPEN_SOURCE=600
|
||||
CXXFLAGS += -D_XOPEN_SOURCE=600
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
CFLAGS += -O0 -g
|
||||
CXXFLAGS += -O0 -g
|
||||
@@ -139,6 +134,10 @@ ifndef LLAMA_NO_K_QUANTS
|
||||
CFLAGS += -DGGML_USE_K_QUANTS
|
||||
CXXFLAGS += -DGGML_USE_K_QUANTS
|
||||
OBJS += k_quants.o
|
||||
ifdef LLAMA_QKK_64
|
||||
CFLAGS += -DGGML_QKK_64
|
||||
CXXFLAGS += -DGGML_QKK_64
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
|
||||
@@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
|
||||
- New roadmap: https://github.com/users/ggerganov/projects/7
|
||||
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
|
||||
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
|
||||
|
||||
+319
-52
@@ -117,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
|
||||
|
||||
//================================= k-quants
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
#define QK_K 64
|
||||
#define K_SCALE_SIZE 4
|
||||
#else
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||
@@ -128,13 +134,25 @@ typedef struct {
|
||||
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8];
|
||||
uint8_t qs[QK_K/4]; // nibbles / quants
|
||||
uint8_t scales[3*QK_K/64];
|
||||
half d;
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
#ifdef GGML_QKK_64
|
||||
uint8_t scales[2]; // scales, quantized with 8 bits
|
||||
#else
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
||||
#endif
|
||||
half d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
|
||||
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
half d[2]; // super-block scales/mins
|
||||
uint8_t scales[2]; // 4-bit block scales/mins
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
@@ -142,15 +160,26 @@ typedef struct {
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||
#endif
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||
half d; // super-block scale
|
||||
int8_t scales[QK_K/16]; // block scales
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
@@ -349,13 +378,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
||||
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int n = tid/32;
|
||||
const int l = tid - 32*n;
|
||||
const int is = 8*n + l/16;
|
||||
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const uint8_t q = x[i].qs[32*n + l];
|
||||
float * y = yy + i*QK_K + 128*n;
|
||||
|
||||
@@ -365,21 +395,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
||||
#else
|
||||
const int is = tid/16; // 0 or 1
|
||||
const int il = tid%16; // 0...15
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
float * y = yy + i*QK_K + 16*is + il;
|
||||
float dall = x[i].d;
|
||||
float dmin = x[i].dmin;
|
||||
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
||||
|
||||
int r = threadIdx.x/4;
|
||||
int i = blockIdx.x;
|
||||
int tid = r/2;
|
||||
int is0 = r%2;
|
||||
int l0 = 16*is0 + 4*(threadIdx.x%4);
|
||||
int n = tid / 4;
|
||||
int j = tid - 4*n;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
#if QK_K == 256
|
||||
const int r = threadIdx.x/4;
|
||||
const int tid = r/2;
|
||||
const int is0 = r%2;
|
||||
const int l0 = 16*is0 + 4*(threadIdx.x%4);
|
||||
const int n = tid / 4;
|
||||
const int j = tid - 4*n;
|
||||
|
||||
uint8_t m = 1 << (4*n + j);
|
||||
int is = 8*n + 2*j + is0;
|
||||
int shift = 2*j;
|
||||
@@ -396,9 +437,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
||||
const uint8_t * hm = x[i].hmask;
|
||||
|
||||
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
||||
#else
|
||||
const int tid = threadIdx.x;
|
||||
const int is = tid/16; // 0 or 1
|
||||
const int il = tid%16; // 0...15
|
||||
const int im = il/8; // 0...1
|
||||
const int in = il%8; // 0...7
|
||||
|
||||
float * y = yy + i*QK_K + 16*is + il;
|
||||
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
||||
const float d = (float)x[i].d;
|
||||
|
||||
if (is == 0) {
|
||||
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
||||
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
||||
} else {
|
||||
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
||||
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||
if (j < 4) {
|
||||
d = q[j] & 63; m = q[j + 4] & 63;
|
||||
@@ -407,19 +470,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
|
||||
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
//// assume 64 threads - this is very slightly better than the one below
|
||||
//const int tid = threadIdx.x;
|
||||
//const int il = tid/16;
|
||||
//const int ir = tid%16;
|
||||
//const int is = 2*il;
|
||||
//const int n = 2;
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8;
|
||||
@@ -443,6 +501,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
||||
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
||||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x;
|
||||
const uint8_t * q = x[i].qs;
|
||||
float * y = yy + i*QK_K;
|
||||
const float d = (float)x[i].d[0];
|
||||
const float m = (float)x[i].d[1];
|
||||
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
||||
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
||||
@@ -450,6 +517,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/16; // il is in 0...3
|
||||
@@ -476,12 +544,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
||||
hm <<= 1;
|
||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
#else
|
||||
const int tid = threadIdx.x;
|
||||
const uint8_t q = x[i].qs[tid];
|
||||
const int im = tid/8; // 0...3
|
||||
const int in = tid%8; // 0...7
|
||||
const int is = tid/16; // 0 or 1
|
||||
const uint8_t h = x[i].qh[in] >> im;
|
||||
const float d = x[i].d;
|
||||
float * y = yy + i*QK_K + tid;
|
||||
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
||||
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
||||
#endif
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
#if QK_K == 256
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int tid = threadIdx.x;
|
||||
@@ -501,6 +582,24 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
||||
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
||||
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||
#else
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const int ip = tid/16; // 0 or 1
|
||||
const int il = tid - 16*ip; // 0...15
|
||||
|
||||
float * y = yy + i*QK_K + 16*ip + il;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const uint8_t ql = x[i].ql[16*ip + il];
|
||||
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
||||
const int8_t * sc = x[i].scales;
|
||||
|
||||
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
||||
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
||||
@@ -515,6 +614,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
||||
|
||||
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
|
||||
@@ -528,8 +630,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
||||
const int s_offset = 8*im;
|
||||
const int y_offset = 128*im + l0;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
uint32_t aux[4];
|
||||
const uint8_t * d = (const uint8_t *)aux;
|
||||
const uint8_t * m = (const uint8_t *)(aux + 2);
|
||||
@@ -565,6 +665,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
||||
tmp += dall * sum1 - dmin * sum2;
|
||||
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
||||
const int offset = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
uint32_t uaux[2];
|
||||
const uint8_t * d = (const uint8_t *)uaux;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + offset;
|
||||
const uint8_t * q = x[i].qs + offset;
|
||||
const uint32_t * s = (const uint32_t *)x[i].scales;
|
||||
|
||||
uaux[0] = s[0] & 0x0f0f0f0f;
|
||||
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
||||
|
||||
const half2 * dh = (const half2 *)&x[i].d;
|
||||
|
||||
const float2 dall = __half22float2(dh[0]);
|
||||
|
||||
float sum1 = 0, sum2 = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
const uint8_t ql = q[l];
|
||||
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
||||
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
||||
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
||||
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
||||
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
||||
}
|
||||
tmp += dall.x * sum1 - dall.y * sum2;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
__syncthreads();
|
||||
@@ -573,16 +706,13 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
if (threadIdx.x == 0) {
|
||||
dst[row] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
||||
|
||||
const uint16_t kmask1 = 0x0303;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
@@ -591,6 +721,13 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
||||
|
||||
const block_q3_K * x = (const block_q3_K *)vx + ib0;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const uint16_t kmask1 = 0x0303;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
|
||||
@@ -610,8 +747,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
||||
|
||||
const uint16_t s_shift = 4*im;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + y_offset;
|
||||
@@ -640,6 +775,34 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
||||
tmp += d * sum;
|
||||
|
||||
}
|
||||
#else
|
||||
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
||||
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
||||
const int in = offset/8; // 0 or 1
|
||||
const int im = offset%8; // 0...7
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + offset;
|
||||
const uint8_t * q = x[i].qs + offset;
|
||||
const uint8_t * s = x[i].scales;
|
||||
|
||||
const float dall = (float)x[i].d;
|
||||
|
||||
float sum = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
const uint8_t hl = x[i].hmask[im+l] >> in;
|
||||
const uint8_t ql = q[l];
|
||||
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
||||
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
||||
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
||||
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
__syncthreads();
|
||||
@@ -648,22 +811,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
if (threadIdx.x == 0) {
|
||||
dst[row] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
||||
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
||||
|
||||
#if QK_K == 256
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
|
||||
@@ -683,8 +849,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
||||
uint16_t aux[4];
|
||||
const uint8_t * sc = (const uint8_t *)aux;
|
||||
|
||||
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
@@ -713,6 +877,36 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
||||
tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
|
||||
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
||||
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
uint16_t aux16[2];
|
||||
const uint8_t * s = (const uint8_t *)aux16;
|
||||
|
||||
float tmp = 0;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
const uint8_t * q = x[i].qs + step;
|
||||
const float * y = yy + i*QK_K + step;
|
||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
const float d = (float)x[i].d[0];
|
||||
const float m = (float)x[i].d[1];
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
||||
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
||||
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
||||
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
__syncthreads();
|
||||
@@ -728,15 +922,19 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
|
||||
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
//const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x;
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
const block_q5_K * x = (const block_q5_K *)vx + ib0;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int tid = threadIdx.x/2; // 0...15
|
||||
const int ix = threadIdx.x%2;
|
||||
|
||||
@@ -757,10 +955,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
||||
uint16_t aux[4];
|
||||
const uint8_t * sc = (const uint8_t *)aux;
|
||||
|
||||
const block_q5_K * x = (const block_q5_K *)vx + ib0;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
|
||||
const uint8_t * ql1 = x[i].qs + q_offset;
|
||||
@@ -793,9 +987,32 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
||||
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
||||
}
|
||||
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
const int im = step/8;
|
||||
const int in = step%8;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
const uint8_t * q = x[i].qs + step;
|
||||
const int8_t * s = x[i].scales;
|
||||
const float * y = yy + i*QK_K + step;
|
||||
const float d = x[i].d;
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
const uint8_t h = x[i].qh[in+j] >> im;
|
||||
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
||||
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
||||
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
||||
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
@@ -803,7 +1020,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
if (threadIdx.x == 0) {
|
||||
dst[row] = tmp;
|
||||
}
|
||||
}
|
||||
@@ -820,6 +1037,8 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
|
||||
|
||||
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
|
||||
@@ -874,6 +1093,37 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
|
||||
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + step;
|
||||
const uint8_t * ql = x[i].ql + step;
|
||||
const uint8_t * qh = x[i].qh + step;
|
||||
const int8_t * s = x[i].scales;
|
||||
|
||||
const float d = x[i+0].d;
|
||||
|
||||
float sum = 0;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
||||
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
||||
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
||||
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
||||
}
|
||||
tmp += sum;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
@@ -1252,12 +1502,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
|
||||
|
||||
static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
@@ -1267,12 +1525,20 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu
|
||||
|
||||
static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
@@ -2553,6 +2819,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
|
||||
tensor->op == GGML_OP_VIEW;
|
||||
|
||||
+36
-30
@@ -51,21 +51,21 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q2_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q3_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q5_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_k);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(rope);
|
||||
GGML_METAL_DECL_KERNEL(alibi_f32);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
||||
@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
MTLCompileOptions* options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
#else
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
||||
#endif
|
||||
if (error) {
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
exit(1);
|
||||
@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
||||
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q2_k);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q3_k);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_k);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q5_k);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_k);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q2_K);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q3_K);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope);
|
||||
GGML_METAL_ADD_KERNEL(alibi_f32);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
||||
@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
|
||||
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
|
||||
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
|
||||
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
|
||||
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
|
||||
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
|
||||
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
|
||||
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
|
||||
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
|
||||
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
||||
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
||||
|
||||
+329
-85
@@ -428,7 +428,7 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith == 0) {
|
||||
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
dst[r1*ne0 + r0] = sum[0];
|
||||
}
|
||||
}
|
||||
@@ -497,7 +497,7 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith == 0) {
|
||||
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
dst[r1*ne0 + r0] = sum[0];
|
||||
}
|
||||
}
|
||||
@@ -775,47 +775,76 @@ kernel void kernel_cpy_f32_f32(
|
||||
|
||||
//============================================ k-quants ======================================================
|
||||
|
||||
#ifndef QK_K
|
||||
#define QK_K 256
|
||||
#else
|
||||
static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
|
||||
#endif
|
||||
|
||||
#if QK_K == 256
|
||||
#define K_SCALE_SIZE 12
|
||||
#else
|
||||
#define K_SCALE_SIZE 4
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||
uint8_t qs[QK_K/4]; // quants
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
} block_q2_k;
|
||||
} block_q2_K;
|
||||
// 84 bytes / block
|
||||
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||
half d; // super-block scale
|
||||
} block_q3_k;
|
||||
// 110 bytes / block
|
||||
#if QK_K == 64
|
||||
uint8_t scales[2];
|
||||
#else
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
||||
#endif
|
||||
half d; // super-block scale
|
||||
} block_q3_K;
|
||||
|
||||
#if QK_K == 64
|
||||
typedef struct {
|
||||
half d[2]; // super-block scales/mins
|
||||
uint8_t scales[2];
|
||||
uint8_t qs[QK_K/2]; // 4-bit quants
|
||||
} block_q4_K;
|
||||
#else
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_k;
|
||||
// 144 bytes / block
|
||||
} block_q4_K;
|
||||
#endif
|
||||
|
||||
#if QK_K == 64
|
||||
typedef struct {
|
||||
half d; // super-block scales/mins
|
||||
int8_t scales[QK_K/16]; // 8-bit block scales
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
#else
|
||||
typedef struct {
|
||||
half d; // super-block scale for quantized scales
|
||||
half dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_k;
|
||||
} block_q5_K;
|
||||
// 176 bytes / block
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
half d; // super-block scale
|
||||
} block_q6_k;
|
||||
} block_q6_K;
|
||||
// 210 bytes / block
|
||||
|
||||
static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
||||
@@ -836,7 +865,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
||||
|
||||
//========================================== dequantization =============================
|
||||
|
||||
static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, int k) {
|
||||
static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -847,6 +876,7 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
|
||||
|
||||
device const uint8_t * q = x[i].qs;
|
||||
|
||||
#if QK_K == 256
|
||||
int is = 0;
|
||||
float dl, ml;
|
||||
for (int n = 0; n < QK_K; n += 128) {
|
||||
@@ -865,14 +895,29 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
|
||||
}
|
||||
q += 32;
|
||||
}
|
||||
#else
|
||||
float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
|
||||
float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
|
||||
float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
|
||||
float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
|
||||
for (int l = 0; l < 16; ++l) {
|
||||
y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1;
|
||||
y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2;
|
||||
y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3;
|
||||
y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4;
|
||||
}
|
||||
y += QK_K;
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) {
|
||||
static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const uint16_t kmask1 = 0x0303;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
@@ -918,22 +963,49 @@ static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, i
|
||||
}
|
||||
q += 32;
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d_all = (float)(x[i].d);
|
||||
|
||||
device const uint8_t * q = x[i].qs;
|
||||
device const uint8_t * hm = x[i].hmask;
|
||||
|
||||
const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
|
||||
const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
|
||||
const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
|
||||
const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
|
||||
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
uint8_t h = hm[l];
|
||||
y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
|
||||
y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
|
||||
y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
|
||||
y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
|
||||
y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
|
||||
y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
|
||||
y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
|
||||
y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
|
||||
}
|
||||
y += QK_K;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) {
|
||||
static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
device const uint8_t * q = x[i].qs;
|
||||
|
||||
#if QK_K == 256
|
||||
const float d = x[i].d;
|
||||
const float min = x[i].dmin;
|
||||
|
||||
device const uint8_t * q = x[i].qs;
|
||||
device const uint8_t * scales = x[i].scales;
|
||||
|
||||
int is = 0;
|
||||
@@ -945,14 +1017,29 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i
|
||||
for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
|
||||
q += 32; is += 2;
|
||||
}
|
||||
#else
|
||||
device const uint8_t * s = x[i].scales;
|
||||
device const half2 * dh = (device const half2 *)x[i].d;
|
||||
const float2 d = (float2)dh[0];
|
||||
const float d1 = d[0] * (s[0] & 0xF);
|
||||
const float d2 = d[0] * (s[1] & 0xF);
|
||||
const float m1 = d[1] * (s[0] >> 4);
|
||||
const float m2 = d[1] * (s[1] >> 4);
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
y[l+ 0] = d1 * (q[l] & 0xF) - m1;
|
||||
y[l+32] = d2 * (q[l] >> 4) - m2;
|
||||
}
|
||||
y += QK_K;
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) {
|
||||
static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
#if QK_K == 256
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = (float)(x[i].d);
|
||||
@@ -973,10 +1060,32 @@ static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, i
|
||||
u1 <<= 2; u2 <<= 2;
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = (float)x[i].d;
|
||||
|
||||
device const uint8_t * ql = x[i].qs;
|
||||
device const uint8_t * qh = x[i].qh;
|
||||
device const int8_t * sc = x[i].scales;
|
||||
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
|
||||
y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
|
||||
y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
|
||||
y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
|
||||
y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16));
|
||||
y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16));
|
||||
y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16));
|
||||
y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16));
|
||||
}
|
||||
y += QK_K;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) {
|
||||
static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -988,6 +1097,7 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
#if QK_K == 256
|
||||
for (int n = 0; n < QK_K; n += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
int is = l/16;
|
||||
@@ -1005,10 +1115,23 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
|
||||
qh += 32;
|
||||
sc += 8;
|
||||
}
|
||||
#else
|
||||
for (int l = 0; l < 16; ++l) {
|
||||
const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
y[l+ 0] = d * sc[0] * q1;
|
||||
y[l+16] = d * sc[1] * q2;
|
||||
y[l+32] = d * sc[2] * q3;
|
||||
y[l+48] = d * sc[3] * q4;
|
||||
}
|
||||
y += 64;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_q2_k(
|
||||
kernel void kernel_get_rows_q2_K(
|
||||
device const void * src0,
|
||||
device const int * src1,
|
||||
device float * dst,
|
||||
@@ -1019,12 +1142,12 @@ kernel void kernel_get_rows_q2_k(
|
||||
const int i = tpig;
|
||||
const int r = ((device int32_t *) src1)[i];
|
||||
|
||||
dequantize_row_q2_k(
|
||||
(device const block_q2_k *) ((device char *) src0 + r*nb01),
|
||||
dequantize_row_q2_K(
|
||||
(device const block_q2_K *) ((device char *) src0 + r*nb01),
|
||||
(device float *) ((device char *) dst + i*nb1), ne00);
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_q3_k(
|
||||
kernel void kernel_get_rows_q3_K(
|
||||
device const void * src0,
|
||||
device const int * src1,
|
||||
device float * dst,
|
||||
@@ -1035,12 +1158,12 @@ kernel void kernel_get_rows_q3_k(
|
||||
const int i = tpig;
|
||||
const int r = ((device int32_t *) src1)[i];
|
||||
|
||||
dequantize_row_q3_k(
|
||||
(device const block_q3_k *) ((device char *) src0 + r*nb01),
|
||||
dequantize_row_q3_K(
|
||||
(device const block_q3_K *) ((device char *) src0 + r*nb01),
|
||||
(device float *) ((device char *) dst + i*nb1), ne00);
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_q4_k(
|
||||
kernel void kernel_get_rows_q4_K(
|
||||
device const void * src0,
|
||||
device const int * src1,
|
||||
device float * dst,
|
||||
@@ -1051,12 +1174,12 @@ kernel void kernel_get_rows_q4_k(
|
||||
const int i = tpig;
|
||||
const int r = ((device int32_t *) src1)[i];
|
||||
|
||||
dequantize_row_q4_k(
|
||||
(device const block_q4_k *) ((device char *) src0 + r*nb01),
|
||||
dequantize_row_q4_K(
|
||||
(device const block_q4_K *) ((device char *) src0 + r*nb01),
|
||||
(device float *) ((device char *) dst + i*nb1), ne00);
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_q5_k(
|
||||
kernel void kernel_get_rows_q5_K(
|
||||
device const void * src0,
|
||||
device const int * src1,
|
||||
device float * dst,
|
||||
@@ -1067,12 +1190,12 @@ kernel void kernel_get_rows_q5_k(
|
||||
const int i = tpig;
|
||||
const int r = ((device int32_t *) src1)[i];
|
||||
|
||||
dequantize_row_q5_k(
|
||||
(device const block_q5_k *) ((device char *) src0 + r*nb01),
|
||||
dequantize_row_q5_K(
|
||||
(device const block_q5_K *) ((device char *) src0 + r*nb01),
|
||||
(device float *) ((device char *) dst + i*nb1), ne00);
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_q6_k(
|
||||
kernel void kernel_get_rows_q6_K(
|
||||
device const void * src0,
|
||||
device const int * src1,
|
||||
device float * dst,
|
||||
@@ -1083,14 +1206,14 @@ kernel void kernel_get_rows_q6_k(
|
||||
const int i = tpig;
|
||||
const int r = ((device int32_t *) src1)[i];
|
||||
|
||||
dequantize_row_q6_k(
|
||||
(device const block_q6_k *) ((device char *) src0 + r*nb01),
|
||||
dequantize_row_q6_K(
|
||||
(device const block_q6_K *) ((device char *) src0 + r*nb01),
|
||||
(device float *) ((device char *) dst + i*nb1), ne00);
|
||||
}
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
kernel void kernel_mul_mat_q2_k_f32(
|
||||
kernel void kernel_mul_mat_q2_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1107,12 +1230,15 @@ kernel void kernel_mul_mat_q2_k_f32(
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q2_k * x = (device const block_q2_k *) src0 + r0*nb;
|
||||
device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10;
|
||||
|
||||
const int nth = tptg.x*tptg.y;
|
||||
const int ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if QK_K == 256
|
||||
const int tid = tpitg.y; // 0...16
|
||||
const int il = tid/4; // 0...3
|
||||
const int ir = tid%4; // 0...3
|
||||
@@ -1125,9 +1251,6 @@ kernel void kernel_mul_mat_q2_k_f32(
|
||||
const int y_offset = 64*il + n*ir;
|
||||
const int q_offset = 32*ip + n*ir;
|
||||
|
||||
sum[ith] = 0.0f;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tpitg.x; i < nb; i += tptg.x) {
|
||||
|
||||
device const uint8_t * q = x[i].qs + q_offset;
|
||||
@@ -1140,7 +1263,6 @@ kernel void kernel_mul_mat_q2_k_f32(
|
||||
|
||||
device const float * y = yy + i*QK_K + y_offset;
|
||||
|
||||
//float4 s = {0.f, 0.f, 0.f, 0.f};
|
||||
float2 s = {0.f, 0.f};
|
||||
float smin = 0;
|
||||
for (int l = 0; l < n; ++l) {
|
||||
@@ -1155,25 +1277,38 @@ kernel void kernel_mul_mat_q2_k_f32(
|
||||
sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin;
|
||||
|
||||
}
|
||||
#else
|
||||
const int il = 4 * tpitg.x;
|
||||
|
||||
uint32_t aux[2];
|
||||
thread const uint8_t * d = (thread const uint8_t *)aux;
|
||||
thread const uint8_t * m = (thread const uint8_t *)aux + 4;
|
||||
|
||||
for (int i = tpitg.y; i < nb; i += tptg.y) {
|
||||
|
||||
device const uint8_t * q = x[i].qs + il;
|
||||
device const float * y = yy + i*QK_K + il;
|
||||
|
||||
const float dall = (float)x[i].d;
|
||||
const float dmin = (float)x[i].dmin;
|
||||
|
||||
device const uint32_t * a = (device const uint32_t *)x[i].scales;
|
||||
aux[0] = a[0] & 0x0f0f0f0f;
|
||||
aux[1] = (a[0] >> 4) & 0x0f0f0f0f;
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0])
|
||||
+ y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1])
|
||||
+ y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2])
|
||||
+ y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
sum[ith] = sumf;
|
||||
|
||||
//int mask1 = (ith%4 == 0);
|
||||
//int mask2 = (ith%16 == 0);
|
||||
|
||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i];
|
||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i];
|
||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//if (ith == 0) {
|
||||
// for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
// dst[r1*ne0 + r0] = sum[0];
|
||||
//}
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
// This version is slightly faster than the commented out one below,
|
||||
// which I copy-pasted from ggerganov's q4_0 dot product for metal.
|
||||
//
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%4 == 0) {
|
||||
@@ -1190,7 +1325,7 @@ kernel void kernel_mul_mat_q2_k_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q3_k_f32(
|
||||
kernel void kernel_mul_mat_q3_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1203,23 +1338,25 @@ kernel void kernel_mul_mat_q3_k_f32(
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
|
||||
const uint16_t kmask1 = 0x0303;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
const uint8_t m3 = 3;
|
||||
const int8_t m4 = 4;
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb;
|
||||
device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10;
|
||||
|
||||
const int nth = tptg.x*tptg.y;
|
||||
const int ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const uint8_t m3 = 3;
|
||||
const int8_t m4 = 4;
|
||||
|
||||
const uint16_t kmask1 = 0x0303;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
const int tid = tpitg.y; // expecting 16
|
||||
const int ip = tid/8; // 0 or 1
|
||||
const int il = tid/2 - 4*ip; // 0...3
|
||||
@@ -1273,6 +1410,39 @@ kernel void kernel_mul_mat_q3_k_f32(
|
||||
|
||||
//sum[ith] = sumf;
|
||||
sum[ith] = sumf1 - 32.f*sumf2;
|
||||
#else
|
||||
const int il = 4 * tpitg.x; // 0, 4, 8, 12
|
||||
const int im = il/8; // 0, 0, 1, 1
|
||||
const int in = il%8; // 0, 4, 0, 4
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = tpitg.y; i < nb; i += tptg.y) {
|
||||
|
||||
const float d_all = (float)(x[i].d);
|
||||
|
||||
device const uint8_t * q = x[i].qs + il;
|
||||
device const uint8_t * h = x[i].hmask + in;
|
||||
device const float * y = yy + i * QK_K + il;
|
||||
|
||||
const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
|
||||
const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
|
||||
const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
|
||||
const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t hm = h[l] >> im;
|
||||
sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4))
|
||||
+ y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4))
|
||||
+ y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4))
|
||||
+ y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
sum[ith] = sumf;
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
@@ -1293,7 +1463,7 @@ kernel void kernel_mul_mat_q3_k_f32(
|
||||
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q4_k_f32(
|
||||
kernel void kernel_mul_mat_q4_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1305,21 +1475,25 @@ kernel void kernel_mul_mat_q4_k_f32(
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10;
|
||||
|
||||
const int nth = tptg.x*tptg.y;
|
||||
const int ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int tid = tpitg.y; // 0...16
|
||||
const int il = tid/4; // 0...3
|
||||
const int ir = tid - 4*il;// 0...3
|
||||
@@ -1332,11 +1506,8 @@ kernel void kernel_mul_mat_q4_k_f32(
|
||||
const int q_offset = 32*im + l0;
|
||||
const int y_offset = 64*im + l0;
|
||||
|
||||
sum[ith] = 0.0f;
|
||||
|
||||
uchar2 sc1, sc2, sc3, sc4;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tpitg.x; i < nb; i += tptg.x) {
|
||||
|
||||
device const uint8_t * q1 = (x + i)->qs + q_offset;
|
||||
@@ -1365,6 +1536,30 @@ kernel void kernel_mul_mat_q4_k_f32(
|
||||
sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
|
||||
|
||||
}
|
||||
#else
|
||||
uint16_t aux16[2];
|
||||
thread const uint8_t * scales = (thread const uint8_t *)aux16;
|
||||
|
||||
const int il = 4*tpitg.x;
|
||||
|
||||
for (int i = tpitg.y; i < nb; i += tptg.y) {
|
||||
|
||||
device const uint8_t * q = x[i].qs + il;
|
||||
device const float * y = yy + i * QK_K + il;
|
||||
|
||||
const float d = (float)x[i].d[0];
|
||||
const float m = (float)x[i].d[1];
|
||||
|
||||
device const uint16_t * a = (device const uint16_t *)x[i].scales;
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
|
||||
+ d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
sum[ith] = sumf;
|
||||
|
||||
@@ -1401,7 +1596,7 @@ kernel void kernel_mul_mat_q4_k_f32(
|
||||
//}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q5_k_f32(
|
||||
kernel void kernel_mul_mat_q5_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1413,21 +1608,25 @@ kernel void kernel_mul_mat_q5_k_f32(
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb;
|
||||
device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10;
|
||||
|
||||
const int nth = tptg.x*tptg.y;
|
||||
const int ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int tid = tpitg.y; // 0...16
|
||||
const int il = tid/4; // 0...3
|
||||
const int ir = tid - 4*il;// 0...3
|
||||
@@ -1447,7 +1646,6 @@ kernel void kernel_mul_mat_q5_k_f32(
|
||||
|
||||
uchar2 sc1, sc2, sc3, sc4;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tpitg.x; i < nb; i += tptg.x) {
|
||||
|
||||
device const uint8_t * q1 = (x + i)->qs + q_offset;
|
||||
@@ -1479,6 +1677,28 @@ kernel void kernel_mul_mat_q5_k_f32(
|
||||
sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
|
||||
|
||||
}
|
||||
#else
|
||||
const int il = 4 * tpitg.x; // 0, 4, 8, 12
|
||||
const int im = il/8; // 0, 0, 1, 1
|
||||
const int in = il%8; // 0, 4, 0, 4
|
||||
|
||||
for (int i = tpitg.y; i < nb; i += tptg.y) {
|
||||
|
||||
const float d = (float)x[i].d;
|
||||
device const uint8_t * q = x[i].qs + il;
|
||||
device const uint8_t * h = x[i].qh + in;
|
||||
device const int8_t * s = x[i].scales;
|
||||
device const float * y = yy + i*QK_K + il;
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t hl = h[l] >> im;
|
||||
sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
|
||||
+ y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
|
||||
+ y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
|
||||
+ y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
sum[ith] = sumf;
|
||||
|
||||
//
|
||||
@@ -1500,7 +1720,7 @@ kernel void kernel_mul_mat_q5_k_f32(
|
||||
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q6_k_f32(
|
||||
kernel void kernel_mul_mat_q6_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1522,12 +1742,15 @@ kernel void kernel_mul_mat_q6_k_f32(
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb;
|
||||
device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10;
|
||||
|
||||
const int nth = tptg.x*tptg.y;
|
||||
const int ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if QK_K == 256
|
||||
// Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
|
||||
const int iqs = 16 * tpitg.y;
|
||||
const int ip = iqs / 128; // 0 or 1
|
||||
@@ -1540,7 +1763,6 @@ kernel void kernel_mul_mat_q6_k_f32(
|
||||
const int q_offset_l = 64*ip + l0;
|
||||
const int q_offset_h = 32*ip + l0;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tpitg.x; i < nb; i += tptg.x) {
|
||||
|
||||
device const uint8_t * ql = x[i].ql + q_offset_l;
|
||||
@@ -1562,6 +1784,28 @@ kernel void kernel_mul_mat_q6_k_f32(
|
||||
sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
|
||||
|
||||
}
|
||||
#else
|
||||
const int il = 4*tpitg.x; // 0, 4, 8, 12
|
||||
|
||||
for (int i = tpitg.y; i < nb; i += tptg.y) {
|
||||
device const float * y = yy + i * QK_K + il;
|
||||
device const uint8_t * ql = x[i].ql + il;
|
||||
device const uint8_t * qh = x[i].qh + il;
|
||||
device const int8_t * s = x[i].scales;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
float4 sums = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
|
||||
sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
|
||||
sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
|
||||
sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
|
||||
}
|
||||
sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
sum[ith] = sumf;
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
|
||||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
|
||||
|
||||
#include "ggml.h"
|
||||
@@ -13386,7 +13387,8 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
||||
const int nk1 = ne01;
|
||||
|
||||
// size of the convolution row - the kernel size unrolled across all channels
|
||||
const int ew0 = nk0*nk1*ne02;
|
||||
// round-up so it is more suitable for SIMD
|
||||
const int ew0 = ggml_up32(nk0*nk1*ne02);
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
+1671
-16
File diff suppressed because it is too large
Load Diff
+43
-8
@@ -7,7 +7,13 @@
|
||||
#include <stddef.h>
|
||||
|
||||
// Super-block size
|
||||
#ifdef GGML_QKK_64
|
||||
#define QK_K 64
|
||||
#define K_SCALE_SIZE 4
|
||||
#else
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elemenets each
|
||||
// Effectively 3.4375 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||
uint8_t scales[2];
|
||||
ggml_fp16_t d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
uint8_t scales[12]; // scales, quantized with 6 bits
|
||||
ggml_fp16_t d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 4-bit quantization
|
||||
// 16 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
ggml_fp16_t d[2]; // super-block scales/mins
|
||||
uint8_t scales[2]; // 4-bit block scales/mins
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // super-block scale for quantized scales
|
||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 5-bit quantization
|
||||
// 16 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 5.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // super-block scale for quantized scales
|
||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||
ggml_fp16_t d; // super-block scale
|
||||
int8_t scales[QK_K/16]; // 8-bit block scales
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // super-block scale for quantized scales
|
||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 6-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
|
||||
@@ -53,7 +53,6 @@ __attribute__((format(gnu_printf, 1, 2)))
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap, ap2;
|
||||
va_start(ap, fmt);
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
// Defines fileno on msys:
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#endif
|
||||
|
||||
#include "llama-util.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -13,9 +21,13 @@
|
||||
#endif
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
#ifndef QK_K
|
||||
#ifdef GGML_QKK_64
|
||||
#define QK_K 64
|
||||
#else
|
||||
#define QK_K 256
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <array>
|
||||
#include <ctime>
|
||||
@@ -2462,6 +2474,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
std::vector<std::thread> workers;
|
||||
std::mutex mutex;
|
||||
|
||||
auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
|
||||
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
||||
};
|
||||
|
||||
size_t idx = 0;
|
||||
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
|
||||
llama_buffer read_data;
|
||||
@@ -2516,15 +2532,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
|
||||
(i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
|
||||
use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
||||
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||
++i_attention_wv;
|
||||
} else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
(i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
|
||||
(i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
|
||||
use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
//else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
|
||||
++i_feed_forward_w2;
|
||||
} else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#define QK 32
|
||||
#define WARMUP 5
|
||||
#define ITERATIONS 10
|
||||
#define MAX_ITERATIONS 100000000
|
||||
|
||||
#define L1_SIZE 32*128
|
||||
#define L2_SIZE 32*2048
|
||||
@@ -36,9 +37,9 @@ struct quantize_perf_params {
|
||||
bool op_dequantize_row_q = false;
|
||||
bool op_quantize_row_q_dot = false;
|
||||
bool op_vec_dot_q = false;
|
||||
int64_t iterations = ITERATIONS;
|
||||
};
|
||||
|
||||
|
||||
#if defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#include <x86intrin.h>
|
||||
@@ -75,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) {
|
||||
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
||||
}
|
||||
|
||||
void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)> function) {
|
||||
void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) {
|
||||
int64_t min_time_us = INT64_MAX;
|
||||
int64_t total_time_us = 0;
|
||||
int64_t min_time_cycles = INT64_MAX;
|
||||
@@ -86,7 +87,7 @@ void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)>
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < ITERATIONS; i++) {
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
const int64_t start_time = ggml_time_us();
|
||||
const int64_t start_cycles = cpu_cycles();
|
||||
|
||||
@@ -102,9 +103,38 @@ void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)>
|
||||
}
|
||||
|
||||
printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
|
||||
printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * ITERATIONS));
|
||||
printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * ITERATIONS, total_time_us));
|
||||
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * ITERATIONS, total_time_us));
|
||||
printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations));
|
||||
printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us));
|
||||
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
|
||||
}
|
||||
|
||||
void usage(char * argv[]) {
|
||||
printf("Benchmark quantization specific functions on synthetic data\n");
|
||||
printf("\n");
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
printf("\n");
|
||||
printf("options: (default)\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
|
||||
printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
|
||||
printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
|
||||
printf(" --op OP set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
|
||||
printf(" quantize_row_q_dot, vec_dot_q (all)\n");
|
||||
printf(" --type TYPE set test type as");
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
|
||||
if (ggml_type_name(type) != NULL) {
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
printf(" %s", ggml_type_name(type));
|
||||
}
|
||||
}
|
||||
}
|
||||
printf(" (all)\n");
|
||||
printf(" --alignment-offset OFFSET\n");
|
||||
printf(" set alignment offset as OFFSET (0)\n");
|
||||
printf(" -i NUM, --iterations NUM\n");
|
||||
printf(" set test iteration number (%d)\n", ITERATIONS);
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
@@ -178,6 +208,21 @@ int main(int argc, char * argv[]) {
|
||||
break;
|
||||
}
|
||||
params.alignment_offset = alignment;
|
||||
} else if ((arg == "-i") || (arg == "--iterations")) {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
int number = std::stoi(argv[i]);
|
||||
if (number < 0 || number > MAX_ITERATIONS) {
|
||||
fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.iterations = number;
|
||||
} else if ((arg == "-h") || (arg == "--help")) {
|
||||
usage(argv);
|
||||
return 1;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
return 1;
|
||||
@@ -213,6 +258,8 @@ int main(int argc, char * argv[]) {
|
||||
generate_data(0, largest, test_data1);
|
||||
generate_data(1, largest, test_data2);
|
||||
|
||||
int64_t iterations = params.iterations;
|
||||
|
||||
|
||||
// Initialize GGML, ensures float conversion tables are initialized
|
||||
struct ggml_init_params ggml_params = {
|
||||
@@ -225,7 +272,7 @@ int main(int argc, char * argv[]) {
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
||||
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -241,7 +288,7 @@ int main(int argc, char * argv[]) {
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
@@ -255,7 +302,7 @@ int main(int argc, char * argv[]) {
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
@@ -270,7 +317,7 @@ int main(int argc, char * argv[]) {
|
||||
return test_out[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
@@ -284,7 +331,7 @@ int main(int argc, char * argv[]) {
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
@@ -301,7 +348,7 @@ int main(int argc, char * argv[]) {
|
||||
return result;
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
benchmark_function(size, quantized_size, iterations, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user