feat: TurboQuant KV-cache quantization for AMD ROCm (turbo3/turbo4)

Implements TurboQuant (Zandieh et al., ICLR 2026) KV-cache vector
quantization targeting AMD RDNA 4 (gfx1201, RX 9070 XT).

Algorithm: L2-normalize → FWHT(128) → Lloyd-Max scalar quantize → bitpack
Decode: unpack → codebook lookup → inverse FWHT → denormalize

Two new GGML types:
- GGML_TYPE_TURBO3_0: 3-bit, 3.5 bpw, MSE*d=0.034 (block_size=32, 14 bytes)
- GGML_TYPE_TURBO4_0: 4-bit, 4.5 bpw, MSE*d=0.009 (block_size=32, 18 bytes)

Architecture (pre-dequantize strategy):
- Write path: FWHT-aware set-rows kernels (128 threads, shared-mem FWHT)
- Read path: bulk dequantize turbo→f16 before standard Flash Attention
- Stride scaling preserves ggml_permute dim swaps (critical fix)

Performance (Qwen3-14B Q4_K_M, RX 9070 XT, 16 GB VRAM):
  f16/f16:       1865 pp512,  54 tg128 (baseline)
  q8_0/q8_0:     1694 pp512,  52 tg128
  turbo4/turbo4:  1813 pp512,  49 tg128 (-3% pp, -9% tg, 72% less KV VRAM)
  turbo3/turbo3:  1983 pp512,  49 tg128 (+6% pp, -9% tg, 78% less KV VRAM)

Usage: llama-cli -fa 1 --cache-type-k turbo4 --cache-type-v turbo4

Includes 7 CPU reference tests validating FWHT self-inverse, MSE against
paper values, bitpack determinism, and dequantize sanity.

Requires head_dim=128 (covers most current models including Llama, Qwen,
Mistral, Gemma). Guard added to KV cache init with clear error message.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pascal Wachowski
2026-03-29 19:24:47 +02:00
parent 7c203670f8
commit bd571adc99
22 changed files with 1549 additions and 10 deletions
+2
View File
@@ -387,6 +387,8 @@ const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
GGML_TYPE_TURBO3_0,
GGML_TYPE_TURBO4_0,
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
+3 -1
View File
@@ -428,7 +428,9 @@ extern "C" {
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
GGML_TYPE_COUNT = 41,
GGML_TYPE_TURBO3_0 = 41, // TurboQuant 3-bit KV-cache (3.5 bpw)
GGML_TYPE_TURBO4_0 = 42, // TurboQuant 4-bit KV-cache (4.5 bpw)
GGML_TYPE_COUNT = 43,
};
// precision
+20
View File
@@ -205,6 +205,26 @@ typedef struct {
} block_nvfp4;
static_assert(sizeof(block_nvfp4) == sizeof(uint8_t)*(QK_NVFP4/QK_NVFP4_SUB) + QK_NVFP4/2, "wrong nvfp4 block size/padding");
// TurboQuant 3-bit KV-cache quantization (3.5 bpw)
#define TURBO3_BLOCK_SIZE 32
#define QK_TURBO3 32
#define QR_TURBO3 2
typedef struct {
ggml_half d; // FP16 L2-norm
uint8_t qs[12]; // 32 x 3-bit packed indices
} block_turbo3_0;
static_assert(sizeof(block_turbo3_0) == 14, "wrong turbo3 block size");
// TurboQuant 4-bit KV-cache quantization (4.5 bpw)
#define TURBO4_BLOCK_SIZE 32
#define QK_TURBO4 32
#define QR_TURBO4 2
typedef struct {
ggml_half d; // FP16 L2-norm
uint8_t qs[16]; // 32 x 4-bit packed indices
} block_turbo4_0;
static_assert(sizeof(block_turbo4_0) == 18, "wrong turbo4 block size");
#define QK5_0 32
typedef struct {
ggml_half d; // delta
+3 -1
View File
@@ -120,7 +120,9 @@ if (CUDAToolkit_FOUND)
template-instances/fattn-vec-instance-f16-f16.cu
template-instances/fattn-vec-instance-q4_0-q4_0.cu
template-instances/fattn-vec-instance-q8_0-q8_0.cu
template-instances/fattn-vec-instance-bf16-bf16.cu)
template-instances/fattn-vec-instance-bf16-bf16.cu
template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu)
endif()
ggml_add_backend_library(ggml-cuda
+154
View File
@@ -656,6 +656,140 @@ static void dequantize_row_nvfp4_cuda(
const int nb = k / QK_NVFP4;
dequantize_block_nvfp4<<<nb, 32, 0, stream>>>(vx, y, k);
}
// ============================================================
// TurboQuant GPU bulk dequantize kernels (with FWHT)
// ============================================================
// Each CUDA block processes one 128-element chunk (= 4 turbo blocks).
// 128 threads per block, one thread per element.
// Step 1: unpack index + centroid lookup -> shared memory
// Step 2: FWHT butterfly in shared memory (7 stages for n=128)
// Step 3: normalize by 1/sqrt(128) and scale by stored norm
// Step 4: write to output
#define TURBO_HEAD_DIM_GPU 128
#define TURBO_BLOCKS_PER_CHUNK_GPU (TURBO_HEAD_DIM_GPU / 32) // 4
template <typename dst_t>
static __global__ void dequantize_block_turbo3_0_kernel(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
__shared__ float smem[TURBO_HEAD_DIM_GPU];
const int64_t chunk_idx = blockIdx.x;
const int tid = threadIdx.x; // 0..127
const int64_t output_offset = chunk_idx * TURBO_HEAD_DIM_GPU;
if (output_offset + tid >= k) return;
// Which of the 4 blocks within this chunk does this thread belong to?
const int local_block = tid / TURBO3_BLOCK_SIZE; // 0..3
const int elem_in_block = tid % TURBO3_BLOCK_SIZE; // 0..31
const int64_t global_block_idx = chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU + local_block;
// Unpack 3-bit index and look up centroid
const block_turbo3_0 * x = (const block_turbo3_0 *)vx + global_block_idx;
const uint8_t * qs = x->qs;
int bit_off = elem_in_block * 3;
int byte_idx = bit_off / 8;
int shift = bit_off % 8;
uint16_t raw = (uint16_t)qs[byte_idx] >> shift;
if (shift > 5 && byte_idx + 1 < 12)
raw |= (uint16_t)qs[byte_idx + 1] << (8 - shift);
uint8_t idx = (uint8_t)(raw & 0x07);
smem[tid] = dc_codebook_3bit[idx];
__syncthreads();
// FWHT butterfly stages (7 stages for n=128)
for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) {
if (tid < 64) { // 128/2 = 64 butterflies per stage
int group = tid / h;
int pos = tid % h;
int i = group * h * 2 + pos;
float a = smem[i];
float b = smem[i + h];
smem[i] = a + b;
smem[i + h] = a - b;
}
__syncthreads();
}
// Normalize by 1/sqrt(128) and scale by stored norm
const float fwht_scale = 0.08838834764831844f; // 1/sqrt(128)
const block_turbo3_0 * first_block = (const block_turbo3_0 *)vx + chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU;
float norm = __half2float(first_block->d);
smem[tid] *= fwht_scale * norm;
__syncthreads();
// Write to output
y[output_offset + tid] = ggml_cuda_cast<dst_t>(smem[tid]);
}
template <typename dst_t>
static void dequantize_row_turbo3_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
GGML_ASSERT(k % TURBO_HEAD_DIM_GPU == 0);
const int num_chunks = (int)(k / TURBO_HEAD_DIM_GPU);
dequantize_block_turbo3_0_kernel<<<num_chunks, TURBO_HEAD_DIM_GPU, 0, stream>>>(vx, y, k);
}
template <typename dst_t>
static __global__ void dequantize_block_turbo4_0_kernel(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
__shared__ float smem[TURBO_HEAD_DIM_GPU];
const int64_t chunk_idx = blockIdx.x;
const int tid = threadIdx.x; // 0..127
const int64_t output_offset = chunk_idx * TURBO_HEAD_DIM_GPU;
if (output_offset + tid >= k) return;
// Which of the 4 blocks within this chunk does this thread belong to?
const int local_block = tid / TURBO4_BLOCK_SIZE; // 0..3
const int elem_in_block = tid % TURBO4_BLOCK_SIZE; // 0..31
const int64_t global_block_idx = chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU + local_block;
// Unpack 4-bit index and look up centroid
const block_turbo4_0 * x = (const block_turbo4_0 *)vx + global_block_idx;
int pair_idx = elem_in_block / 2;
uint8_t packed = x->qs[pair_idx];
uint8_t idx = (elem_in_block & 1) ? ((packed >> 4) & 0x0F) : (packed & 0x0F);
smem[tid] = dc_codebook_4bit[idx];
__syncthreads();
// FWHT butterfly stages (7 stages for n=128)
for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) {
if (tid < 64) { // 128/2 = 64 butterflies per stage
int group = tid / h;
int pos = tid % h;
int i = group * h * 2 + pos;
float a = smem[i];
float b = smem[i + h];
smem[i] = a + b;
smem[i + h] = a - b;
}
__syncthreads();
}
// Normalize by 1/sqrt(128) and scale by stored norm
const float fwht_scale = 0.08838834764831844f; // 1/sqrt(128)
const block_turbo4_0 * first_block = (const block_turbo4_0 *)vx + chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU;
float norm = __half2float(first_block->d);
smem[tid] *= fwht_scale * norm;
__syncthreads();
// Write to output
y[output_offset + tid] = ggml_cuda_cast<dst_t>(smem[tid]);
}
template <typename dst_t>
static void dequantize_row_turbo4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
GGML_ASSERT(k % TURBO_HEAD_DIM_GPU == 0);
const int num_chunks = (int)(k / TURBO_HEAD_DIM_GPU);
dequantize_block_turbo4_0_kernel<<<num_chunks, TURBO_HEAD_DIM_GPU, 0, stream>>>(vx, y, k);
}
template <typename src_t, typename dst_t>
static __global__ void convert_unary(
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
@@ -756,6 +890,10 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return dequantize_row_mxfp4_cuda;
case GGML_TYPE_NVFP4:
return dequantize_row_nvfp4_cuda;
case GGML_TYPE_TURBO3_0:
return dequantize_row_turbo3_0_cuda;
case GGML_TYPE_TURBO4_0:
return dequantize_row_turbo4_0_cuda;
case GGML_TYPE_F32:
return convert_unary_cont_cuda<float>;
case GGML_TYPE_BF16:
@@ -809,6 +947,10 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_mxfp4_cuda;
case GGML_TYPE_NVFP4:
return dequantize_row_nvfp4_cuda;
case GGML_TYPE_TURBO3_0:
return dequantize_row_turbo3_0_cuda;
case GGML_TYPE_TURBO4_0:
return dequantize_row_turbo4_0_cuda;
case GGML_TYPE_F16:
return convert_unary_cont_cuda<half>;
case GGML_TYPE_BF16:
@@ -832,6 +974,10 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
case GGML_TYPE_TURBO3_0:
return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
case GGML_TYPE_TURBO4_0:
return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
case GGML_TYPE_BF16:
return convert_unary_cuda<nv_bfloat16>;
default:
@@ -853,6 +999,10 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
case GGML_TYPE_TURBO3_0:
return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
case GGML_TYPE_TURBO4_0:
return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
case GGML_TYPE_F16:
return convert_unary_cuda<half, nv_bfloat16>;
default:
@@ -874,6 +1024,10 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
case GGML_TYPE_TURBO3_0:
return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
case GGML_TYPE_TURBO4_0:
return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
case GGML_TYPE_BF16:
return convert_unary_cuda<nv_bfloat16, float>;
default:
+87
View File
@@ -211,6 +211,93 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
}
// ============================================================
// TurboQuant GPU quantize device functions
// ============================================================
// Device-side codebook references (same values as in dequantize.cuh)
// These are declared extern to reference the __constant__ arrays from dequantize.cuh
// Note: we re-declare small local codebooks here to avoid linkage issues.
__device__ static const float tq_codebook_3bit_q[8] = {
-0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f,
0.0216041461f, 0.0665854520f, 0.1181396281f, 0.1883970748f
};
__device__ static const float tq_codebook_4bit_q[16] = {
-0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f,
-0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f,
0.0112761586f, 0.0341139667f, 0.0577250301f, 0.0827738972f,
0.1102295202f, 0.1417455465f, 0.1807794468f, 0.2376153882f
};
static __device__ uint8_t tq_nearest_codebook(float val, const float *codebook, int n) {
float best_dist = fabsf(val - codebook[0]);
uint8_t best_idx = 0;
for (int i = 1; i < n; i++) {
float dist = fabsf(val - codebook[i]);
if (dist < best_dist) {
best_dist = dist;
best_idx = (uint8_t)i;
}
}
return best_idx;
}
static __device__ void quantize_f32_turbo3_0_block(const float * __restrict__ x, block_turbo3_0 * __restrict__ y) {
// Compute block norm
float sum_sq = 0.0f;
for (int j = 0; j < TURBO3_BLOCK_SIZE; j++) {
sum_sq += x[j] * x[j];
}
float norm = sqrtf(sum_sq);
y->d = __float2half(norm);
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
// Quantize NORMALIZED elements to nearest 3-bit codebook entry and pack
uint8_t indices[32];
for (int j = 0; j < TURBO3_BLOCK_SIZE; j++) {
indices[j] = tq_nearest_codebook(x[j] * inv_norm, tq_codebook_3bit_q, 8);
}
// Pack 32 x 3-bit values into 12 bytes
memset(y->qs, 0, 12);
for (int j = 0; j < 32; j++) {
int bit_off = j * 3;
int byte_idx = bit_off / 8;
int shift = bit_off % 8;
y->qs[byte_idx] |= (uint8_t)((indices[j] & 0x07) << shift);
if (shift > 5 && byte_idx + 1 < 12) {
y->qs[byte_idx + 1] |= (uint8_t)((indices[j] & 0x07) >> (8 - shift));
}
}
}
static __device__ void quantize_f32_turbo4_0_block(const float * __restrict__ x, block_turbo4_0 * __restrict__ y) {
// Compute block norm
float sum_sq = 0.0f;
for (int j = 0; j < TURBO4_BLOCK_SIZE; j++) {
sum_sq += x[j] * x[j];
}
float norm = sqrtf(sum_sq);
y->d = __float2half(norm);
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
// Quantize NORMALIZED elements to nearest 4-bit codebook entry and pack
for (int j = 0; j < TURBO4_BLOCK_SIZE / 2; j++) {
uint8_t idx0 = tq_nearest_codebook(x[2*j] * inv_norm, tq_codebook_4bit_q, 16);
uint8_t idx1 = tq_nearest_codebook(x[2*j + 1] * inv_norm, tq_codebook_4bit_q, 16);
y->qs[j] = (idx0 & 0x0F) | ((idx1 & 0x0F) << 4);
}
}
static __device__ void cpy_blck_f32_turbo3_0(const char * cxi, char * cdsti) {
quantize_f32_turbo3_0_block((const float *)cxi, (block_turbo3_0 *)cdsti);
}
static __device__ void cpy_blck_f32_turbo4_0(const char * cxi, char * cdsti) {
quantize_f32_turbo4_0_block((const float *)cxi, (block_turbo4_0 *)cdsti);
}
template<typename src_t, typename dst_t>
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
+64
View File
@@ -75,3 +75,67 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
v.x *= d;
v.y *= d;
}
// ============================================================
// TurboQuant GPU dequantize device functions
// ============================================================
__device__ __constant__ static float dc_codebook_3bit[8] = {
-0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f,
0.0216041461f, 0.0665854520f, 0.1181396281f, 0.1883970748f
};
__device__ __constant__ static float dc_codebook_4bit[16] = {
-0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f,
-0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f,
0.0112761586f, 0.0341139667f, 0.0577250301f, 0.0827738972f,
0.1102295202f, 0.1417455465f, 0.1807794468f, 0.2376153882f
};
static __device__ __forceinline__ void dequantize_turbo3_0(
const void * vx, const int64_t ib, const int iqs, float2 & v)
{
const block_turbo3_0 * x = (const block_turbo3_0 *) vx + ib;
const uint8_t * qs = x->qs;
// Unpack two consecutive 3-bit indices
int elem0 = iqs * 2;
int elem1 = iqs * 2 + 1;
// Extract 3-bit value for elem0
int bit_off0 = elem0 * 3;
int byte0 = bit_off0 / 8;
int shift0 = bit_off0 % 8;
uint16_t raw0 = (uint16_t)qs[byte0] >> shift0;
if (shift0 > 5 && byte0 + 1 < 12)
raw0 |= (uint16_t)qs[byte0 + 1] << (8 - shift0);
uint8_t idx0 = (uint8_t)(raw0 & 0x07);
// Extract 3-bit value for elem1
int bit_off1 = elem1 * 3;
int byte1 = bit_off1 / 8;
int shift1 = bit_off1 % 8;
uint16_t raw1 = (uint16_t)qs[byte1] >> shift1;
if (shift1 > 5 && byte1 + 1 < 12)
raw1 |= (uint16_t)qs[byte1 + 1] << (8 - shift1);
uint8_t idx1 = (uint8_t)(raw1 & 0x07);
const float norm = __half2float(x->d);
v.x = dc_codebook_3bit[idx0] * norm;
v.y = dc_codebook_3bit[idx1] * norm;
}
static __device__ __forceinline__ void dequantize_turbo4_0(
const void * vx, const int64_t ib, const int iqs, float2 & v)
{
const block_turbo4_0 * x = (const block_turbo4_0 *) vx + ib;
// 4-bit: 2 values per byte, simple nibble extraction
uint8_t packed = x->qs[iqs];
uint8_t idx0 = packed & 0x0F;
uint8_t idx1 = (packed >> 4) & 0x0F;
const float norm = __half2float(x->d);
v.x = dc_codebook_4bit[idx0] * norm;
v.y = dc_codebook_4bit[idx1] * norm;
}
+64
View File
@@ -3,6 +3,7 @@
#include "common.cuh"
#include "convert.cuh"
#include "vecdotq.cuh"
#include "dequantize.cuh"
#include <cstdint>
@@ -577,6 +578,59 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict
}
}
// ============================================================
// TurboQuant V-cache dequantize functions for flash attention
// ============================================================
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_turbo3_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
const block_turbo3_0 * x = (const block_turbo3_0 *) vx;
const int64_t ib = i0 / QK_TURBO3;
const int iqs = (int)(i0 % QK_TURBO3) / 2;
static_assert(ne % 2 == 0, "bad ne");
T * dst_t = (T *) dst;
#pragma unroll
for (int l = 0; l < ne/2; ++l) {
float2 v;
dequantize_turbo3_0(vx, ib, iqs + l, v);
if constexpr (std::is_same_v<T, half>) {
dst_t[2*l + 0] = __float2half(v.x);
dst_t[2*l + 1] = __float2half(v.y);
} else {
dst_t[2*l + 0] = (T)v.x;
dst_t[2*l + 1] = (T)v.y;
}
}
}
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_turbo4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
const block_turbo4_0 * x = (const block_turbo4_0 *) vx;
const int64_t ib = i0 / QK_TURBO4;
const int iqs = (int)(i0 % QK_TURBO4) / 2;
static_assert(ne % 2 == 0, "bad ne");
T * dst_t = (T *) dst;
#pragma unroll
for (int l = 0; l < ne/2; ++l) {
float2 v;
dequantize_turbo4_0(vx, ib, iqs + l, v);
if constexpr (std::is_same_v<T, half>) {
dst_t[2*l + 0] = __float2half(v.x);
dst_t[2*l + 1] = __float2half(v.y);
} else {
dst_t[2*l + 0] = (T)v.x;
dst_t[2*l + 1] = (T)v.y;
}
}
GGML_UNUSED(x);
}
template <ggml_type type_K, int D, int nthreads>
constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
if constexpr (type_K == GGML_TYPE_F16) {
@@ -593,6 +647,12 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
} else if constexpr (type_K == GGML_TYPE_BF16) {
return vec_dot_fattn_vec_KQ_bf16<D, nthreads>;
// TurboQuant K-cache: Phase 1 - dequantize to FP16 before attention
// (use FP16 dot product after conversion in the dispatch layer)
} else if constexpr (type_K == GGML_TYPE_TURBO3_0) {
return vec_dot_fattn_vec_KQ_f16<D, nthreads>;
} else if constexpr (type_K == GGML_TYPE_TURBO4_0) {
return vec_dot_fattn_vec_KQ_f16<D, nthreads>;
} else {
static_assert(type_K == -1, "bad type");
return nullptr;
@@ -615,6 +675,10 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
return dequantize_V_q8_0<T, ne>;
} else if constexpr (type_V == GGML_TYPE_BF16) {
return dequantize_V_bf16<float, ne>;
} else if constexpr (type_V == GGML_TYPE_TURBO3_0) {
return dequantize_V_turbo3_0<T, ne>;
} else if constexpr (type_V == GGML_TYPE_TURBO4_0) {
return dequantize_V_turbo4_0<T, ne>;
} else {
static_assert(type_V == -1, "bad type");
return nullptr;
+16 -4
View File
@@ -75,17 +75,20 @@ static __global__ void flash_attn_ext_vec(
#endif // GGML_USE_HIP
constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device();
constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q;
constexpr int nthreads_V = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q;
constexpr bool K_is_fp_like = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16);
constexpr bool V_is_fp_like = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16);
constexpr int nthreads_KQ = K_is_fp_like ? 128 / cpy_nb : nthreads_KQ_q;
constexpr int nthreads_V = V_is_fp_like ? 128 / cpy_nb : nthreads_V_q;
static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V");
constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4;
constexpr int V_rows_per_thread = V_is_fp_like ? 2*cpy_ne : 4;
constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V;
constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16;
constexpr bool Q_q8_1 = !K_is_fp_like;
#ifdef V_DOT2_F32_F16_AVAILABLE
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half, V_rows_per_thread>();
#else
@@ -598,3 +601,12 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16)
// TurboQuant extern declarations (homogeneous K/V only)
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+86 -2
View File
@@ -5,6 +5,7 @@
#include "fattn-vec.cuh"
#include "fattn-wmma-f16.cuh"
#include "fattn.cuh"
#include "convert.cuh"
template <int DKQ, int DV, int ncols2>
static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -273,6 +274,8 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_BF16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_BF16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16)
// TurboQuant: pre-dequantized to f16 before FA, no turbo vec cases needed
#else
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
@@ -292,6 +295,10 @@ enum best_fattn_kernel {
BEST_FATTN_KERNEL_MMA_F16 = 400,
};
static bool ggml_type_is_turbo(ggml_type type) {
return type == GGML_TYPE_TURBO3_0 || type == GGML_TYPE_TURBO4_0;
}
static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const ggml_tensor * dst) {
#ifndef FLASH_ATTN_AVAILABLE
GGML_UNUSED(device); GGML_UNUSED(dst);
@@ -353,8 +360,13 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
}
#ifndef GGML_CUDA_FA_ALL_QUANTS
if (K->type != V->type) {
return BEST_FATTN_KERNEL_NONE;
{
// Turbo types are pre-dequantized to f16, so treat them as f16 for type matching
const ggml_type eff_k = ggml_type_is_turbo(K->type) ? GGML_TYPE_F16 : K->type;
const ggml_type eff_v = ggml_type_is_turbo(V->type) ? GGML_TYPE_F16 : V->type;
if (eff_k != eff_v) {
return BEST_FATTN_KERNEL_NONE;
}
}
#endif // GGML_CUDA_FA_ALL_QUANTS
@@ -372,6 +384,10 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
case GGML_TYPE_Q8_0:
case GGML_TYPE_BF16:
break;
case GGML_TYPE_TURBO3_0:
case GGML_TYPE_TURBO4_0:
// Turbo types are handled via pre-dequantize to f16 before FA
break;
default:
return BEST_FATTN_KERNEL_NONE;
}
@@ -485,8 +501,72 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_TILE;
}
// Pre-dequantize a turbo tensor to f16, returning a stack-allocated tensor copy.
// The caller must keep pool_buf alive until after FA completes.
static ggml_tensor turbo_pre_dequantize(
const ggml_tensor * src,
ggml_cuda_pool_alloc<half> & pool_buf,
cudaStream_t stream) {
const int64_t n_elements = ggml_nelements(src);
pool_buf.alloc(n_elements);
to_fp16_cuda_t dequant = ggml_get_to_fp16_cuda(src->type);
GGML_ASSERT(dequant != nullptr);
dequant(src->data, pool_buf.ptr, n_elements, stream);
// Scale existing strides from turbo block layout to f16 element layout.
// This preserves any permutation (e.g. ggml_permute swapping dims 1 and 2).
// The dequantized f16 data is in the same physical order as the turbo data,
// so the stride relationships must be preserved, just rescaled.
const size_t bs = ggml_blck_size(src->type);
const size_t ts = ggml_type_size(src->type);
ggml_tensor tmp = *src;
tmp.type = GGML_TYPE_F16;
tmp.data = pool_buf.ptr;
tmp.nb[0] = sizeof(half);
tmp.nb[1] = src->nb[1] * bs * sizeof(half) / ts;
tmp.nb[2] = src->nb[2] * bs * sizeof(half) / ts;
tmp.nb[3] = src->nb[3] * bs * sizeof(half) / ts;
tmp.view_src = nullptr;
tmp.view_offs = 0;
return tmp;
}
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_set_device(ctx.device);
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const bool k_is_turbo = ggml_type_is_turbo(K->type);
const bool v_is_turbo = V && ggml_type_is_turbo(V->type);
// Pre-dequantize turbo KV to f16 so standard FA kernels can handle them.
// Pool buffers must outlive the FA dispatch (RAII frees on scope exit).
ggml_cuda_pool_alloc<half> k_pool(ctx.pool());
ggml_cuda_pool_alloc<half> v_pool(ctx.pool());
ggml_tensor k_f16, v_f16;
cudaStream_t stream = ctx.stream();
// Save original src pointers
ggml_tensor * orig_k = dst->src[1];
ggml_tensor * orig_v = dst->src[2];
if (k_is_turbo) {
k_f16 = turbo_pre_dequantize(K, k_pool, stream);
dst->src[1] = &k_f16;
}
if (v_is_turbo) {
v_f16 = turbo_pre_dequantize(V, v_pool, stream);
dst->src[2] = &v_f16;
}
// Standard FA dispatch — now sees f16 tensors
switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) {
case BEST_FATTN_KERNEL_NONE:
GGML_ABORT("fatal error");
@@ -503,6 +583,10 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);
break;
}
// Restore original src pointers
dst->src[1] = orig_k;
dst->src[2] = orig_v;
}
bool ggml_cuda_flash_attn_ext_supported(int device, const ggml_tensor * dst) {
+2 -1
View File
@@ -4842,7 +4842,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
{
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO4_0) &&
op->src[0]->type == GGML_TYPE_F32 &&
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
} break;
+422
View File
@@ -1,8 +1,16 @@
#include "set-rows.cuh"
#include "cpy-utils.cuh"
#include <cmath>
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
// ============================================================
// TurboQuant constants for set-rows FWHT
// ============================================================
#define TURBO_HEAD_DIM_SR 128
#define TURBO_BLOCKS_PER_CHUNK_SR (TURBO_HEAD_DIM_SR / 32) // 4
// Generic quantized set_rows kernel template
template <typename idx_t, typename block_type, int qk, void (*quantize_func)(const float *, block_type *)>
static __global__ void k_set_rows_quant(const float * __restrict__ src0,
@@ -109,6 +117,398 @@ static void set_rows_cuda_quant(
}
}
// ============================================================
// TurboQuant specialized set-rows kernel with FWHT
// ============================================================
// Each CUDA block processes one 128-element chunk.
// 128 threads per block, one thread per element in the chunk.
// Steps:
// 1. Each thread reads one float from the source row
// 2. Cooperative norm computation via shared memory reduction
// 3. Normalize the chunk
// 4. FWHT butterfly in shared memory (7 stages for n=128)
// 5. Each thread scalar-quantizes its element and packs into blocks
// Device-side codebook references for turbo quantize (same as in cpy-utils.cuh)
__device__ static const float sr_codebook_3bit[8] = {
-0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f,
0.0216041461f, 0.0665854520f, 0.1181396281f, 0.1883970748f
};
__device__ static const float sr_codebook_4bit[16] = {
-0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f,
-0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f,
0.0112761586f, 0.0341139667f, 0.0577250301f, 0.0827738972f,
0.1102295202f, 0.1417455465f, 0.1807794468f, 0.2376153882f
};
static __device__ uint8_t sr_nearest_codebook(float val, const float *codebook, int n) {
float best_dist = fabsf(val - codebook[0]);
uint8_t best_idx = 0;
for (int i = 1; i < n; i++) {
float dist = fabsf(val - codebook[i]);
if (dist < best_dist) {
best_dist = dist;
best_idx = (uint8_t)i;
}
}
return best_idx;
}
// Turbo3 set-rows kernel: processes 128-element chunks with FWHT
template <typename idx_t>
static __global__ void k_set_rows_turbo3(
const float * __restrict__ src0,
const idx_t * __restrict__ src1,
block_turbo3_0 * __restrict__ dst,
const int64_t ne_total_chunks,
const int64_t ne10,
const int64_t ne11,
const int64_t ne12,
const int64_t ne13,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t s10,
const int64_t s11,
const int64_t s12,
const int64_t s1,
const int64_t s2,
const int64_t s3,
const int64_t ne00,
const uint3 ne00_fd,
const uint3 ne01_fd,
const uint3 ne02_fd,
const uint3 ne11_fd,
const uint3 ne12_fd) {
__shared__ float smem[TURBO_HEAD_DIM_SR];
__shared__ float reduction[TURBO_HEAD_DIM_SR];
const int64_t chunk_global = blockIdx.x;
const int tid = threadIdx.x; // 0..127
if (chunk_global >= ne_total_chunks) return;
// Map the global chunk index to i00 (element offset within a row) + row indices
// Each chunk covers 128 elements, so the chunk's base element = chunk_global * 128
const int64_t elem_base = chunk_global * TURBO_HEAD_DIM_SR;
uint32_t tmp = (uint32_t)elem_base;
uint2 div_mod;
div_mod = fast_div_modulo(tmp, ne00_fd);
const int64_t i00 = div_mod.y; // offset within row (multiple of 128)
tmp = div_mod.x;
div_mod = fast_div_modulo(tmp, ne01_fd);
const int64_t i01 = div_mod.y;
tmp = div_mod.x;
div_mod = fast_div_modulo(tmp, ne02_fd);
const int64_t i02 = div_mod.y;
const int64_t i03 = div_mod.x;
const int64_t i12 = fastmodulo((uint32_t)i03, ne12_fd);
const int64_t i11 = fastmodulo((uint32_t)i02, ne11_fd);
const int64_t i10 = i01;
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
const float val = src0_row[i00 + tid];
smem[tid] = val;
// Step 1: Compute L2 norm via parallel reduction
reduction[tid] = val * val;
__syncthreads();
for (int s = 64; s > 0; s >>= 1) {
if (tid < s) {
reduction[tid] += reduction[tid + s];
}
__syncthreads();
}
float norm = sqrtf(reduction[0]);
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
// Step 2: Normalize
smem[tid] *= inv_norm;
__syncthreads();
// Step 3: FWHT butterfly stages (7 stages for n=128)
for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) {
if (tid < 64) {
int group = tid / h;
int pos = tid % h;
int i = group * h * 2 + pos;
float a = smem[i];
float b = smem[i + h];
smem[i] = a + b;
smem[i + h] = a - b;
}
__syncthreads();
}
// Apply 1/sqrt(128) normalization
const float fwht_scale = 0.08838834764831844f;
smem[tid] *= fwht_scale;
__syncthreads();
// Step 4: Scalar quantize and pack into turbo3 blocks
// Each thread quantizes its element
uint8_t my_idx = sr_nearest_codebook(smem[tid], sr_codebook_3bit, 8);
// We need to pack 32 indices per block cooperatively
// Use shared memory to collect indices, then pack
// Reuse reduction[] as uint8 storage
((uint8_t *)reduction)[tid] = my_idx;
__syncthreads();
// Compute destination block pointer
// dst layout: dst_row*s1 + i02*s2 + i03*s3 gives byte offset to row start
// Then add block offset for i00
block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
const int64_t dst_block_base = i00 / TURBO3_BLOCK_SIZE;
// Only 4 threads (one per block) do the packing
if (tid < TURBO_BLOCKS_PER_CHUNK_SR) {
const int blk = tid;
block_turbo3_0 * dst_block = dst_row_ptr + dst_block_base + blk;
const uint8_t * indices = ((const uint8_t *)reduction) + blk * 32;
// Store norm
dst_block->d = __float2half(norm);
// Pack 32 x 3-bit indices into 12 bytes
memset(dst_block->qs, 0, 12);
for (int j = 0; j < 32; j++) {
int bit_off = j * 3;
int byte_pos = bit_off / 8;
int shift = bit_off % 8;
dst_block->qs[byte_pos] |= (uint8_t)((indices[j] & 0x07) << shift);
if (shift > 5 && byte_pos + 1 < 12) {
dst_block->qs[byte_pos + 1] |= (uint8_t)((indices[j] & 0x07) >> (8 - shift));
}
}
}
GGML_UNUSED(ne10);
GGML_UNUSED(ne11);
GGML_UNUSED(ne12);
GGML_UNUSED(ne13);
}
// Turbo4 set-rows kernel: processes 128-element chunks with FWHT
template <typename idx_t>
static __global__ void k_set_rows_turbo4(
const float * __restrict__ src0,
const idx_t * __restrict__ src1,
block_turbo4_0 * __restrict__ dst,
const int64_t ne_total_chunks,
const int64_t ne10,
const int64_t ne11,
const int64_t ne12,
const int64_t ne13,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t s10,
const int64_t s11,
const int64_t s12,
const int64_t s1,
const int64_t s2,
const int64_t s3,
const int64_t ne00,
const uint3 ne00_fd,
const uint3 ne01_fd,
const uint3 ne02_fd,
const uint3 ne11_fd,
const uint3 ne12_fd) {
__shared__ float smem[TURBO_HEAD_DIM_SR];
__shared__ float reduction[TURBO_HEAD_DIM_SR];
const int64_t chunk_global = blockIdx.x;
const int tid = threadIdx.x; // 0..127
if (chunk_global >= ne_total_chunks) return;
// Map the global chunk index to i00 (element offset within a row) + row indices
const int64_t elem_base = chunk_global * TURBO_HEAD_DIM_SR;
uint32_t tmp = (uint32_t)elem_base;
uint2 div_mod;
div_mod = fast_div_modulo(tmp, ne00_fd);
const int64_t i00 = div_mod.y;
tmp = div_mod.x;
div_mod = fast_div_modulo(tmp, ne01_fd);
const int64_t i01 = div_mod.y;
tmp = div_mod.x;
div_mod = fast_div_modulo(tmp, ne02_fd);
const int64_t i02 = div_mod.y;
const int64_t i03 = div_mod.x;
const int64_t i12 = fastmodulo((uint32_t)i03, ne12_fd);
const int64_t i11 = fastmodulo((uint32_t)i02, ne11_fd);
const int64_t i10 = i01;
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
const float val = src0_row[i00 + tid];
smem[tid] = val;
// Step 1: Compute L2 norm via parallel reduction
reduction[tid] = val * val;
__syncthreads();
for (int s = 64; s > 0; s >>= 1) {
if (tid < s) {
reduction[tid] += reduction[tid + s];
}
__syncthreads();
}
float norm = sqrtf(reduction[0]);
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
// Step 2: Normalize
smem[tid] *= inv_norm;
__syncthreads();
// Step 3: FWHT butterfly stages (7 stages for n=128)
for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) {
if (tid < 64) {
int group = tid / h;
int pos = tid % h;
int i = group * h * 2 + pos;
float a = smem[i];
float b = smem[i + h];
smem[i] = a + b;
smem[i + h] = a - b;
}
__syncthreads();
}
// Apply 1/sqrt(128) normalization
const float fwht_scale = 0.08838834764831844f;
smem[tid] *= fwht_scale;
__syncthreads();
// Step 4: Scalar quantize and pack into turbo4 blocks
uint8_t my_idx = sr_nearest_codebook(smem[tid], sr_codebook_4bit, 16);
// Collect indices in shared memory
((uint8_t *)reduction)[tid] = my_idx;
__syncthreads();
// Compute destination block pointer
block_turbo4_0 * dst_row_ptr = (block_turbo4_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
const int64_t dst_block_base = i00 / TURBO4_BLOCK_SIZE;
// Only 4 threads (one per block) do the packing
if (tid < TURBO_BLOCKS_PER_CHUNK_SR) {
const int blk = tid;
block_turbo4_0 * dst_block = dst_row_ptr + dst_block_base + blk;
const uint8_t * indices = ((const uint8_t *)reduction) + blk * 32;
// Store norm
dst_block->d = __float2half(norm);
// Pack 32 x 4-bit indices into 16 bytes
for (int j = 0; j < TURBO4_BLOCK_SIZE / 2; j++) {
dst_block->qs[j] = (indices[2*j] & 0x0F) | ((indices[2*j + 1] & 0x0F) << 4);
}
}
GGML_UNUSED(ne10);
GGML_UNUSED(ne11);
GGML_UNUSED(ne12);
GGML_UNUSED(ne13);
}
// Dispatch functions for turbo set-rows
template<typename idx_t>
static void set_rows_cuda_turbo3(
const float * src0_d, const idx_t * src1_d, block_turbo3_0 * dst_d,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const size_t nb01, const size_t nb02, const size_t nb03,
const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
GGML_ASSERT(ne00 % TURBO_HEAD_DIM_SR == 0);
const int64_t ne_total_chunks = (ne00 * ne01 * ne02 * ne03) / TURBO_HEAD_DIM_SR;
const dim3 grid_size((int)ne_total_chunks);
const dim3 block_size(TURBO_HEAD_DIM_SR);
const int64_t s01 = nb01/sizeof(float);
const int64_t s02 = nb02/sizeof(float);
const int64_t s03 = nb03/sizeof(float);
const int64_t s10 = nb10/sizeof(idx_t);
const int64_t s11 = nb11/sizeof(idx_t);
const int64_t s12 = nb12/sizeof(idx_t);
const int64_t s1 = nb1;
const int64_t s2 = nb2;
const int64_t s3 = nb3;
if (ne_total_chunks > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {
const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);
const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
k_set_rows_turbo3<idx_t><<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d, ne_total_chunks, ne10, ne11, ne12, ne13,
s01, s02, s03, s10, s11, s12, s1, s2, s3,
ne00, ne00_fd, ne01_fd, ne02_fd, ne11_fd, ne12_fd);
}
}
template<typename idx_t>
static void set_rows_cuda_turbo4(
const float * src0_d, const idx_t * src1_d, block_turbo4_0 * dst_d,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const size_t nb01, const size_t nb02, const size_t nb03,
const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
GGML_ASSERT(ne00 % TURBO_HEAD_DIM_SR == 0);
const int64_t ne_total_chunks = (ne00 * ne01 * ne02 * ne03) / TURBO_HEAD_DIM_SR;
const dim3 grid_size((int)ne_total_chunks);
const dim3 block_size(TURBO_HEAD_DIM_SR);
const int64_t s01 = nb01/sizeof(float);
const int64_t s02 = nb02/sizeof(float);
const int64_t s03 = nb03/sizeof(float);
const int64_t s10 = nb10/sizeof(idx_t);
const int64_t s11 = nb11/sizeof(idx_t);
const int64_t s12 = nb12/sizeof(idx_t);
const int64_t s1 = nb1;
const int64_t s2 = nb2;
const int64_t s3 = nb3;
if (ne_total_chunks > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {
const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);
const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
k_set_rows_turbo4<idx_t><<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d, ne_total_chunks, ne10, ne11, ne12, ne13,
s01, s02, s03, s10, s11, s12, s1, s2, s3,
ne00, ne00_fd, ne01_fd, ne02_fd, ne11_fd, ne12_fd);
}
}
template <typename src_t, typename idx_t, typename dst_t>
static __global__ void k_set_rows(const src_t * __restrict__ src0,
const idx_t * __restrict__ src1,
@@ -309,6 +709,28 @@ static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * s
nb1, nb2, nb3,
stream
);
} else if (dst->type == GGML_TYPE_TURBO3_0) {
// FWHT-aware 128-thread kernels for correct TurboQuant encoding
set_rows_cuda_turbo3<idx_t>(
src0_d, src1_d, (block_turbo3_0*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else if (dst->type == GGML_TYPE_TURBO4_0) {
// FWHT-aware 128-thread kernels for correct TurboQuant encoding
set_rows_cuda_turbo4<idx_t>(
src0_d, src1_d, (block_turbo4_0*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else {
GGML_ABORT("unsupported type %s", ggml_type_name(dst->type));
}
@@ -0,0 +1,7 @@
// TurboQuant flash attention template instances
#include "../fattn-vec.cuh"
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
@@ -0,0 +1,7 @@
// TurboQuant flash attention template instances
#include "../fattn-vec.cuh"
DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+3 -1
View File
@@ -75,7 +75,9 @@ else()
../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu
../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu
../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu)
../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu
../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu)
endif()
ggml_add_backend_library(ggml-hip
+344
View File
@@ -494,6 +494,342 @@ void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_REST
}
}
// ============================================================
// TurboQuant codebook data (Lloyd-Max optimal for d=128 after WHT)
// ============================================================
static const float turbo_codebook_3bit[8] = {
-0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f,
0.0216041461f, 0.0665854520f, 0.1181396281f, 0.1883970748f
};
static const float turbo_codebook_4bit[16] = {
-0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f,
-0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f,
0.0112761586f, 0.0341139667f, 0.0577250301f, 0.0827738972f,
0.1102295202f, 0.1417455465f, 0.1807794468f, 0.2376153882f
};
// ============================================================
// TurboQuant helper: pack/unpack bit-packed indices
// ============================================================
static void turbo_pack3(const uint8_t *indices, uint8_t *out) {
// Pack 32 x 3-bit values into 12 bytes (96 bits)
memset(out, 0, 12);
for (int i = 0; i < 32; i++) {
int bit_off = i * 3;
int byte_idx = bit_off / 8;
int shift = bit_off % 8;
out[byte_idx] |= (uint8_t)((indices[i] & 0x07) << shift);
if (shift > 5 && byte_idx + 1 < 12) {
out[byte_idx + 1] |= (uint8_t)((indices[i] & 0x07) >> (8 - shift));
}
}
}
static void turbo_unpack3(const uint8_t *packed, uint8_t *indices) {
// Unpack 12 bytes into 32 x 3-bit values
for (int i = 0; i < 32; i++) {
int bit_off = i * 3;
int byte_idx = bit_off / 8;
int shift = bit_off % 8;
uint16_t raw = (uint16_t)packed[byte_idx] >> shift;
if (shift > 5 && byte_idx + 1 < 12) {
raw |= (uint16_t)packed[byte_idx + 1] << (8 - shift);
}
indices[i] = (uint8_t)(raw & 0x07);
}
}
static void turbo_pack4(const uint8_t *indices, uint8_t *out) {
// Pack 32 x 4-bit values into 16 bytes
for (int i = 0; i < 16; i++) {
out[i] = (indices[2*i] & 0x0F) | ((indices[2*i + 1] & 0x0F) << 4);
}
}
static void turbo_unpack4(const uint8_t *packed, uint8_t *indices) {
// Unpack 16 bytes into 32 x 4-bit values
for (int i = 0; i < 16; i++) {
indices[2*i] = packed[i] & 0x0F;
indices[2*i + 1] = (packed[i] >> 4) & 0x0F;
}
}
static uint8_t turbo_quantize_scalar(float val, const float *codebook, int n_codes) {
// Find nearest codebook entry (linear scan - codebook is sorted)
float best_dist = fabsf(val - codebook[0]);
uint8_t best_idx = 0;
for (int i = 1; i < n_codes; i++) {
float dist = fabsf(val - codebook[i]);
if (dist < best_dist) {
best_dist = dist;
best_idx = (uint8_t)i;
}
}
return best_idx;
}
// ============================================================
// TurboQuant FWHT (Fast Walsh-Hadamard Transform)
// Self-inverse with 1/sqrt(n) normalization.
// ============================================================
#define TURBO_HEAD_DIM 128
#define TURBO_BLOCKS_PER_CHUNK (TURBO_HEAD_DIM / 32) // 4 blocks of 32 = 128
static void turbo_fwht_f32(float *x, int n) {
// Butterfly sums
for (int h = 1; h < n; h *= 2) {
for (int i = 0; i < n; i += h * 2) {
for (int j = i; j < i + h; j++) {
float a = x[j];
float b = x[j + h];
x[j] = a + b;
x[j + h] = a - b;
}
}
}
// Normalize by 1/sqrt(n)
float scale = 1.0f / sqrtf((float)n);
for (int i = 0; i < n; i++) {
x[i] *= scale;
}
}
// ============================================================
// TurboQuant TURBO3_0 (3-bit, 3.5 bpw)
// ============================================================
void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT src, block_turbo3_0 * GGML_RESTRICT dst, int64_t k) {
assert(k % TURBO3_BLOCK_SIZE == 0);
assert(k % TURBO_HEAD_DIM == 0);
float tmp[TURBO_HEAD_DIM];
int64_t blocks_done = 0;
for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) {
// Step 1: Compute L2 norm of this head_dim chunk
float sum_sq = 0.0f;
for (int i = 0; i < TURBO_HEAD_DIM; i++) {
sum_sq += src[offset + i] * src[offset + i];
}
float norm = sqrtf(sum_sq);
// Step 2: Normalize the chunk
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
for (int i = 0; i < TURBO_HEAD_DIM; i++) {
tmp[i] = src[offset + i] * inv_norm;
}
// Step 3: Apply FWHT rotation
turbo_fwht_f32(tmp, TURBO_HEAD_DIM);
// Step 4: Scalar quantize + pack, one block at a time
for (int blk = 0; blk < TURBO_BLOCKS_PER_CHUNK; blk++) {
uint8_t indices[32];
for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) {
indices[i] = turbo_quantize_scalar(tmp[blk * TURBO3_BLOCK_SIZE + i], turbo_codebook_3bit, 8);
}
// Store same norm in every block of this chunk
dst[blocks_done].d = GGML_FP32_TO_FP16(norm);
turbo_pack3(indices, dst[blocks_done].qs);
blocks_done++;
}
}
}
static void quantize_row_turbo3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t k) {
quantize_row_turbo3_0_ref(src, (block_turbo3_0 *)dst, k);
}
void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % TURBO3_BLOCK_SIZE == 0);
const int64_t num_blocks = k / TURBO3_BLOCK_SIZE;
// Pass 1: Unpack all blocks and look up centroids
for (int64_t b = 0; b < num_blocks; b++) {
uint8_t indices[32];
turbo_unpack3(x[b].qs, indices);
for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) {
y[b * TURBO3_BLOCK_SIZE + i] = turbo_codebook_3bit[indices[i]];
}
}
// Pass 2: Inverse FWHT per head_dim chunk, then scale by norm
for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) {
int chunk = TURBO_HEAD_DIM;
if (offset + chunk > k) chunk = (int)(k - offset);
// Inverse FWHT (self-inverse with 1/sqrt(n) normalization)
turbo_fwht_f32(y + offset, chunk);
// Read norm from the first block of this chunk
float norm = GGML_FP16_TO_FP32(x[offset / TURBO3_BLOCK_SIZE].d);
for (int i = 0; i < chunk; i++) {
y[offset + i] *= norm;
}
}
}
// ============================================================
// TurboQuant TURBO4_0 (4-bit, 4.5 bpw)
// ============================================================
void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT src, block_turbo4_0 * GGML_RESTRICT dst, int64_t k) {
assert(k % TURBO4_BLOCK_SIZE == 0);
assert(k % TURBO_HEAD_DIM == 0);
float tmp[TURBO_HEAD_DIM];
int64_t blocks_done = 0;
for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) {
// Step 1: Compute L2 norm of this head_dim chunk
float sum_sq = 0.0f;
for (int i = 0; i < TURBO_HEAD_DIM; i++) {
sum_sq += src[offset + i] * src[offset + i];
}
float norm = sqrtf(sum_sq);
// Step 2: Normalize the chunk
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
for (int i = 0; i < TURBO_HEAD_DIM; i++) {
tmp[i] = src[offset + i] * inv_norm;
}
// Step 3: Apply FWHT rotation
turbo_fwht_f32(tmp, TURBO_HEAD_DIM);
// Step 4: Scalar quantize + pack, one block at a time
for (int blk = 0; blk < TURBO_BLOCKS_PER_CHUNK; blk++) {
uint8_t indices[32];
for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) {
indices[i] = turbo_quantize_scalar(tmp[blk * TURBO4_BLOCK_SIZE + i], turbo_codebook_4bit, 16);
}
// Store same norm in every block of this chunk
dst[blocks_done].d = GGML_FP32_TO_FP16(norm);
turbo_pack4(indices, dst[blocks_done].qs);
blocks_done++;
}
}
}
static void quantize_row_turbo4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t k) {
quantize_row_turbo4_0_ref(src, (block_turbo4_0 *)dst, k);
}
void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
assert(k % TURBO4_BLOCK_SIZE == 0);
const int64_t num_blocks = k / TURBO4_BLOCK_SIZE;
// Pass 1: Unpack all blocks and look up centroids
for (int64_t b = 0; b < num_blocks; b++) {
uint8_t indices[32];
turbo_unpack4(x[b].qs, indices);
for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) {
y[b * TURBO4_BLOCK_SIZE + i] = turbo_codebook_4bit[indices[i]];
}
}
// Pass 2: Inverse FWHT per head_dim chunk, then scale by norm
for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) {
int chunk = TURBO_HEAD_DIM;
if (offset + chunk > k) chunk = (int)(k - offset);
// Inverse FWHT (self-inverse with 1/sqrt(n) normalization)
turbo_fwht_f32(y + offset, chunk);
// Read norm from the first block of this chunk
float norm = GGML_FP16_TO_FP32(x[offset / TURBO4_BLOCK_SIZE].d);
for (int i = 0; i < chunk; i++) {
y[offset + i] *= norm;
}
}
}
// ============================================================
// TurboQuant vec_dot (for flash attention compatibility)
// ============================================================
void ggml_vec_dot_turbo3_0(int n, float * GGML_RESTRICT s, size_t bs,
const void * GGML_RESTRICT vx, size_t bx,
const void * GGML_RESTRICT vy, size_t by, int nrc) {
const block_turbo3_0 *x = (const block_turbo3_0 *)vx;
const float *y = (const float *)vy;
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
// Dequantize x into temp buffer (includes inverse FWHT + norm scaling),
// then compute dot product with y.
float tmp[TURBO_HEAD_DIM];
float sum = 0.0f;
for (int64_t offset = 0; offset < n; offset += TURBO_HEAD_DIM) {
int chunk = TURBO_HEAD_DIM;
if (offset + chunk > n) chunk = (int)(n - offset);
int64_t base_block = offset / TURBO3_BLOCK_SIZE;
// Unpack + centroid lookup for this chunk
for (int blk = 0; blk < chunk / TURBO3_BLOCK_SIZE; blk++) {
uint8_t indices[32];
turbo_unpack3(x[base_block + blk].qs, indices);
for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) {
tmp[blk * TURBO3_BLOCK_SIZE + i] = turbo_codebook_3bit[indices[i]];
}
}
// Inverse FWHT
turbo_fwht_f32(tmp, chunk);
// Scale by norm and accumulate dot product
float norm = GGML_FP16_TO_FP32(x[base_block].d);
for (int i = 0; i < chunk; i++) {
sum += tmp[i] * norm * y[offset + i];
}
}
*s = sum;
}
void ggml_vec_dot_turbo4_0(int n, float * GGML_RESTRICT s, size_t bs,
const void * GGML_RESTRICT vx, size_t bx,
const void * GGML_RESTRICT vy, size_t by, int nrc) {
const block_turbo4_0 *x = (const block_turbo4_0 *)vx;
const float *y = (const float *)vy;
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
// Dequantize x into temp buffer (includes inverse FWHT + norm scaling),
// then compute dot product with y.
float tmp[TURBO_HEAD_DIM];
float sum = 0.0f;
for (int64_t offset = 0; offset < n; offset += TURBO_HEAD_DIM) {
int chunk = TURBO_HEAD_DIM;
if (offset + chunk > n) chunk = (int)(n - offset);
int64_t base_block = offset / TURBO4_BLOCK_SIZE;
// Unpack + centroid lookup for this chunk
for (int blk = 0; blk < chunk / TURBO4_BLOCK_SIZE; blk++) {
uint8_t indices[32];
turbo_unpack4(x[base_block + blk].qs, indices);
for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) {
tmp[blk * TURBO4_BLOCK_SIZE + i] = turbo_codebook_4bit[indices[i]];
}
}
// Inverse FWHT
turbo_fwht_f32(tmp, chunk);
// Scale by norm and accumulate dot product
float norm = GGML_FP16_TO_FP32(x[base_block].d);
for (int i = 0; i < chunk; i++) {
sum += tmp[i] * norm * y[offset + i];
}
}
*s = sum;
}
//
// 2-6 bit quantization in super-blocks
//
@@ -5353,6 +5689,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb);
} break;
case GGML_TYPE_TURBO3_0:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_turbo3_0, data, nb);
} break;
case GGML_TYPE_TURBO4_0:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_turbo4_0, data, nb);
} break;
case GGML_TYPE_IQ1_S:
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
+6
View File
@@ -24,6 +24,9 @@ GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 *
GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT x, block_turbo3_0 * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
@@ -51,6 +54,9 @@ GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GG
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+16
View File
@@ -904,6 +904,22 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.type_size = 0,
.is_quantized = false,
},
[GGML_TYPE_TURBO3_0] = {
.type_name = "turbo3",
.blck_size = TURBO3_BLOCK_SIZE,
.type_size = sizeof(block_turbo3_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_turbo3_0,
.from_float_ref = (ggml_from_float_t) quantize_row_turbo3_0_ref,
},
[GGML_TYPE_TURBO4_0] = {
.type_name = "turbo4",
.blck_size = TURBO4_BLOCK_SIZE,
.type_size = sizeof(block_turbo4_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_turbo4_0,
.from_float_ref = (ggml_from_float_t) quantize_row_turbo4_0_ref,
},
};
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
+16
View File
@@ -132,6 +132,22 @@ llama_kv_cache::llama_kv_cache(
throw std::runtime_error("failed to create ggml context for kv cache");
}
// TurboQuant requires head_dim=128 for the FWHT transform
if (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0) {
const uint32_t n_embd_head_k = hparams.n_embd_head_k(il);
if (n_embd_head_k != 128) {
LLAMA_LOG_ERROR("%s: TurboQuant requires head_dim=128, got %d (layer %d)\n", __func__, n_embd_head_k, il);
throw std::runtime_error("turbo types require head_dim=128");
}
}
if (type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0) {
const uint32_t n_embd_head_v = hparams.n_embd_head_v(il);
if (n_embd_head_v != 128) {
LLAMA_LOG_ERROR("%s: TurboQuant requires head_dim=128, got %d (layer %d)\n", __func__, n_embd_head_v, il);
throw std::runtime_error("turbo types require head_dim=128");
}
}
const bool has_k = true;
const bool has_v = !is_mla;
+4
View File
@@ -254,6 +254,10 @@ if (NOT GGML_BACKEND_DL)
llama_build_and_test(test-quantize-fns.cpp)
llama_build_and_test(test-quantize-perf.cpp)
llama_build_and_test(test-rope.cpp)
# TurboQuant CPU reference tests (FWHT, MSE, bitpack)
llama_build(test-turboquant.cpp)
llama_test(test-turboquant)
endif()
# libmtmd
+217
View File
@@ -0,0 +1,217 @@
// TurboQuant CPU reference tests: FWHT self-inverse, roundtrip MSE, bit-packing
//
// Validates that the quantize/dequantize pipeline in ggml-quants.c produces
// MSE*d values consistent with the paper (Zandieh et al., ICLR 2026).
#include "ggml.h"
#undef NDEBUG
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <vector>
static constexpr int HEAD_DIM = 128;
static constexpr int BLOCK_SIZE = 32;
static constexpr int BLOCKS_PER_CHUNK = HEAD_DIM / BLOCK_SIZE;
static constexpr int N_VECTORS = 10000;
static constexpr int N_ELEMENTS = N_VECTORS * HEAD_DIM;
// Expected MSE*d ranges (paper: TQ3 ~0.034, TQ4 ~0.009 for d=128)
static constexpr float TQ3_MSE_D_MIN = 0.025f;
static constexpr float TQ3_MSE_D_MAX = 0.045f;
static constexpr float TQ4_MSE_D_MIN = 0.005f;
static constexpr float TQ4_MSE_D_MAX = 0.015f;
// ============================================================
// FWHT reference (must match ggml-quants.c)
// ============================================================
static void fwht_f32(float * x, int n) {
for (int h = 1; h < n; h *= 2) {
for (int i = 0; i < n; i += h * 2) {
for (int j = i; j < i + h; j++) {
float a = x[j];
float b = x[j + h];
x[j] = a + b;
x[j + h] = a - b;
}
}
}
float scale = 1.0f / sqrtf((float)n);
for (int i = 0; i < n; i++) {
x[i] *= scale;
}
}
// ============================================================
// Test 1: FWHT self-inverse (FWHT(FWHT(x)) == x)
// ============================================================
static int test_fwht_self_inverse(void) {
printf(" FWHT self-inverse (d=%d)... ", HEAD_DIM);
float orig[HEAD_DIM];
float work[HEAD_DIM];
for (int i = 0; i < HEAD_DIM; i++) {
orig[i] = sinf((float)(i + 1) * 0.7f) * 2.0f;
}
memcpy(work, orig, sizeof(orig));
fwht_f32(work, HEAD_DIM);
fwht_f32(work, HEAD_DIM);
float max_err = 0.0f;
for (int i = 0; i < HEAD_DIM; i++) {
float err = fabsf(work[i] - orig[i]);
if (err > max_err) { max_err = err; }
}
bool pass = max_err < 1e-5f;
printf("max_err=%.2e %s\n", max_err, pass ? "ok" : "FAILED");
return pass ? 0 : 1;
}
// ============================================================
// Test 2 & 3: Roundtrip MSE for TQ3 and TQ4
// ============================================================
static void generate_random_vectors(float * dst, int n_elements, unsigned int seed) {
unsigned int state = seed;
for (int i = 0; i < n_elements; i++) {
state = state * 1664525u + 1013904223u;
dst[i] = ((float)(state >> 8) / (float)(1 << 24)) * 2.0f - 1.0f;
}
}
static int test_roundtrip_mse(ggml_type type, float mse_d_min, float mse_d_max) {
const char * name = ggml_type_name(type);
printf(" %s roundtrip MSE*d (n=%d, d=%d)... ", name, N_VECTORS, HEAD_DIM);
const ggml_type_traits * traits = ggml_get_type_traits(type);
assert(traits->from_float_ref != nullptr);
assert(traits->to_float != nullptr);
std::vector<float> src(N_ELEMENTS);
std::vector<float> dst(N_ELEMENTS);
size_t quant_size = (size_t)N_ELEMENTS / BLOCK_SIZE * ggml_type_size(type);
std::vector<uint8_t> quant(quant_size);
generate_random_vectors(src.data(), N_ELEMENTS, 42);
traits->from_float_ref(src.data(), quant.data(), N_ELEMENTS);
traits->to_float(quant.data(), dst.data(), N_ELEMENTS);
// MSE*d = E[ ||x - x̃||² / ||x||² ] (normalized reconstruction error)
double total_nmse = 0.0;
for (int v = 0; v < N_VECTORS; v++) {
double err_sq = 0.0;
double norm_sq = 0.0;
for (int i = 0; i < HEAD_DIM; i++) {
double diff = (double)src[v * HEAD_DIM + i] - (double)dst[v * HEAD_DIM + i];
err_sq += diff * diff;
norm_sq += (double)src[v * HEAD_DIM + i] * (double)src[v * HEAD_DIM + i];
}
if (norm_sq > 1e-20) {
total_nmse += err_sq / norm_sq;
}
}
float mse_d = (float)(total_nmse / N_VECTORS);
bool pass = mse_d >= mse_d_min && mse_d <= mse_d_max;
printf("MSE*d=%.4f [%.3f..%.3f] %s\n", mse_d, mse_d_min, mse_d_max, pass ? "ok" : "FAILED");
return pass ? 0 : 1;
}
// ============================================================
// Test 4: Bit-pack determinism and sanity
// ============================================================
static int test_bitpack_deterministic(ggml_type type) {
const char * name = ggml_type_name(type);
printf(" %s pack determinism... ", name);
const ggml_type_traits * traits = ggml_get_type_traits(type);
float src[HEAD_DIM];
for (int i = 0; i < HEAD_DIM; i++) {
src[i] = cosf((float)i * 0.31415f);
}
size_t qsize = BLOCKS_PER_CHUNK * ggml_type_size(type);
std::vector<uint8_t> q1(qsize);
std::vector<uint8_t> q2(qsize);
traits->from_float_ref(src, q1.data(), HEAD_DIM);
traits->from_float_ref(src, q2.data(), HEAD_DIM);
bool pass = memcmp(q1.data(), q2.data(), qsize) == 0;
printf("%s\n", pass ? "ok" : "FAILED (non-deterministic)");
return pass ? 0 : 1;
}
static int test_bitpack_sanity(ggml_type type) {
const char * name = ggml_type_name(type);
printf(" %s dequantize sanity... ", name);
const ggml_type_traits * traits = ggml_get_type_traits(type);
float src[HEAD_DIM];
float dst[HEAD_DIM];
for (int i = 0; i < HEAD_DIM; i++) {
src[i] = cosf((float)i * 0.31415f);
}
size_t qsize = BLOCKS_PER_CHUNK * ggml_type_size(type);
std::vector<uint8_t> q(qsize);
traits->from_float_ref(src, q.data(), HEAD_DIM);
traits->to_float(q.data(), dst, HEAD_DIM);
bool all_finite = true;
bool any_nonzero = false;
for (int i = 0; i < HEAD_DIM; i++) {
if (!std::isfinite(dst[i])) { all_finite = false; }
if (fabsf(dst[i]) > 1e-10f) { any_nonzero = true; }
}
bool pass = all_finite && any_nonzero;
printf("finite=%s nonzero=%s %s\n",
all_finite ? "yes" : "NO",
any_nonzero ? "yes" : "NO",
pass ? "ok" : "FAILED");
return pass ? 0 : 1;
}
// ============================================================
// Main
// ============================================================
int main(void) {
printf("TurboQuant CPU reference tests\n");
printf("==============================\n\n");
int n_fail = 0;
printf("Test 1: FWHT self-inverse\n");
n_fail += test_fwht_self_inverse();
printf("\nTest 2: TQ3 roundtrip MSE\n");
n_fail += test_roundtrip_mse(GGML_TYPE_TURBO3_0, TQ3_MSE_D_MIN, TQ3_MSE_D_MAX);
printf("\nTest 3: TQ4 roundtrip MSE\n");
n_fail += test_roundtrip_mse(GGML_TYPE_TURBO4_0, TQ4_MSE_D_MIN, TQ4_MSE_D_MAX);
printf("\nTest 4: Bit-pack tests\n");
n_fail += test_bitpack_deterministic(GGML_TYPE_TURBO3_0);
n_fail += test_bitpack_deterministic(GGML_TYPE_TURBO4_0);
n_fail += test_bitpack_sanity(GGML_TYPE_TURBO3_0);
n_fail += test_bitpack_sanity(GGML_TYPE_TURBO4_0);
printf("\n==============================\n");
printf("%d/%d tests passed\n", 7 - n_fail, 7);
return n_fail > 0 ? 1 : 0;
}
+6
View File
@@ -483,6 +483,12 @@ static ggml_type ggml_type_from_name(const std::string & s) {
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
if (s == "turbo3") {
return GGML_TYPE_TURBO3_0;
}
if (s == "turbo4") {
return GGML_TYPE_TURBO4_0;
}
return GGML_TYPE_COUNT;
}