diff --git a/common/arg.cpp b/common/arg.cpp index 538d2a4b0..366b148f2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -387,6 +387,8 @@ const std::vector kv_cache_types = { GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, + GGML_TYPE_TURBO3_0, + GGML_TYPE_TURBO4_0, }; static ggml_type kv_cache_type_from_str(const std::string & s) { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 669f66b65..5792e8969 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -428,7 +428,9 @@ extern "C" { // GGML_TYPE_IQ4_NL_8_8 = 38, GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) - GGML_TYPE_COUNT = 41, + GGML_TYPE_TURBO3_0 = 41, // TurboQuant 3-bit KV-cache (3.5 bpw) + GGML_TYPE_TURBO4_0 = 42, // TurboQuant 4-bit KV-cache (4.5 bpw) + GGML_TYPE_COUNT = 43, }; // precision diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 92cf739e7..79d635024 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -205,6 +205,26 @@ typedef struct { } block_nvfp4; static_assert(sizeof(block_nvfp4) == sizeof(uint8_t)*(QK_NVFP4/QK_NVFP4_SUB) + QK_NVFP4/2, "wrong nvfp4 block size/padding"); +// TurboQuant 3-bit KV-cache quantization (3.5 bpw) +#define TURBO3_BLOCK_SIZE 32 +#define QK_TURBO3 32 +#define QR_TURBO3 2 +typedef struct { + ggml_half d; // FP16 L2-norm + uint8_t qs[12]; // 32 x 3-bit packed indices +} block_turbo3_0; +static_assert(sizeof(block_turbo3_0) == 14, "wrong turbo3 block size"); + +// TurboQuant 4-bit KV-cache quantization (4.5 bpw) +#define TURBO4_BLOCK_SIZE 32 +#define QK_TURBO4 32 +#define QR_TURBO4 2 +typedef struct { + ggml_half d; // FP16 L2-norm + uint8_t qs[16]; // 32 x 4-bit packed indices +} block_turbo4_0; +static_assert(sizeof(block_turbo4_0) == 18, "wrong turbo4 block size"); + #define QK5_0 32 typedef struct { ggml_half d; // delta diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index 419862101..1824afe4a 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -120,7 +120,9 @@ if (CUDAToolkit_FOUND) template-instances/fattn-vec-instance-f16-f16.cu template-instances/fattn-vec-instance-q4_0-q4_0.cu template-instances/fattn-vec-instance-q8_0-q8_0.cu - template-instances/fattn-vec-instance-bf16-bf16.cu) + template-instances/fattn-vec-instance-bf16-bf16.cu + template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu + template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu) endif() ggml_add_backend_library(ggml-cuda diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 79ccfe568..1aa64b65d 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -656,6 +656,140 @@ static void dequantize_row_nvfp4_cuda( const int nb = k / QK_NVFP4; dequantize_block_nvfp4<<>>(vx, y, k); } +// ============================================================ +// TurboQuant GPU bulk dequantize kernels (with FWHT) +// ============================================================ + +// Each CUDA block processes one 128-element chunk (= 4 turbo blocks). +// 128 threads per block, one thread per element. +// Step 1: unpack index + centroid lookup -> shared memory +// Step 2: FWHT butterfly in shared memory (7 stages for n=128) +// Step 3: normalize by 1/sqrt(128) and scale by stored norm +// Step 4: write to output + +#define TURBO_HEAD_DIM_GPU 128 +#define TURBO_BLOCKS_PER_CHUNK_GPU (TURBO_HEAD_DIM_GPU / 32) // 4 + +template +static __global__ void dequantize_block_turbo3_0_kernel(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { + __shared__ float smem[TURBO_HEAD_DIM_GPU]; + + const int64_t chunk_idx = blockIdx.x; + const int tid = threadIdx.x; // 0..127 + + const int64_t output_offset = chunk_idx * TURBO_HEAD_DIM_GPU; + if (output_offset + tid >= k) return; + + // Which of the 4 blocks within this chunk does this thread belong to? + const int local_block = tid / TURBO3_BLOCK_SIZE; // 0..3 + const int elem_in_block = tid % TURBO3_BLOCK_SIZE; // 0..31 + + const int64_t global_block_idx = chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU + local_block; + + // Unpack 3-bit index and look up centroid + const block_turbo3_0 * x = (const block_turbo3_0 *)vx + global_block_idx; + const uint8_t * qs = x->qs; + + int bit_off = elem_in_block * 3; + int byte_idx = bit_off / 8; + int shift = bit_off % 8; + uint16_t raw = (uint16_t)qs[byte_idx] >> shift; + if (shift > 5 && byte_idx + 1 < 12) + raw |= (uint16_t)qs[byte_idx + 1] << (8 - shift); + uint8_t idx = (uint8_t)(raw & 0x07); + + smem[tid] = dc_codebook_3bit[idx]; + __syncthreads(); + + // FWHT butterfly stages (7 stages for n=128) + for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) { + if (tid < 64) { // 128/2 = 64 butterflies per stage + int group = tid / h; + int pos = tid % h; + int i = group * h * 2 + pos; + float a = smem[i]; + float b = smem[i + h]; + smem[i] = a + b; + smem[i + h] = a - b; + } + __syncthreads(); + } + + // Normalize by 1/sqrt(128) and scale by stored norm + const float fwht_scale = 0.08838834764831844f; // 1/sqrt(128) + const block_turbo3_0 * first_block = (const block_turbo3_0 *)vx + chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU; + float norm = __half2float(first_block->d); + smem[tid] *= fwht_scale * norm; + __syncthreads(); + + // Write to output + y[output_offset + tid] = ggml_cuda_cast(smem[tid]); +} + +template +static void dequantize_row_turbo3_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + GGML_ASSERT(k % TURBO_HEAD_DIM_GPU == 0); + const int num_chunks = (int)(k / TURBO_HEAD_DIM_GPU); + dequantize_block_turbo3_0_kernel<<>>(vx, y, k); +} + +template +static __global__ void dequantize_block_turbo4_0_kernel(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { + __shared__ float smem[TURBO_HEAD_DIM_GPU]; + + const int64_t chunk_idx = blockIdx.x; + const int tid = threadIdx.x; // 0..127 + + const int64_t output_offset = chunk_idx * TURBO_HEAD_DIM_GPU; + if (output_offset + tid >= k) return; + + // Which of the 4 blocks within this chunk does this thread belong to? + const int local_block = tid / TURBO4_BLOCK_SIZE; // 0..3 + const int elem_in_block = tid % TURBO4_BLOCK_SIZE; // 0..31 + + const int64_t global_block_idx = chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU + local_block; + + // Unpack 4-bit index and look up centroid + const block_turbo4_0 * x = (const block_turbo4_0 *)vx + global_block_idx; + int pair_idx = elem_in_block / 2; + uint8_t packed = x->qs[pair_idx]; + uint8_t idx = (elem_in_block & 1) ? ((packed >> 4) & 0x0F) : (packed & 0x0F); + + smem[tid] = dc_codebook_4bit[idx]; + __syncthreads(); + + // FWHT butterfly stages (7 stages for n=128) + for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) { + if (tid < 64) { // 128/2 = 64 butterflies per stage + int group = tid / h; + int pos = tid % h; + int i = group * h * 2 + pos; + float a = smem[i]; + float b = smem[i + h]; + smem[i] = a + b; + smem[i + h] = a - b; + } + __syncthreads(); + } + + // Normalize by 1/sqrt(128) and scale by stored norm + const float fwht_scale = 0.08838834764831844f; // 1/sqrt(128) + const block_turbo4_0 * first_block = (const block_turbo4_0 *)vx + chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU; + float norm = __half2float(first_block->d); + smem[tid] *= fwht_scale * norm; + __syncthreads(); + + // Write to output + y[output_offset + tid] = ggml_cuda_cast(smem[tid]); +} + +template +static void dequantize_row_turbo4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + GGML_ASSERT(k % TURBO_HEAD_DIM_GPU == 0); + const int num_chunks = (int)(k / TURBO_HEAD_DIM_GPU); + dequantize_block_turbo4_0_kernel<<>>(vx, y, k); +} + template static __global__ void convert_unary( const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, @@ -756,6 +890,10 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_mxfp4_cuda; case GGML_TYPE_NVFP4: return dequantize_row_nvfp4_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_row_turbo3_0_cuda; + case GGML_TYPE_TURBO4_0: + return dequantize_row_turbo4_0_cuda; case GGML_TYPE_F32: return convert_unary_cont_cuda; case GGML_TYPE_BF16: @@ -809,6 +947,10 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_mxfp4_cuda; case GGML_TYPE_NVFP4: return dequantize_row_nvfp4_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_row_turbo3_0_cuda; + case GGML_TYPE_TURBO4_0: + return dequantize_row_turbo4_0_cuda; case GGML_TYPE_F16: return convert_unary_cont_cuda; case GGML_TYPE_BF16: @@ -832,6 +974,10 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) { return dequantize_block_cuda; case GGML_TYPE_Q8_0: return dequantize_block_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_block_cuda; + case GGML_TYPE_TURBO4_0: + return dequantize_block_cuda; case GGML_TYPE_BF16: return convert_unary_cuda; default: @@ -853,6 +999,10 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) { return dequantize_block_cuda; case GGML_TYPE_Q8_0: return dequantize_block_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_block_cuda; + case GGML_TYPE_TURBO4_0: + return dequantize_block_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -874,6 +1024,10 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) { return dequantize_block_cuda; case GGML_TYPE_Q8_0: return dequantize_block_cuda; + case GGML_TYPE_TURBO3_0: + return dequantize_block_cuda; + case GGML_TYPE_TURBO4_0: + return dequantize_block_cuda; case GGML_TYPE_BF16: return convert_unary_cuda; default: diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh index 7697c292d..108e78acb 100644 --- a/ggml/src/ggml-cuda/cpy-utils.cuh +++ b/ggml/src/ggml-cuda/cpy-utils.cuh @@ -211,6 +211,93 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti); } +// ============================================================ +// TurboQuant GPU quantize device functions +// ============================================================ + +// Device-side codebook references (same values as in dequantize.cuh) +// These are declared extern to reference the __constant__ arrays from dequantize.cuh +// Note: we re-declare small local codebooks here to avoid linkage issues. +__device__ static const float tq_codebook_3bit_q[8] = { + -0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f, + 0.0216041461f, 0.0665854520f, 0.1181396281f, 0.1883970748f +}; + +__device__ static const float tq_codebook_4bit_q[16] = { + -0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f, + -0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f, + 0.0112761586f, 0.0341139667f, 0.0577250301f, 0.0827738972f, + 0.1102295202f, 0.1417455465f, 0.1807794468f, 0.2376153882f +}; + +static __device__ uint8_t tq_nearest_codebook(float val, const float *codebook, int n) { + float best_dist = fabsf(val - codebook[0]); + uint8_t best_idx = 0; + for (int i = 1; i < n; i++) { + float dist = fabsf(val - codebook[i]); + if (dist < best_dist) { + best_dist = dist; + best_idx = (uint8_t)i; + } + } + return best_idx; +} + +static __device__ void quantize_f32_turbo3_0_block(const float * __restrict__ x, block_turbo3_0 * __restrict__ y) { + // Compute block norm + float sum_sq = 0.0f; + for (int j = 0; j < TURBO3_BLOCK_SIZE; j++) { + sum_sq += x[j] * x[j]; + } + float norm = sqrtf(sum_sq); + y->d = __float2half(norm); + float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f; + + // Quantize NORMALIZED elements to nearest 3-bit codebook entry and pack + uint8_t indices[32]; + for (int j = 0; j < TURBO3_BLOCK_SIZE; j++) { + indices[j] = tq_nearest_codebook(x[j] * inv_norm, tq_codebook_3bit_q, 8); + } + + // Pack 32 x 3-bit values into 12 bytes + memset(y->qs, 0, 12); + for (int j = 0; j < 32; j++) { + int bit_off = j * 3; + int byte_idx = bit_off / 8; + int shift = bit_off % 8; + y->qs[byte_idx] |= (uint8_t)((indices[j] & 0x07) << shift); + if (shift > 5 && byte_idx + 1 < 12) { + y->qs[byte_idx + 1] |= (uint8_t)((indices[j] & 0x07) >> (8 - shift)); + } + } +} + +static __device__ void quantize_f32_turbo4_0_block(const float * __restrict__ x, block_turbo4_0 * __restrict__ y) { + // Compute block norm + float sum_sq = 0.0f; + for (int j = 0; j < TURBO4_BLOCK_SIZE; j++) { + sum_sq += x[j] * x[j]; + } + float norm = sqrtf(sum_sq); + y->d = __float2half(norm); + float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f; + + // Quantize NORMALIZED elements to nearest 4-bit codebook entry and pack + for (int j = 0; j < TURBO4_BLOCK_SIZE / 2; j++) { + uint8_t idx0 = tq_nearest_codebook(x[2*j] * inv_norm, tq_codebook_4bit_q, 16); + uint8_t idx1 = tq_nearest_codebook(x[2*j + 1] * inv_norm, tq_codebook_4bit_q, 16); + y->qs[j] = (idx0 & 0x0F) | ((idx1 & 0x0F) << 4); + } +} + +static __device__ void cpy_blck_f32_turbo3_0(const char * cxi, char * cdsti) { + quantize_f32_turbo3_0_block((const float *)cxi, (block_turbo3_0 *)cdsti); +} + +static __device__ void cpy_blck_f32_turbo4_0(const char * cxi, char * cdsti) { + quantize_f32_turbo4_0_block((const float *)cxi, (block_turbo4_0 *)cdsti); +} + template static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) { *(dst_t *) cdsti = ggml_cuda_cast(*(const src_t *) cxi); diff --git a/ggml/src/ggml-cuda/dequantize.cuh b/ggml/src/ggml-cuda/dequantize.cuh index e060fb29f..1e4353f43 100644 --- a/ggml/src/ggml-cuda/dequantize.cuh +++ b/ggml/src/ggml-cuda/dequantize.cuh @@ -75,3 +75,67 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in v.x *= d; v.y *= d; } + +// ============================================================ +// TurboQuant GPU dequantize device functions +// ============================================================ + +__device__ __constant__ static float dc_codebook_3bit[8] = { + -0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f, + 0.0216041461f, 0.0665854520f, 0.1181396281f, 0.1883970748f +}; + +__device__ __constant__ static float dc_codebook_4bit[16] = { + -0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f, + -0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f, + 0.0112761586f, 0.0341139667f, 0.0577250301f, 0.0827738972f, + 0.1102295202f, 0.1417455465f, 0.1807794468f, 0.2376153882f +}; + +static __device__ __forceinline__ void dequantize_turbo3_0( + const void * vx, const int64_t ib, const int iqs, float2 & v) +{ + const block_turbo3_0 * x = (const block_turbo3_0 *) vx + ib; + const uint8_t * qs = x->qs; + + // Unpack two consecutive 3-bit indices + int elem0 = iqs * 2; + int elem1 = iqs * 2 + 1; + + // Extract 3-bit value for elem0 + int bit_off0 = elem0 * 3; + int byte0 = bit_off0 / 8; + int shift0 = bit_off0 % 8; + uint16_t raw0 = (uint16_t)qs[byte0] >> shift0; + if (shift0 > 5 && byte0 + 1 < 12) + raw0 |= (uint16_t)qs[byte0 + 1] << (8 - shift0); + uint8_t idx0 = (uint8_t)(raw0 & 0x07); + + // Extract 3-bit value for elem1 + int bit_off1 = elem1 * 3; + int byte1 = bit_off1 / 8; + int shift1 = bit_off1 % 8; + uint16_t raw1 = (uint16_t)qs[byte1] >> shift1; + if (shift1 > 5 && byte1 + 1 < 12) + raw1 |= (uint16_t)qs[byte1 + 1] << (8 - shift1); + uint8_t idx1 = (uint8_t)(raw1 & 0x07); + + const float norm = __half2float(x->d); + v.x = dc_codebook_3bit[idx0] * norm; + v.y = dc_codebook_3bit[idx1] * norm; +} + +static __device__ __forceinline__ void dequantize_turbo4_0( + const void * vx, const int64_t ib, const int iqs, float2 & v) +{ + const block_turbo4_0 * x = (const block_turbo4_0 *) vx + ib; + + // 4-bit: 2 values per byte, simple nibble extraction + uint8_t packed = x->qs[iqs]; + uint8_t idx0 = packed & 0x0F; + uint8_t idx1 = (packed >> 4) & 0x0F; + + const float norm = __half2float(x->d); + v.x = dc_codebook_4bit[idx0] * norm; + v.y = dc_codebook_4bit[idx1] * norm; +} diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index c59a4db39..c9eeba987 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -3,6 +3,7 @@ #include "common.cuh" #include "convert.cuh" #include "vecdotq.cuh" +#include "dequantize.cuh" #include @@ -577,6 +578,59 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict } } +// ============================================================ +// TurboQuant V-cache dequantize functions for flash attention +// ============================================================ + +template +static __device__ __forceinline__ void dequantize_V_turbo3_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) { + const block_turbo3_0 * x = (const block_turbo3_0 *) vx; + + const int64_t ib = i0 / QK_TURBO3; + const int iqs = (int)(i0 % QK_TURBO3) / 2; + + static_assert(ne % 2 == 0, "bad ne"); + T * dst_t = (T *) dst; + +#pragma unroll + for (int l = 0; l < ne/2; ++l) { + float2 v; + dequantize_turbo3_0(vx, ib, iqs + l, v); + if constexpr (std::is_same_v) { + dst_t[2*l + 0] = __float2half(v.x); + dst_t[2*l + 1] = __float2half(v.y); + } else { + dst_t[2*l + 0] = (T)v.x; + dst_t[2*l + 1] = (T)v.y; + } + } +} + +template +static __device__ __forceinline__ void dequantize_V_turbo4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) { + const block_turbo4_0 * x = (const block_turbo4_0 *) vx; + + const int64_t ib = i0 / QK_TURBO4; + const int iqs = (int)(i0 % QK_TURBO4) / 2; + + static_assert(ne % 2 == 0, "bad ne"); + T * dst_t = (T *) dst; + +#pragma unroll + for (int l = 0; l < ne/2; ++l) { + float2 v; + dequantize_turbo4_0(vx, ib, iqs + l, v); + if constexpr (std::is_same_v) { + dst_t[2*l + 0] = __float2half(v.x); + dst_t[2*l + 1] = __float2half(v.y); + } else { + dst_t[2*l + 0] = (T)v.x; + dst_t[2*l + 1] = (T)v.y; + } + } + GGML_UNUSED(x); +} + template constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() { if constexpr (type_K == GGML_TYPE_F16) { @@ -593,6 +647,12 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() { return vec_dot_fattn_vec_KQ_q8_0; } else if constexpr (type_K == GGML_TYPE_BF16) { return vec_dot_fattn_vec_KQ_bf16; + // TurboQuant K-cache: Phase 1 - dequantize to FP16 before attention + // (use FP16 dot product after conversion in the dispatch layer) + } else if constexpr (type_K == GGML_TYPE_TURBO3_0) { + return vec_dot_fattn_vec_KQ_f16; + } else if constexpr (type_K == GGML_TYPE_TURBO4_0) { + return vec_dot_fattn_vec_KQ_f16; } else { static_assert(type_K == -1, "bad type"); return nullptr; @@ -615,6 +675,10 @@ constexpr __device__ dequantize_V_t get_dequantize_V() { return dequantize_V_q8_0; } else if constexpr (type_V == GGML_TYPE_BF16) { return dequantize_V_bf16; + } else if constexpr (type_V == GGML_TYPE_TURBO3_0) { + return dequantize_V_turbo3_0; + } else if constexpr (type_V == GGML_TYPE_TURBO4_0) { + return dequantize_V_turbo4_0; } else { static_assert(type_V == -1, "bad type"); return nullptr; diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index f0bd42a57..13981deb2 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -75,17 +75,20 @@ static __global__ void flash_attn_ext_vec( #endif // GGML_USE_HIP constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device(); - constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q; - constexpr int nthreads_V = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q; + constexpr bool K_is_fp_like = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16); + constexpr bool V_is_fp_like = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16); + + constexpr int nthreads_KQ = K_is_fp_like ? 128 / cpy_nb : nthreads_KQ_q; + constexpr int nthreads_V = V_is_fp_like ? 128 / cpy_nb : nthreads_V_q; static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K"); static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V"); - constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4; + constexpr int V_rows_per_thread = V_is_fp_like ? 2*cpy_ne : 4; constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V; constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ(); - constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16; + constexpr bool Q_q8_1 = !K_is_fp_like; #ifdef V_DOT2_F32_F16_AVAILABLE constexpr dequantize_V_t dequantize_V = get_dequantize_V(); #else @@ -598,3 +601,12 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16) + +// TurboQuant extern declarations (homogeneous K/V only) +extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); +extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); +extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); + +extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0); +extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0); +extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0); diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index a25a890db..e4840d147 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -5,6 +5,7 @@ #include "fattn-vec.cuh" #include "fattn-wmma-f16.cuh" #include "fattn.cuh" +#include "convert.cuh" template static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -273,6 +274,8 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_BF16) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_BF16) FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16) + + // TurboQuant: pre-dequantized to f16 before FA, no turbo vec cases needed #else FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0) @@ -292,6 +295,10 @@ enum best_fattn_kernel { BEST_FATTN_KERNEL_MMA_F16 = 400, }; +static bool ggml_type_is_turbo(ggml_type type) { + return type == GGML_TYPE_TURBO3_0 || type == GGML_TYPE_TURBO4_0; +} + static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const ggml_tensor * dst) { #ifndef FLASH_ATTN_AVAILABLE GGML_UNUSED(device); GGML_UNUSED(dst); @@ -353,8 +360,13 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const } #ifndef GGML_CUDA_FA_ALL_QUANTS - if (K->type != V->type) { - return BEST_FATTN_KERNEL_NONE; + { + // Turbo types are pre-dequantized to f16, so treat them as f16 for type matching + const ggml_type eff_k = ggml_type_is_turbo(K->type) ? GGML_TYPE_F16 : K->type; + const ggml_type eff_v = ggml_type_is_turbo(V->type) ? GGML_TYPE_F16 : V->type; + if (eff_k != eff_v) { + return BEST_FATTN_KERNEL_NONE; + } } #endif // GGML_CUDA_FA_ALL_QUANTS @@ -372,6 +384,10 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const case GGML_TYPE_Q8_0: case GGML_TYPE_BF16: break; + case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: + // Turbo types are handled via pre-dequantize to f16 before FA + break; default: return BEST_FATTN_KERNEL_NONE; } @@ -485,8 +501,72 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const return BEST_FATTN_KERNEL_TILE; } +// Pre-dequantize a turbo tensor to f16, returning a stack-allocated tensor copy. +// The caller must keep pool_buf alive until after FA completes. +static ggml_tensor turbo_pre_dequantize( + const ggml_tensor * src, + ggml_cuda_pool_alloc & pool_buf, + cudaStream_t stream) { + const int64_t n_elements = ggml_nelements(src); + + pool_buf.alloc(n_elements); + + to_fp16_cuda_t dequant = ggml_get_to_fp16_cuda(src->type); + GGML_ASSERT(dequant != nullptr); + dequant(src->data, pool_buf.ptr, n_elements, stream); + + // Scale existing strides from turbo block layout to f16 element layout. + // This preserves any permutation (e.g. ggml_permute swapping dims 1 and 2). + // The dequantized f16 data is in the same physical order as the turbo data, + // so the stride relationships must be preserved, just rescaled. + const size_t bs = ggml_blck_size(src->type); + const size_t ts = ggml_type_size(src->type); + + ggml_tensor tmp = *src; + tmp.type = GGML_TYPE_F16; + tmp.data = pool_buf.ptr; + tmp.nb[0] = sizeof(half); + tmp.nb[1] = src->nb[1] * bs * sizeof(half) / ts; + tmp.nb[2] = src->nb[2] * bs * sizeof(half) / ts; + tmp.nb[3] = src->nb[3] * bs * sizeof(half) / ts; + tmp.view_src = nullptr; + tmp.view_offs = 0; + + return tmp; +} + void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_set_device(ctx.device); + + const ggml_tensor * K = dst->src[1]; + const ggml_tensor * V = dst->src[2]; + + const bool k_is_turbo = ggml_type_is_turbo(K->type); + const bool v_is_turbo = V && ggml_type_is_turbo(V->type); + + // Pre-dequantize turbo KV to f16 so standard FA kernels can handle them. + // Pool buffers must outlive the FA dispatch (RAII frees on scope exit). + ggml_cuda_pool_alloc k_pool(ctx.pool()); + ggml_cuda_pool_alloc v_pool(ctx.pool()); + ggml_tensor k_f16, v_f16; + + cudaStream_t stream = ctx.stream(); + + // Save original src pointers + ggml_tensor * orig_k = dst->src[1]; + ggml_tensor * orig_v = dst->src[2]; + + if (k_is_turbo) { + k_f16 = turbo_pre_dequantize(K, k_pool, stream); + dst->src[1] = &k_f16; + } + if (v_is_turbo) { + v_f16 = turbo_pre_dequantize(V, v_pool, stream); + dst->src[2] = &v_f16; + } + + + // Standard FA dispatch — now sees f16 tensors switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) { case BEST_FATTN_KERNEL_NONE: GGML_ABORT("fatal error"); @@ -503,6 +583,10 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst ggml_cuda_flash_attn_ext_mma_f16(ctx, dst); break; } + + // Restore original src pointers + dst->src[1] = orig_k; + dst->src[2] = orig_v; } bool ggml_cuda_flash_attn_ext_supported(int device, const ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d1239b1c5..054bfdaf0 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4842,7 +4842,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g { return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 || - op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) && + op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL || + op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO4_0) && op->src[0]->type == GGML_TYPE_F32 && (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32); } break; diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 631de7e8f..306660060 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -1,8 +1,16 @@ #include "set-rows.cuh" #include "cpy-utils.cuh" +#include + typedef void (*set_rows_kernel_t)(const char * src, char * dst); +// ============================================================ +// TurboQuant constants for set-rows FWHT +// ============================================================ +#define TURBO_HEAD_DIM_SR 128 +#define TURBO_BLOCKS_PER_CHUNK_SR (TURBO_HEAD_DIM_SR / 32) // 4 + // Generic quantized set_rows kernel template template static __global__ void k_set_rows_quant(const float * __restrict__ src0, @@ -109,6 +117,398 @@ static void set_rows_cuda_quant( } } +// ============================================================ +// TurboQuant specialized set-rows kernel with FWHT +// ============================================================ +// Each CUDA block processes one 128-element chunk. +// 128 threads per block, one thread per element in the chunk. +// Steps: +// 1. Each thread reads one float from the source row +// 2. Cooperative norm computation via shared memory reduction +// 3. Normalize the chunk +// 4. FWHT butterfly in shared memory (7 stages for n=128) +// 5. Each thread scalar-quantizes its element and packs into blocks + +// Device-side codebook references for turbo quantize (same as in cpy-utils.cuh) +__device__ static const float sr_codebook_3bit[8] = { + -0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f, + 0.0216041461f, 0.0665854520f, 0.1181396281f, 0.1883970748f +}; + +__device__ static const float sr_codebook_4bit[16] = { + -0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f, + -0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f, + 0.0112761586f, 0.0341139667f, 0.0577250301f, 0.0827738972f, + 0.1102295202f, 0.1417455465f, 0.1807794468f, 0.2376153882f +}; + +static __device__ uint8_t sr_nearest_codebook(float val, const float *codebook, int n) { + float best_dist = fabsf(val - codebook[0]); + uint8_t best_idx = 0; + for (int i = 1; i < n; i++) { + float dist = fabsf(val - codebook[i]); + if (dist < best_dist) { + best_dist = dist; + best_idx = (uint8_t)i; + } + } + return best_idx; +} + +// Turbo3 set-rows kernel: processes 128-element chunks with FWHT +template +static __global__ void k_set_rows_turbo3( + const float * __restrict__ src0, + const idx_t * __restrict__ src1, + block_turbo3_0 * __restrict__ dst, + const int64_t ne_total_chunks, + const int64_t ne10, + const int64_t ne11, + const int64_t ne12, + const int64_t ne13, + const int64_t s01, + const int64_t s02, + const int64_t s03, + const int64_t s10, + const int64_t s11, + const int64_t s12, + const int64_t s1, + const int64_t s2, + const int64_t s3, + const int64_t ne00, + const uint3 ne00_fd, + const uint3 ne01_fd, + const uint3 ne02_fd, + const uint3 ne11_fd, + const uint3 ne12_fd) { + + __shared__ float smem[TURBO_HEAD_DIM_SR]; + __shared__ float reduction[TURBO_HEAD_DIM_SR]; + + const int64_t chunk_global = blockIdx.x; + const int tid = threadIdx.x; // 0..127 + + if (chunk_global >= ne_total_chunks) return; + + // Map the global chunk index to i00 (element offset within a row) + row indices + // Each chunk covers 128 elements, so the chunk's base element = chunk_global * 128 + const int64_t elem_base = chunk_global * TURBO_HEAD_DIM_SR; + uint32_t tmp = (uint32_t)elem_base; + uint2 div_mod; + + div_mod = fast_div_modulo(tmp, ne00_fd); + const int64_t i00 = div_mod.y; // offset within row (multiple of 128) + tmp = div_mod.x; + + div_mod = fast_div_modulo(tmp, ne01_fd); + const int64_t i01 = div_mod.y; + tmp = div_mod.x; + + div_mod = fast_div_modulo(tmp, ne02_fd); + const int64_t i02 = div_mod.y; + const int64_t i03 = div_mod.x; + + const int64_t i12 = fastmodulo((uint32_t)i03, ne12_fd); + const int64_t i11 = fastmodulo((uint32_t)i02, ne11_fd); + const int64_t i10 = i01; + + const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + + const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; + const float val = src0_row[i00 + tid]; + smem[tid] = val; + + // Step 1: Compute L2 norm via parallel reduction + reduction[tid] = val * val; + __syncthreads(); + + for (int s = 64; s > 0; s >>= 1) { + if (tid < s) { + reduction[tid] += reduction[tid + s]; + } + __syncthreads(); + } + + float norm = sqrtf(reduction[0]); + float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f; + + // Step 2: Normalize + smem[tid] *= inv_norm; + __syncthreads(); + + // Step 3: FWHT butterfly stages (7 stages for n=128) + for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) { + if (tid < 64) { + int group = tid / h; + int pos = tid % h; + int i = group * h * 2 + pos; + float a = smem[i]; + float b = smem[i + h]; + smem[i] = a + b; + smem[i + h] = a - b; + } + __syncthreads(); + } + + // Apply 1/sqrt(128) normalization + const float fwht_scale = 0.08838834764831844f; + smem[tid] *= fwht_scale; + __syncthreads(); + + // Step 4: Scalar quantize and pack into turbo3 blocks + // Each thread quantizes its element + uint8_t my_idx = sr_nearest_codebook(smem[tid], sr_codebook_3bit, 8); + + // We need to pack 32 indices per block cooperatively + // Use shared memory to collect indices, then pack + // Reuse reduction[] as uint8 storage + ((uint8_t *)reduction)[tid] = my_idx; + __syncthreads(); + + // Compute destination block pointer + // dst layout: dst_row*s1 + i02*s2 + i03*s3 gives byte offset to row start + // Then add block offset for i00 + block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3); + const int64_t dst_block_base = i00 / TURBO3_BLOCK_SIZE; + + // Only 4 threads (one per block) do the packing + if (tid < TURBO_BLOCKS_PER_CHUNK_SR) { + const int blk = tid; + block_turbo3_0 * dst_block = dst_row_ptr + dst_block_base + blk; + const uint8_t * indices = ((const uint8_t *)reduction) + blk * 32; + + // Store norm + dst_block->d = __float2half(norm); + + // Pack 32 x 3-bit indices into 12 bytes + memset(dst_block->qs, 0, 12); + for (int j = 0; j < 32; j++) { + int bit_off = j * 3; + int byte_pos = bit_off / 8; + int shift = bit_off % 8; + dst_block->qs[byte_pos] |= (uint8_t)((indices[j] & 0x07) << shift); + if (shift > 5 && byte_pos + 1 < 12) { + dst_block->qs[byte_pos + 1] |= (uint8_t)((indices[j] & 0x07) >> (8 - shift)); + } + } + } + + GGML_UNUSED(ne10); + GGML_UNUSED(ne11); + GGML_UNUSED(ne12); + GGML_UNUSED(ne13); +} + +// Turbo4 set-rows kernel: processes 128-element chunks with FWHT +template +static __global__ void k_set_rows_turbo4( + const float * __restrict__ src0, + const idx_t * __restrict__ src1, + block_turbo4_0 * __restrict__ dst, + const int64_t ne_total_chunks, + const int64_t ne10, + const int64_t ne11, + const int64_t ne12, + const int64_t ne13, + const int64_t s01, + const int64_t s02, + const int64_t s03, + const int64_t s10, + const int64_t s11, + const int64_t s12, + const int64_t s1, + const int64_t s2, + const int64_t s3, + const int64_t ne00, + const uint3 ne00_fd, + const uint3 ne01_fd, + const uint3 ne02_fd, + const uint3 ne11_fd, + const uint3 ne12_fd) { + + __shared__ float smem[TURBO_HEAD_DIM_SR]; + __shared__ float reduction[TURBO_HEAD_DIM_SR]; + + const int64_t chunk_global = blockIdx.x; + const int tid = threadIdx.x; // 0..127 + + if (chunk_global >= ne_total_chunks) return; + + // Map the global chunk index to i00 (element offset within a row) + row indices + const int64_t elem_base = chunk_global * TURBO_HEAD_DIM_SR; + uint32_t tmp = (uint32_t)elem_base; + uint2 div_mod; + + div_mod = fast_div_modulo(tmp, ne00_fd); + const int64_t i00 = div_mod.y; + tmp = div_mod.x; + + div_mod = fast_div_modulo(tmp, ne01_fd); + const int64_t i01 = div_mod.y; + tmp = div_mod.x; + + div_mod = fast_div_modulo(tmp, ne02_fd); + const int64_t i02 = div_mod.y; + const int64_t i03 = div_mod.x; + + const int64_t i12 = fastmodulo((uint32_t)i03, ne12_fd); + const int64_t i11 = fastmodulo((uint32_t)i02, ne11_fd); + const int64_t i10 = i01; + + const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + + const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; + const float val = src0_row[i00 + tid]; + smem[tid] = val; + + // Step 1: Compute L2 norm via parallel reduction + reduction[tid] = val * val; + __syncthreads(); + + for (int s = 64; s > 0; s >>= 1) { + if (tid < s) { + reduction[tid] += reduction[tid + s]; + } + __syncthreads(); + } + + float norm = sqrtf(reduction[0]); + float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f; + + // Step 2: Normalize + smem[tid] *= inv_norm; + __syncthreads(); + + // Step 3: FWHT butterfly stages (7 stages for n=128) + for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) { + if (tid < 64) { + int group = tid / h; + int pos = tid % h; + int i = group * h * 2 + pos; + float a = smem[i]; + float b = smem[i + h]; + smem[i] = a + b; + smem[i + h] = a - b; + } + __syncthreads(); + } + + // Apply 1/sqrt(128) normalization + const float fwht_scale = 0.08838834764831844f; + smem[tid] *= fwht_scale; + __syncthreads(); + + // Step 4: Scalar quantize and pack into turbo4 blocks + uint8_t my_idx = sr_nearest_codebook(smem[tid], sr_codebook_4bit, 16); + + // Collect indices in shared memory + ((uint8_t *)reduction)[tid] = my_idx; + __syncthreads(); + + // Compute destination block pointer + block_turbo4_0 * dst_row_ptr = (block_turbo4_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3); + const int64_t dst_block_base = i00 / TURBO4_BLOCK_SIZE; + + // Only 4 threads (one per block) do the packing + if (tid < TURBO_BLOCKS_PER_CHUNK_SR) { + const int blk = tid; + block_turbo4_0 * dst_block = dst_row_ptr + dst_block_base + blk; + const uint8_t * indices = ((const uint8_t *)reduction) + blk * 32; + + // Store norm + dst_block->d = __float2half(norm); + + // Pack 32 x 4-bit indices into 16 bytes + for (int j = 0; j < TURBO4_BLOCK_SIZE / 2; j++) { + dst_block->qs[j] = (indices[2*j] & 0x0F) | ((indices[2*j + 1] & 0x0F) << 4); + } + } + + GGML_UNUSED(ne10); + GGML_UNUSED(ne11); + GGML_UNUSED(ne12); + GGML_UNUSED(ne13); +} + +// Dispatch functions for turbo set-rows +template +static void set_rows_cuda_turbo3( + const float * src0_d, const idx_t * src1_d, block_turbo3_0 * dst_d, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { + + GGML_ASSERT(ne00 % TURBO_HEAD_DIM_SR == 0); + const int64_t ne_total_chunks = (ne00 * ne01 * ne02 * ne03) / TURBO_HEAD_DIM_SR; + const dim3 grid_size((int)ne_total_chunks); + const dim3 block_size(TURBO_HEAD_DIM_SR); + + const int64_t s01 = nb01/sizeof(float); + const int64_t s02 = nb02/sizeof(float); + const int64_t s03 = nb03/sizeof(float); + const int64_t s10 = nb10/sizeof(idx_t); + const int64_t s11 = nb11/sizeof(idx_t); + const int64_t s12 = nb12/sizeof(idx_t); + const int64_t s1 = nb1; + const int64_t s2 = nb2; + const int64_t s3 = nb3; + + if (ne_total_chunks > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) { + const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00); + const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01); + const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02); + const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11); + const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12); + + k_set_rows_turbo3<<>>( + src0_d, src1_d, dst_d, ne_total_chunks, ne10, ne11, ne12, ne13, + s01, s02, s03, s10, s11, s12, s1, s2, s3, + ne00, ne00_fd, ne01_fd, ne02_fd, ne11_fd, ne12_fd); + } +} + +template +static void set_rows_cuda_turbo4( + const float * src0_d, const idx_t * src1_d, block_turbo4_0 * dst_d, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { + + GGML_ASSERT(ne00 % TURBO_HEAD_DIM_SR == 0); + const int64_t ne_total_chunks = (ne00 * ne01 * ne02 * ne03) / TURBO_HEAD_DIM_SR; + const dim3 grid_size((int)ne_total_chunks); + const dim3 block_size(TURBO_HEAD_DIM_SR); + + const int64_t s01 = nb01/sizeof(float); + const int64_t s02 = nb02/sizeof(float); + const int64_t s03 = nb03/sizeof(float); + const int64_t s10 = nb10/sizeof(idx_t); + const int64_t s11 = nb11/sizeof(idx_t); + const int64_t s12 = nb12/sizeof(idx_t); + const int64_t s1 = nb1; + const int64_t s2 = nb2; + const int64_t s3 = nb3; + + if (ne_total_chunks > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) { + const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00); + const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01); + const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02); + const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11); + const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12); + + k_set_rows_turbo4<<>>( + src0_d, src1_d, dst_d, ne_total_chunks, ne10, ne11, ne12, ne13, + s01, s02, s03, s10, s11, s12, s1, s2, s3, + ne00, ne00_fd, ne01_fd, ne02_fd, ne11_fd, ne12_fd); + } +} + template static __global__ void k_set_rows(const src_t * __restrict__ src0, const idx_t * __restrict__ src1, @@ -309,6 +709,28 @@ static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * s nb1, nb2, nb3, stream ); + } else if (dst->type == GGML_TYPE_TURBO3_0) { + // FWHT-aware 128-thread kernels for correct TurboQuant encoding + set_rows_cuda_turbo3( + src0_d, src1_d, (block_turbo3_0*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); + } else if (dst->type == GGML_TYPE_TURBO4_0) { + // FWHT-aware 128-thread kernels for correct TurboQuant encoding + set_rows_cuda_turbo4( + src0_d, src1_d, (block_turbo4_0*)dst->data, + ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, + nb01, nb02, nb03, + nb10, nb11, nb12, + nb1, nb2, nb3, + stream + ); } else { GGML_ABORT("unsupported type %s", ggml_type_name(dst->type)); } diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu new file mode 100644 index 000000000..f8b44cee8 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu @@ -0,0 +1,7 @@ +// TurboQuant flash attention template instances + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu new file mode 100644 index 000000000..061000d03 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu @@ -0,0 +1,7 @@ +// TurboQuant flash attention template instances + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0); diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index 291b48374..9a4a4252e 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -75,7 +75,9 @@ else() ../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu ../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu - ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu) + ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu + ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu + ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu) endif() ggml_add_backend_library(ggml-hip diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 48695a61e..52d3c2dd4 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -494,6 +494,342 @@ void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_REST } } +// ============================================================ +// TurboQuant codebook data (Lloyd-Max optimal for d=128 after WHT) +// ============================================================ + +static const float turbo_codebook_3bit[8] = { + -0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f, + 0.0216041461f, 0.0665854520f, 0.1181396281f, 0.1883970748f +}; + +static const float turbo_codebook_4bit[16] = { + -0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f, + -0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f, + 0.0112761586f, 0.0341139667f, 0.0577250301f, 0.0827738972f, + 0.1102295202f, 0.1417455465f, 0.1807794468f, 0.2376153882f +}; + +// ============================================================ +// TurboQuant helper: pack/unpack bit-packed indices +// ============================================================ + +static void turbo_pack3(const uint8_t *indices, uint8_t *out) { + // Pack 32 x 3-bit values into 12 bytes (96 bits) + memset(out, 0, 12); + for (int i = 0; i < 32; i++) { + int bit_off = i * 3; + int byte_idx = bit_off / 8; + int shift = bit_off % 8; + out[byte_idx] |= (uint8_t)((indices[i] & 0x07) << shift); + if (shift > 5 && byte_idx + 1 < 12) { + out[byte_idx + 1] |= (uint8_t)((indices[i] & 0x07) >> (8 - shift)); + } + } +} + +static void turbo_unpack3(const uint8_t *packed, uint8_t *indices) { + // Unpack 12 bytes into 32 x 3-bit values + for (int i = 0; i < 32; i++) { + int bit_off = i * 3; + int byte_idx = bit_off / 8; + int shift = bit_off % 8; + uint16_t raw = (uint16_t)packed[byte_idx] >> shift; + if (shift > 5 && byte_idx + 1 < 12) { + raw |= (uint16_t)packed[byte_idx + 1] << (8 - shift); + } + indices[i] = (uint8_t)(raw & 0x07); + } +} + +static void turbo_pack4(const uint8_t *indices, uint8_t *out) { + // Pack 32 x 4-bit values into 16 bytes + for (int i = 0; i < 16; i++) { + out[i] = (indices[2*i] & 0x0F) | ((indices[2*i + 1] & 0x0F) << 4); + } +} + +static void turbo_unpack4(const uint8_t *packed, uint8_t *indices) { + // Unpack 16 bytes into 32 x 4-bit values + for (int i = 0; i < 16; i++) { + indices[2*i] = packed[i] & 0x0F; + indices[2*i + 1] = (packed[i] >> 4) & 0x0F; + } +} + +static uint8_t turbo_quantize_scalar(float val, const float *codebook, int n_codes) { + // Find nearest codebook entry (linear scan - codebook is sorted) + float best_dist = fabsf(val - codebook[0]); + uint8_t best_idx = 0; + for (int i = 1; i < n_codes; i++) { + float dist = fabsf(val - codebook[i]); + if (dist < best_dist) { + best_dist = dist; + best_idx = (uint8_t)i; + } + } + return best_idx; +} + +// ============================================================ +// TurboQuant FWHT (Fast Walsh-Hadamard Transform) +// Self-inverse with 1/sqrt(n) normalization. +// ============================================================ + +#define TURBO_HEAD_DIM 128 +#define TURBO_BLOCKS_PER_CHUNK (TURBO_HEAD_DIM / 32) // 4 blocks of 32 = 128 + +static void turbo_fwht_f32(float *x, int n) { + // Butterfly sums + for (int h = 1; h < n; h *= 2) { + for (int i = 0; i < n; i += h * 2) { + for (int j = i; j < i + h; j++) { + float a = x[j]; + float b = x[j + h]; + x[j] = a + b; + x[j + h] = a - b; + } + } + } + // Normalize by 1/sqrt(n) + float scale = 1.0f / sqrtf((float)n); + for (int i = 0; i < n; i++) { + x[i] *= scale; + } +} + +// ============================================================ +// TurboQuant TURBO3_0 (3-bit, 3.5 bpw) +// ============================================================ + +void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT src, block_turbo3_0 * GGML_RESTRICT dst, int64_t k) { + assert(k % TURBO3_BLOCK_SIZE == 0); + assert(k % TURBO_HEAD_DIM == 0); + + float tmp[TURBO_HEAD_DIM]; + int64_t blocks_done = 0; + + for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) { + // Step 1: Compute L2 norm of this head_dim chunk + float sum_sq = 0.0f; + for (int i = 0; i < TURBO_HEAD_DIM; i++) { + sum_sq += src[offset + i] * src[offset + i]; + } + float norm = sqrtf(sum_sq); + + // Step 2: Normalize the chunk + float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f; + for (int i = 0; i < TURBO_HEAD_DIM; i++) { + tmp[i] = src[offset + i] * inv_norm; + } + + // Step 3: Apply FWHT rotation + turbo_fwht_f32(tmp, TURBO_HEAD_DIM); + + // Step 4: Scalar quantize + pack, one block at a time + for (int blk = 0; blk < TURBO_BLOCKS_PER_CHUNK; blk++) { + uint8_t indices[32]; + for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) { + indices[i] = turbo_quantize_scalar(tmp[blk * TURBO3_BLOCK_SIZE + i], turbo_codebook_3bit, 8); + } + // Store same norm in every block of this chunk + dst[blocks_done].d = GGML_FP32_TO_FP16(norm); + turbo_pack3(indices, dst[blocks_done].qs); + blocks_done++; + } + } +} + +static void quantize_row_turbo3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t k) { + quantize_row_turbo3_0_ref(src, (block_turbo3_0 *)dst, k); +} + +void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % TURBO3_BLOCK_SIZE == 0); + const int64_t num_blocks = k / TURBO3_BLOCK_SIZE; + + // Pass 1: Unpack all blocks and look up centroids + for (int64_t b = 0; b < num_blocks; b++) { + uint8_t indices[32]; + turbo_unpack3(x[b].qs, indices); + for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) { + y[b * TURBO3_BLOCK_SIZE + i] = turbo_codebook_3bit[indices[i]]; + } + } + + // Pass 2: Inverse FWHT per head_dim chunk, then scale by norm + for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) { + int chunk = TURBO_HEAD_DIM; + if (offset + chunk > k) chunk = (int)(k - offset); + + // Inverse FWHT (self-inverse with 1/sqrt(n) normalization) + turbo_fwht_f32(y + offset, chunk); + + // Read norm from the first block of this chunk + float norm = GGML_FP16_TO_FP32(x[offset / TURBO3_BLOCK_SIZE].d); + for (int i = 0; i < chunk; i++) { + y[offset + i] *= norm; + } + } +} + +// ============================================================ +// TurboQuant TURBO4_0 (4-bit, 4.5 bpw) +// ============================================================ + +void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT src, block_turbo4_0 * GGML_RESTRICT dst, int64_t k) { + assert(k % TURBO4_BLOCK_SIZE == 0); + assert(k % TURBO_HEAD_DIM == 0); + + float tmp[TURBO_HEAD_DIM]; + int64_t blocks_done = 0; + + for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) { + // Step 1: Compute L2 norm of this head_dim chunk + float sum_sq = 0.0f; + for (int i = 0; i < TURBO_HEAD_DIM; i++) { + sum_sq += src[offset + i] * src[offset + i]; + } + float norm = sqrtf(sum_sq); + + // Step 2: Normalize the chunk + float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f; + for (int i = 0; i < TURBO_HEAD_DIM; i++) { + tmp[i] = src[offset + i] * inv_norm; + } + + // Step 3: Apply FWHT rotation + turbo_fwht_f32(tmp, TURBO_HEAD_DIM); + + // Step 4: Scalar quantize + pack, one block at a time + for (int blk = 0; blk < TURBO_BLOCKS_PER_CHUNK; blk++) { + uint8_t indices[32]; + for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) { + indices[i] = turbo_quantize_scalar(tmp[blk * TURBO4_BLOCK_SIZE + i], turbo_codebook_4bit, 16); + } + // Store same norm in every block of this chunk + dst[blocks_done].d = GGML_FP32_TO_FP16(norm); + turbo_pack4(indices, dst[blocks_done].qs); + blocks_done++; + } + } +} + +static void quantize_row_turbo4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t k) { + quantize_row_turbo4_0_ref(src, (block_turbo4_0 *)dst, k); +} + +void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % TURBO4_BLOCK_SIZE == 0); + const int64_t num_blocks = k / TURBO4_BLOCK_SIZE; + + // Pass 1: Unpack all blocks and look up centroids + for (int64_t b = 0; b < num_blocks; b++) { + uint8_t indices[32]; + turbo_unpack4(x[b].qs, indices); + for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) { + y[b * TURBO4_BLOCK_SIZE + i] = turbo_codebook_4bit[indices[i]]; + } + } + + // Pass 2: Inverse FWHT per head_dim chunk, then scale by norm + for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) { + int chunk = TURBO_HEAD_DIM; + if (offset + chunk > k) chunk = (int)(k - offset); + + // Inverse FWHT (self-inverse with 1/sqrt(n) normalization) + turbo_fwht_f32(y + offset, chunk); + + // Read norm from the first block of this chunk + float norm = GGML_FP16_TO_FP32(x[offset / TURBO4_BLOCK_SIZE].d); + for (int i = 0; i < chunk; i++) { + y[offset + i] *= norm; + } + } +} + +// ============================================================ +// TurboQuant vec_dot (for flash attention compatibility) +// ============================================================ + +void ggml_vec_dot_turbo3_0(int n, float * GGML_RESTRICT s, size_t bs, + const void * GGML_RESTRICT vx, size_t bx, + const void * GGML_RESTRICT vy, size_t by, int nrc) { + const block_turbo3_0 *x = (const block_turbo3_0 *)vx; + const float *y = (const float *)vy; + + GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); + + // Dequantize x into temp buffer (includes inverse FWHT + norm scaling), + // then compute dot product with y. + float tmp[TURBO_HEAD_DIM]; + float sum = 0.0f; + + for (int64_t offset = 0; offset < n; offset += TURBO_HEAD_DIM) { + int chunk = TURBO_HEAD_DIM; + if (offset + chunk > n) chunk = (int)(n - offset); + int64_t base_block = offset / TURBO3_BLOCK_SIZE; + + // Unpack + centroid lookup for this chunk + for (int blk = 0; blk < chunk / TURBO3_BLOCK_SIZE; blk++) { + uint8_t indices[32]; + turbo_unpack3(x[base_block + blk].qs, indices); + for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) { + tmp[blk * TURBO3_BLOCK_SIZE + i] = turbo_codebook_3bit[indices[i]]; + } + } + + // Inverse FWHT + turbo_fwht_f32(tmp, chunk); + + // Scale by norm and accumulate dot product + float norm = GGML_FP16_TO_FP32(x[base_block].d); + for (int i = 0; i < chunk; i++) { + sum += tmp[i] * norm * y[offset + i]; + } + } + *s = sum; +} + +void ggml_vec_dot_turbo4_0(int n, float * GGML_RESTRICT s, size_t bs, + const void * GGML_RESTRICT vx, size_t bx, + const void * GGML_RESTRICT vy, size_t by, int nrc) { + const block_turbo4_0 *x = (const block_turbo4_0 *)vx; + const float *y = (const float *)vy; + + GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); + + // Dequantize x into temp buffer (includes inverse FWHT + norm scaling), + // then compute dot product with y. + float tmp[TURBO_HEAD_DIM]; + float sum = 0.0f; + + for (int64_t offset = 0; offset < n; offset += TURBO_HEAD_DIM) { + int chunk = TURBO_HEAD_DIM; + if (offset + chunk > n) chunk = (int)(n - offset); + int64_t base_block = offset / TURBO4_BLOCK_SIZE; + + // Unpack + centroid lookup for this chunk + for (int blk = 0; blk < chunk / TURBO4_BLOCK_SIZE; blk++) { + uint8_t indices[32]; + turbo_unpack4(x[base_block + blk].qs, indices); + for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) { + tmp[blk * TURBO4_BLOCK_SIZE + i] = turbo_codebook_4bit[indices[i]]; + } + } + + // Inverse FWHT + turbo_fwht_f32(tmp, chunk); + + // Scale by norm and accumulate dot product + float norm = GGML_FP16_TO_FP32(x[base_block].d); + for (int i = 0; i < chunk; i++) { + sum += tmp[i] * norm * y[offset + i]; + } + } + *s = sum; +} + // // 2-6 bit quantization in super-blocks // @@ -5353,6 +5689,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb); } break; + case GGML_TYPE_TURBO3_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_turbo3_0, data, nb); + } break; + case GGML_TYPE_TURBO4_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_turbo4_0, data, nb); + } break; case GGML_TYPE_IQ1_S: { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 00604f75c..7a796321c 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -24,6 +24,9 @@ GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT x, block_turbo3_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * GGML_RESTRICT y, int64_t k); + GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k); @@ -51,6 +54,9 @@ GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GG GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e9b6720c0..6fcd3e261 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -904,6 +904,22 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .type_size = 0, .is_quantized = false, }, + [GGML_TYPE_TURBO3_0] = { + .type_name = "turbo3", + .blck_size = TURBO3_BLOCK_SIZE, + .type_size = sizeof(block_turbo3_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_turbo3_0, + .from_float_ref = (ggml_from_float_t) quantize_row_turbo3_0_ref, + }, + [GGML_TYPE_TURBO4_0] = { + .type_name = "turbo4", + .blck_size = TURBO4_BLOCK_SIZE, + .type_size = sizeof(block_turbo4_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_turbo4_0, + .from_float_ref = (ggml_from_float_t) quantize_row_turbo4_0_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 5f57ba9e1..33493cb70 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -132,6 +132,22 @@ llama_kv_cache::llama_kv_cache( throw std::runtime_error("failed to create ggml context for kv cache"); } + // TurboQuant requires head_dim=128 for the FWHT transform + if (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0) { + const uint32_t n_embd_head_k = hparams.n_embd_head_k(il); + if (n_embd_head_k != 128) { + LLAMA_LOG_ERROR("%s: TurboQuant requires head_dim=128, got %d (layer %d)\n", __func__, n_embd_head_k, il); + throw std::runtime_error("turbo types require head_dim=128"); + } + } + if (type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0) { + const uint32_t n_embd_head_v = hparams.n_embd_head_v(il); + if (n_embd_head_v != 128) { + LLAMA_LOG_ERROR("%s: TurboQuant requires head_dim=128, got %d (layer %d)\n", __func__, n_embd_head_v, il); + throw std::runtime_error("turbo types require head_dim=128"); + } + } + const bool has_k = true; const bool has_v = !is_mla; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9582164b5..2f01e17ad 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -254,6 +254,10 @@ if (NOT GGML_BACKEND_DL) llama_build_and_test(test-quantize-fns.cpp) llama_build_and_test(test-quantize-perf.cpp) llama_build_and_test(test-rope.cpp) + + # TurboQuant CPU reference tests (FWHT, MSE, bitpack) + llama_build(test-turboquant.cpp) + llama_test(test-turboquant) endif() # libmtmd diff --git a/tests/test-turboquant.cpp b/tests/test-turboquant.cpp new file mode 100644 index 000000000..96320b032 --- /dev/null +++ b/tests/test-turboquant.cpp @@ -0,0 +1,217 @@ +// TurboQuant CPU reference tests: FWHT self-inverse, roundtrip MSE, bit-packing +// +// Validates that the quantize/dequantize pipeline in ggml-quants.c produces +// MSE*d values consistent with the paper (Zandieh et al., ICLR 2026). + +#include "ggml.h" + +#undef NDEBUG +#include +#include +#include +#include +#include + +static constexpr int HEAD_DIM = 128; +static constexpr int BLOCK_SIZE = 32; +static constexpr int BLOCKS_PER_CHUNK = HEAD_DIM / BLOCK_SIZE; +static constexpr int N_VECTORS = 10000; +static constexpr int N_ELEMENTS = N_VECTORS * HEAD_DIM; + +// Expected MSE*d ranges (paper: TQ3 ~0.034, TQ4 ~0.009 for d=128) +static constexpr float TQ3_MSE_D_MIN = 0.025f; +static constexpr float TQ3_MSE_D_MAX = 0.045f; +static constexpr float TQ4_MSE_D_MIN = 0.005f; +static constexpr float TQ4_MSE_D_MAX = 0.015f; + +// ============================================================ +// FWHT reference (must match ggml-quants.c) +// ============================================================ + +static void fwht_f32(float * x, int n) { + for (int h = 1; h < n; h *= 2) { + for (int i = 0; i < n; i += h * 2) { + for (int j = i; j < i + h; j++) { + float a = x[j]; + float b = x[j + h]; + x[j] = a + b; + x[j + h] = a - b; + } + } + } + float scale = 1.0f / sqrtf((float)n); + for (int i = 0; i < n; i++) { + x[i] *= scale; + } +} + +// ============================================================ +// Test 1: FWHT self-inverse (FWHT(FWHT(x)) == x) +// ============================================================ + +static int test_fwht_self_inverse(void) { + printf(" FWHT self-inverse (d=%d)... ", HEAD_DIM); + + float orig[HEAD_DIM]; + float work[HEAD_DIM]; + + for (int i = 0; i < HEAD_DIM; i++) { + orig[i] = sinf((float)(i + 1) * 0.7f) * 2.0f; + } + memcpy(work, orig, sizeof(orig)); + + fwht_f32(work, HEAD_DIM); + fwht_f32(work, HEAD_DIM); + + float max_err = 0.0f; + for (int i = 0; i < HEAD_DIM; i++) { + float err = fabsf(work[i] - orig[i]); + if (err > max_err) { max_err = err; } + } + + bool pass = max_err < 1e-5f; + printf("max_err=%.2e %s\n", max_err, pass ? "ok" : "FAILED"); + return pass ? 0 : 1; +} + +// ============================================================ +// Test 2 & 3: Roundtrip MSE for TQ3 and TQ4 +// ============================================================ + +static void generate_random_vectors(float * dst, int n_elements, unsigned int seed) { + unsigned int state = seed; + for (int i = 0; i < n_elements; i++) { + state = state * 1664525u + 1013904223u; + dst[i] = ((float)(state >> 8) / (float)(1 << 24)) * 2.0f - 1.0f; + } +} + +static int test_roundtrip_mse(ggml_type type, float mse_d_min, float mse_d_max) { + const char * name = ggml_type_name(type); + printf(" %s roundtrip MSE*d (n=%d, d=%d)... ", name, N_VECTORS, HEAD_DIM); + + const ggml_type_traits * traits = ggml_get_type_traits(type); + assert(traits->from_float_ref != nullptr); + assert(traits->to_float != nullptr); + + std::vector src(N_ELEMENTS); + std::vector dst(N_ELEMENTS); + size_t quant_size = (size_t)N_ELEMENTS / BLOCK_SIZE * ggml_type_size(type); + std::vector quant(quant_size); + + generate_random_vectors(src.data(), N_ELEMENTS, 42); + + traits->from_float_ref(src.data(), quant.data(), N_ELEMENTS); + traits->to_float(quant.data(), dst.data(), N_ELEMENTS); + + // MSE*d = E[ ||x - x̃||² / ||x||² ] (normalized reconstruction error) + double total_nmse = 0.0; + for (int v = 0; v < N_VECTORS; v++) { + double err_sq = 0.0; + double norm_sq = 0.0; + for (int i = 0; i < HEAD_DIM; i++) { + double diff = (double)src[v * HEAD_DIM + i] - (double)dst[v * HEAD_DIM + i]; + err_sq += diff * diff; + norm_sq += (double)src[v * HEAD_DIM + i] * (double)src[v * HEAD_DIM + i]; + } + if (norm_sq > 1e-20) { + total_nmse += err_sq / norm_sq; + } + } + float mse_d = (float)(total_nmse / N_VECTORS); + + bool pass = mse_d >= mse_d_min && mse_d <= mse_d_max; + printf("MSE*d=%.4f [%.3f..%.3f] %s\n", mse_d, mse_d_min, mse_d_max, pass ? "ok" : "FAILED"); + return pass ? 0 : 1; +} + +// ============================================================ +// Test 4: Bit-pack determinism and sanity +// ============================================================ + +static int test_bitpack_deterministic(ggml_type type) { + const char * name = ggml_type_name(type); + printf(" %s pack determinism... ", name); + + const ggml_type_traits * traits = ggml_get_type_traits(type); + + float src[HEAD_DIM]; + for (int i = 0; i < HEAD_DIM; i++) { + src[i] = cosf((float)i * 0.31415f); + } + + size_t qsize = BLOCKS_PER_CHUNK * ggml_type_size(type); + std::vector q1(qsize); + std::vector q2(qsize); + + traits->from_float_ref(src, q1.data(), HEAD_DIM); + traits->from_float_ref(src, q2.data(), HEAD_DIM); + + bool pass = memcmp(q1.data(), q2.data(), qsize) == 0; + printf("%s\n", pass ? "ok" : "FAILED (non-deterministic)"); + return pass ? 0 : 1; +} + +static int test_bitpack_sanity(ggml_type type) { + const char * name = ggml_type_name(type); + printf(" %s dequantize sanity... ", name); + + const ggml_type_traits * traits = ggml_get_type_traits(type); + + float src[HEAD_DIM]; + float dst[HEAD_DIM]; + for (int i = 0; i < HEAD_DIM; i++) { + src[i] = cosf((float)i * 0.31415f); + } + + size_t qsize = BLOCKS_PER_CHUNK * ggml_type_size(type); + std::vector q(qsize); + + traits->from_float_ref(src, q.data(), HEAD_DIM); + traits->to_float(q.data(), dst, HEAD_DIM); + + bool all_finite = true; + bool any_nonzero = false; + for (int i = 0; i < HEAD_DIM; i++) { + if (!std::isfinite(dst[i])) { all_finite = false; } + if (fabsf(dst[i]) > 1e-10f) { any_nonzero = true; } + } + + bool pass = all_finite && any_nonzero; + printf("finite=%s nonzero=%s %s\n", + all_finite ? "yes" : "NO", + any_nonzero ? "yes" : "NO", + pass ? "ok" : "FAILED"); + return pass ? 0 : 1; +} + +// ============================================================ +// Main +// ============================================================ + +int main(void) { + printf("TurboQuant CPU reference tests\n"); + printf("==============================\n\n"); + + int n_fail = 0; + + printf("Test 1: FWHT self-inverse\n"); + n_fail += test_fwht_self_inverse(); + + printf("\nTest 2: TQ3 roundtrip MSE\n"); + n_fail += test_roundtrip_mse(GGML_TYPE_TURBO3_0, TQ3_MSE_D_MIN, TQ3_MSE_D_MAX); + + printf("\nTest 3: TQ4 roundtrip MSE\n"); + n_fail += test_roundtrip_mse(GGML_TYPE_TURBO4_0, TQ4_MSE_D_MIN, TQ4_MSE_D_MAX); + + printf("\nTest 4: Bit-pack tests\n"); + n_fail += test_bitpack_deterministic(GGML_TYPE_TURBO3_0); + n_fail += test_bitpack_deterministic(GGML_TYPE_TURBO4_0); + n_fail += test_bitpack_sanity(GGML_TYPE_TURBO3_0); + n_fail += test_bitpack_sanity(GGML_TYPE_TURBO4_0); + + printf("\n==============================\n"); + printf("%d/%d tests passed\n", 7 - n_fail, 7); + + return n_fail > 0 ? 1 : 0; +} diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 0a23f6985..93e4f8730 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -483,6 +483,12 @@ static ggml_type ggml_type_from_name(const std::string & s) { if (s == "iq4_nl") { return GGML_TYPE_IQ4_NL; } + if (s == "turbo3") { + return GGML_TYPE_TURBO3_0; + } + if (s == "turbo4") { + return GGML_TYPE_TURBO4_0; + } return GGML_TYPE_COUNT; }