feat: TurboQuant KV-cache quantization for AMD ROCm (turbo3/turbo4)

Implements TurboQuant (Zandieh et al., ICLR 2026) KV-cache vector quantization targeting AMD RDNA 4 (gfx1201, RX 9070 XT).

Algorithm: L2-normalize → FWHT(128) → Lloyd-Max scalar quantize → bitpack
Decode: unpack → codebook lookup → inverse FWHT → denormalize

Two new GGML types:
- GGML_TYPE_TURBO3_0: 3-bit, 3.5 bpw, MSE*d=0.034 (block_size=32, 14 bytes)
- GGML_TYPE_TURBO4_0: 4-bit, 4.5 bpw, MSE*d=0.009 (block_size=32, 18 bytes)

Architecture (pre-dequantize strategy):
- Write path: FWHT-aware set-rows kernels (128 threads, shared-mem FWHT)
- Read path: bulk dequantize turbo→f16 before standard Flash Attention
- Stride scaling preserves ggml_permute dim swaps (critical fix)

Performance (Qwen3-14B Q4_K_M, RX 9070 XT, 16 GB VRAM):
- f16/f16: 1865 pp512, 54 tg128 (baseline)
- q8_0/q8_0: 1694 pp512, 52 tg128
- turbo4/turbo4: 1813 pp512, 49 tg128 (-3% pp, -9% tg, 72% less KV VRAM)
- turbo3/turbo3: 1983 pp512, 49 tg128 (+6% pp, -9% tg, 78% less KV VRAM)

Usage: llama-cli -fa 1 --cache-type-k turbo4 --cache-type-v turbo4

Includes 7 CPU reference tests validating FWHT self-inverse, MSE against paper values, bitpack determinism, and dequantize sanity.

Requires head_dim=128 (covers most current models including Llama, Qwen, Mistral, Gemma). Guard added to KV cache init with clear error message.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-29 19:24:47 +02:00
parent 7c203670f8
commit bd571adc99
22 changed files with 1549 additions and 10 deletions
@@ -387,6 +387,8 @@ const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
+    GGML_TYPE_TURBO3_0,
+    GGML_TYPE_TURBO4_0,
 };

 static ggml_type kv_cache_type_from_str(const std::string & s) {
@@ -428,7 +428,9 @@ extern "C" {
        // GGML_TYPE_IQ4_NL_8_8 = 38,
        GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
        GGML_TYPE_NVFP4   = 40, // NVFP4 (4 blocks, E4M3 scale)
-        GGML_TYPE_COUNT   = 41,
+        GGML_TYPE_TURBO3_0 = 41, // TurboQuant 3-bit KV-cache (3.5 bpw)
+        GGML_TYPE_TURBO4_0 = 42, // TurboQuant 4-bit KV-cache (4.5 bpw)
+        GGML_TYPE_COUNT   = 43,
    };

    // precision
@@ -205,6 +205,26 @@ typedef struct {
 } block_nvfp4;
 static_assert(sizeof(block_nvfp4) == sizeof(uint8_t)*(QK_NVFP4/QK_NVFP4_SUB) + QK_NVFP4/2, "wrong nvfp4 block size/padding");

+// TurboQuant 3-bit KV-cache quantization (3.5 bpw)
+// Block layout: one FP16 norm followed by QK_TURBO3 codebook indices of 3 bits each.
+// The packed-array size and the size check are derived from QK_TURBO3 (as done for the
+// other block types in this header) instead of repeating the literals 12 and 14.
+// NOTE(review): QR_TURBO3 is forwarded as the `qr` template argument of the generic
+// dequantize_block_cuda converters; confirm the output placement implied by qr == 2 matches
+// the consecutive-element packing used by the turbo3 dequantize functions.
+#define QK_TURBO3 32
+#define QR_TURBO3 2
+#define TURBO3_BLOCK_SIZE QK_TURBO3
+typedef struct {
+    ggml_half d;                 // FP16 L2-norm
+    uint8_t qs[QK_TURBO3*3/8];   // 32 x 3-bit packed indices (12 bytes)
+} block_turbo3_0;
+static_assert(sizeof(block_turbo3_0) == sizeof(ggml_half) + QK_TURBO3*3/8, "wrong turbo3 block size");
+
+// TurboQuant 4-bit KV-cache quantization (4.5 bpw)
+// Block layout: one FP16 norm followed by QK_TURBO4 codebook indices of 4 bits each
+// (two indices per byte). The packed-array size and the size check are derived from
+// QK_TURBO4 instead of repeating the literals 16 and 18.
+// NOTE(review): QR_TURBO4 is forwarded as the `qr` template argument of the generic
+// dequantize_block_cuda converters; confirm the output placement implied by qr == 2 matches
+// the consecutive-element nibble packing used by the turbo4 dequantize functions.
+#define QK_TURBO4 32
+#define QR_TURBO4 2
+#define TURBO4_BLOCK_SIZE QK_TURBO4
+typedef struct {
+    ggml_half d;               // FP16 L2-norm
+    uint8_t qs[QK_TURBO4/2];   // 32 x 4-bit packed indices (16 bytes)
+} block_turbo4_0;
+static_assert(sizeof(block_turbo4_0) == sizeof(ggml_half) + QK_TURBO4/2, "wrong turbo4 block size");
+
 #define QK5_0 32
 typedef struct {
    ggml_half d;           // delta
@@ -120,7 +120,9 @@ if (CUDAToolkit_FOUND)
            template-instances/fattn-vec-instance-f16-f16.cu
            template-instances/fattn-vec-instance-q4_0-q4_0.cu
            template-instances/fattn-vec-instance-q8_0-q8_0.cu
-            template-instances/fattn-vec-instance-bf16-bf16.cu)
+            template-instances/fattn-vec-instance-bf16-bf16.cu
+            template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
+            template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu)
    endif()

    ggml_add_backend_library(ggml-cuda
@@ -656,6 +656,140 @@ static void dequantize_row_nvfp4_cuda(
    const int nb = k / QK_NVFP4;
    dequantize_block_nvfp4<<<nb, 32, 0, stream>>>(vx, y, k);
 }
+// ============================================================
+// TurboQuant GPU bulk dequantize kernels (with FWHT)
+// ============================================================
+
+// Each CUDA block processes one 128-element chunk (= 4 turbo blocks).
+// 128 threads per block, one thread per element.
+// Step 1: unpack index + centroid lookup -> shared memory
+// Step 2: FWHT butterfly in shared memory (7 stages for n=128)
+// Step 3: normalize by 1/sqrt(128) and scale by the norm stored in the chunk's first block
+// Step 4: write to output
+
+#define TURBO_HEAD_DIM_GPU 128
+#define TURBO_BLOCKS_PER_CHUNK_GPU (TURBO_HEAD_DIM_GPU / 32)  // 4
+
+// Bulk dequantize kernel for TURBO3_0: unpack -> codebook lookup -> FWHT(128) -> rescale.
+//
+// Launch layout: one CUDA block per 128-element chunk (4 consecutive block_turbo3_0) with
+// TURBO_HEAD_DIM_GPU threads, one thread per output element.
+//   vx: packed block_turbo3_0 array
+//   y:  output buffer receiving k dequantized values
+//   k:  total number of elements (the host launcher asserts k % 128 == 0)
+template <typename dst_t>
+static __global__ void dequantize_block_turbo3_0_kernel(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    __shared__ float smem[TURBO_HEAD_DIM_GPU];
+
+    const int64_t chunk_idx = blockIdx.x;
+    const int tid = threadIdx.x;  // 0..127
+
+    const int64_t output_offset = chunk_idx * TURBO_HEAD_DIM_GPU;
+
+    // Out-of-range threads are masked rather than returning early: every thread of the block
+    // has to reach the __syncthreads() barriers below, and a thread that exits before a
+    // barrier while others wait on it is a divergent-barrier hazard.
+    const bool in_range = output_offset + tid < k;
+
+    float centroid = 0.0f;
+    if (in_range) {
+        // Which of the 4 blocks within this chunk does this thread belong to?
+        const int local_block   = tid / TURBO3_BLOCK_SIZE;  // 0..3
+        const int elem_in_block = tid % TURBO3_BLOCK_SIZE;  // 0..31
+
+        const int64_t global_block_idx = chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU + local_block;
+        const block_turbo3_0 * x = (const block_turbo3_0 *)vx + global_block_idx;
+        const uint8_t * qs = x->qs;
+
+        // Unpack the 3-bit index: element j starts at bit 3*j of qs; when it starts at bit 6
+        // or 7 of a byte, the remaining high bits come from the next byte.
+        const int bit_off  = elem_in_block * 3;
+        const int byte_idx = bit_off / 8;
+        const int shift    = bit_off % 8;
+        uint16_t raw = (uint16_t)qs[byte_idx] >> shift;
+        if (shift > 5 && byte_idx + 1 < 12)
+            raw |= (uint16_t)qs[byte_idx + 1] << (8 - shift);
+
+        centroid = dc_codebook_3bit[raw & 0x07];
+    }
+    smem[tid] = centroid;
+    __syncthreads();
+
+    // FWHT butterfly stages (7 stages for n=128)
+    for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) {
+        if (tid < 64) {  // 128/2 = 64 butterflies per stage
+            int group = tid / h;
+            int pos = tid % h;
+            int i = group * h * 2 + pos;
+            float a = smem[i];
+            float b = smem[i + h];
+            smem[i]     = a + b;
+            smem[i + h] = a - b;
+        }
+        __syncthreads();
+    }
+
+    if (in_range) {
+        // Normalize by 1/sqrt(128) and scale by the norm stored in the chunk's first block.
+        const float fwht_scale = 0.08838834764831844f;  // 1/sqrt(128)
+        const block_turbo3_0 * first_block = (const block_turbo3_0 *)vx + chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU;
+        const float norm = __half2float(first_block->d);
+
+        // Each thread only consumes its own smem slot here, so no further barrier (and no
+        // write-back to shared memory) is needed before the store.
+        y[output_offset + tid] = ggml_cuda_cast<dst_t>(smem[tid] * (fwht_scale * norm));
+    }
+}
+
+// Host launcher for the TURBO3_0 bulk dequantize kernel.
+// Launches one block of TURBO_HEAD_DIM_GPU threads per 128-element chunk on `stream`;
+// k (total element count) must be a multiple of TURBO_HEAD_DIM_GPU.
+template <typename dst_t>
+static void dequantize_row_turbo3_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    GGML_ASSERT(k % TURBO_HEAD_DIM_GPU == 0);
+    const int64_t chunk_count = k / TURBO_HEAD_DIM_GPU;
+    const int     grid_size   = (int) chunk_count;
+    dequantize_block_turbo3_0_kernel<<<grid_size, TURBO_HEAD_DIM_GPU, 0, stream>>>(vx, y, k);
+}
+
+// Bulk dequantize kernel for TURBO4_0: unpack -> codebook lookup -> FWHT(128) -> rescale.
+//
+// Launch layout: one CUDA block per 128-element chunk (4 consecutive block_turbo4_0) with
+// TURBO_HEAD_DIM_GPU threads, one thread per output element.
+//   vx: packed block_turbo4_0 array
+//   y:  output buffer receiving k dequantized values
+//   k:  total number of elements (the host launcher asserts k % 128 == 0)
+template <typename dst_t>
+static __global__ void dequantize_block_turbo4_0_kernel(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    __shared__ float smem[TURBO_HEAD_DIM_GPU];
+
+    const int64_t chunk_idx = blockIdx.x;
+    const int tid = threadIdx.x;  // 0..127
+
+    const int64_t output_offset = chunk_idx * TURBO_HEAD_DIM_GPU;
+
+    // Out-of-range threads are masked rather than returning early: every thread of the block
+    // has to reach the __syncthreads() barriers below, and a thread that exits before a
+    // barrier while others wait on it is a divergent-barrier hazard.
+    const bool in_range = output_offset + tid < k;
+
+    float centroid = 0.0f;
+    if (in_range) {
+        // Which of the 4 blocks within this chunk does this thread belong to?
+        const int local_block   = tid / TURBO4_BLOCK_SIZE;  // 0..3
+        const int elem_in_block = tid % TURBO4_BLOCK_SIZE;  // 0..31
+
+        const int64_t global_block_idx = chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU + local_block;
+        const block_turbo4_0 * x = (const block_turbo4_0 *)vx + global_block_idx;
+
+        // Two 4-bit indices per byte: even elements in the low nibble, odd in the high nibble.
+        const uint8_t packed = x->qs[elem_in_block / 2];
+        const uint8_t idx = (elem_in_block & 1) ? ((packed >> 4) & 0x0F) : (packed & 0x0F);
+
+        centroid = dc_codebook_4bit[idx];
+    }
+    smem[tid] = centroid;
+    __syncthreads();
+
+    // FWHT butterfly stages (7 stages for n=128)
+    for (int h = 1; h < TURBO_HEAD_DIM_GPU; h *= 2) {
+        if (tid < 64) {  // 128/2 = 64 butterflies per stage
+            int group = tid / h;
+            int pos = tid % h;
+            int i = group * h * 2 + pos;
+            float a = smem[i];
+            float b = smem[i + h];
+            smem[i]     = a + b;
+            smem[i + h] = a - b;
+        }
+        __syncthreads();
+    }
+
+    if (in_range) {
+        // Normalize by 1/sqrt(128) and scale by the norm stored in the chunk's first block.
+        const float fwht_scale = 0.08838834764831844f;  // 1/sqrt(128)
+        const block_turbo4_0 * first_block = (const block_turbo4_0 *)vx + chunk_idx * TURBO_BLOCKS_PER_CHUNK_GPU;
+        const float norm = __half2float(first_block->d);
+
+        // Each thread only consumes its own smem slot here, so no further barrier (and no
+        // write-back to shared memory) is needed before the store.
+        y[output_offset + tid] = ggml_cuda_cast<dst_t>(smem[tid] * (fwht_scale * norm));
+    }
+}
+
+// Host launcher for the TURBO4_0 bulk dequantize kernel.
+// Launches one block of TURBO_HEAD_DIM_GPU threads per 128-element chunk on `stream`;
+// k (total element count) must be a multiple of TURBO_HEAD_DIM_GPU.
+template <typename dst_t>
+static void dequantize_row_turbo4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    GGML_ASSERT(k % TURBO_HEAD_DIM_GPU == 0);
+    const int64_t chunk_count = k / TURBO_HEAD_DIM_GPU;
+    const int     grid_size   = (int) chunk_count;
+    dequantize_block_turbo4_0_kernel<<<grid_size, TURBO_HEAD_DIM_GPU, 0, stream>>>(vx, y, k);
+}
+
 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(
        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
@@ -756,6 +890,10 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
            return dequantize_row_mxfp4_cuda;
        case GGML_TYPE_NVFP4:
            return dequantize_row_nvfp4_cuda;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_row_turbo3_0_cuda;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_row_turbo4_0_cuda;
        case GGML_TYPE_F32:
            return convert_unary_cont_cuda<float>;
        case GGML_TYPE_BF16:
@@ -809,6 +947,10 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_mxfp4_cuda;
        case GGML_TYPE_NVFP4:
            return dequantize_row_nvfp4_cuda;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_row_turbo3_0_cuda;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_row_turbo4_0_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cont_cuda<half>;
        case GGML_TYPE_BF16:
@@ -832,6 +974,10 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
        case GGML_TYPE_BF16:
            return convert_unary_cuda<nv_bfloat16>;
        default:
@@ -853,6 +999,10 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
        case GGML_TYPE_F16:
            return convert_unary_cuda<half, nv_bfloat16>;
        default:
@@ -874,6 +1024,10 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_TURBO3_0:
+            return dequantize_block_cuda<QK_TURBO3, QR_TURBO3, dequantize_turbo3_0>;
+        case GGML_TYPE_TURBO4_0:
+            return dequantize_block_cuda<QK_TURBO4, QR_TURBO4, dequantize_turbo4_0>;
        case GGML_TYPE_BF16:
            return convert_unary_cuda<nv_bfloat16, float>;
        default:
@@ -211,6 +211,93 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
    quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
 }

+// ============================================================
+// TurboQuant GPU quantize device functions
+// ============================================================
+
+// Device-side codebook tables for the quantize path (same values as in dequantize.cuh).
+// These are file-local (static) copies, not extern references to the __constant__ arrays
+// in dequantize.cuh; the tables are duplicated here to avoid linkage issues.
+__device__ static const float tq_codebook_3bit_q[8] = {  // 8 centroids, ascending order
+    -0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f,
+     0.0216041461f,  0.0665854520f,  0.1181396281f,  0.1883970748f
+};
+
+__device__ static const float tq_codebook_4bit_q[16] = {  // 16 centroids, ascending order
+    -0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f,
+    -0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f,
+     0.0112761586f,  0.0341139667f,  0.0577250301f,  0.0827738972f,
+     0.1102295202f,  0.1417455465f,  0.1807794468f,  0.2376153882f
+};
+
+// Linear scan for the codebook entry closest (in absolute difference) to val.
+// Returns its index; on ties the lowest index wins because only a strictly smaller
+// distance replaces the current best. `n` is the number of codebook entries (>= 1).
+static __device__ uint8_t tq_nearest_codebook(float val, const float *codebook, int n) {
+    int   winner   = 0;
+    float min_dist = fabsf(val - codebook[0]);
+    for (int c = 1; c < n; ++c) {
+        const float d = fabsf(val - codebook[c]);
+        if (!(d < min_dist)) {
+            continue;
+        }
+        min_dist = d;
+        winner   = c;
+    }
+    return (uint8_t) winner;
+}
+
+// Quantizes one block of TURBO3_BLOCK_SIZE floats from x into y:
+// stores the block's L2 norm as FP16, then maps each normalized element to its nearest
+// 3-bit codebook entry and packs the 32 indices into the 12 bytes of y->qs.
+// NOTE(review): this per-block path applies no FWHT and normalizes per 32-element block;
+// confirm it is compatible with the 128-element FWHT-based bulk dequantize kernels.
+static __device__ void quantize_f32_turbo3_0_block(const float * __restrict__ x, block_turbo3_0 * __restrict__ y) {
+    // L2 norm of the block
+    float sum_sq = 0.0f;
+    for (int i = 0; i < TURBO3_BLOCK_SIZE; ++i) {
+        sum_sq += x[i] * x[i];
+    }
+    const float norm = sqrtf(sum_sq);
+    y->d = __float2half(norm);
+
+    // A (near-)zero block is quantized as all zeros scaled input
+    const float scale = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
+
+    // Quantize each normalized element and OR its 3-bit code into the packed array.
+    // Element i starts at bit 3*i; codes starting at bit 6 or 7 of a byte spill into the next byte.
+    memset(y->qs, 0, 12);
+    for (int i = 0; i < TURBO3_BLOCK_SIZE; ++i) {
+        const uint8_t code = tq_nearest_codebook(x[i] * scale, tq_codebook_3bit_q, 8) & 0x07;
+
+        const int bit_pos = 3 * i;
+        const int byte    = bit_pos / 8;
+        const int shift   = bit_pos % 8;
+
+        y->qs[byte] |= (uint8_t)(code << shift);
+        if (shift > 5 && byte + 1 < 12) {
+            y->qs[byte + 1] |= (uint8_t)(code >> (8 - shift));
+        }
+    }
+}
+
+// Quantizes one block of TURBO4_BLOCK_SIZE floats from x into y:
+// stores the block's L2 norm as FP16, then maps each normalized element to its nearest
+// 4-bit codebook entry; elements 2p and 2p+1 share byte p (low nibble, high nibble).
+// NOTE(review): this per-block path applies no FWHT and normalizes per 32-element block;
+// confirm it is compatible with the 128-element FWHT-based bulk dequantize kernels.
+static __device__ void quantize_f32_turbo4_0_block(const float * __restrict__ x, block_turbo4_0 * __restrict__ y) {
+    // L2 norm of the block
+    float sum_sq = 0.0f;
+    for (int i = 0; i < TURBO4_BLOCK_SIZE; ++i) {
+        sum_sq += x[i] * x[i];
+    }
+    const float norm = sqrtf(sum_sq);
+    y->d = __float2half(norm);
+
+    const float scale = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
+
+    // One output byte per pair of consecutive elements
+    for (int p = 0; p < TURBO4_BLOCK_SIZE / 2; ++p) {
+        const float * pair = x + 2*p;
+        const uint8_t lo = tq_nearest_codebook(pair[0] * scale, tq_codebook_4bit_q, 16);
+        const uint8_t hi = tq_nearest_codebook(pair[1] * scale, tq_codebook_4bit_q, 16);
+        y->qs[p] = (lo & 0x0F) | ((hi & 0x0F) << 4);
+    }
+}
+
+// Byte-pointer adapter: reinterprets cxi as a float block and cdsti as a block_turbo3_0,
+// then quantizes the former into the latter.
+static __device__ void cpy_blck_f32_turbo3_0(const char * cxi, char * cdsti) {
+    const float    * src_f32 = (const float *) cxi;
+    block_turbo3_0 * dst_blk = (block_turbo3_0 *) cdsti;
+    quantize_f32_turbo3_0_block(src_f32, dst_blk);
+}
+
+// Byte-pointer adapter: reinterprets cxi as a float block and cdsti as a block_turbo4_0,
+// then quantizes the former into the latter.
+static __device__ void cpy_blck_f32_turbo4_0(const char * cxi, char * cdsti) {
+    const float    * src_f32 = (const float *) cxi;
+    block_turbo4_0 * dst_blk = (block_turbo4_0 *) cdsti;
+    quantize_f32_turbo4_0_block(src_f32, dst_blk);
+}
+
 template<typename src_t, typename dst_t>
 static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
    *(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
@@ -75,3 +75,67 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
    v.x *= d;
    v.y *= d;
 }
+
+// ============================================================
+// TurboQuant GPU dequantize device functions
+// ============================================================
+
+__device__ __constant__ static float dc_codebook_3bit[8] = {  // 3-bit codebook: 8 centroids in ascending order, indexed by the unpacked 3-bit code
+    -0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f,
+     0.0216041461f,  0.0665854520f,  0.1181396281f,  0.1883970748f
+};
+
+__device__ __constant__ static float dc_codebook_4bit[16] = {  // 4-bit codebook: 16 centroids in ascending order, indexed by the unpacked 4-bit code
+    -0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f,
+    -0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f,
+     0.0112761586f,  0.0341139667f,  0.0577250301f,  0.0827738972f,
+     0.1102295202f,  0.1417455465f,  0.1807794468f,  0.2376153882f
+};
+
+static __device__ __forceinline__ void dequantize_turbo3_0(
+    const void * vx, const int64_t ib, const int iqs, float2 & v)
+{
+    const block_turbo3_0 * x = (const block_turbo3_0 *) vx + ib;
+    const uint8_t * qs = x->qs;
+
+    // Unpack two consecutive 3-bit indices
+    int elem0 = iqs * 2;
+    int elem1 = iqs * 2 + 1;
+
+    // Extract 3-bit value for elem0
+    int bit_off0 = elem0 * 3;
+    int byte0 = bit_off0 / 8;
+    int shift0 = bit_off0 % 8;
+    uint16_t raw0 = (uint16_t)qs[byte0] >> shift0;
+    if (shift0 > 5 && byte0 + 1 < 12)
+        raw0 |= (uint16_t)qs[byte0 + 1] << (8 - shift0);
+    uint8_t idx0 = (uint8_t)(raw0 & 0x07);
+
+    // Extract 3-bit value for elem1
+    int bit_off1 = elem1 * 3;
+    int byte1 = bit_off1 / 8;
+    int shift1 = bit_off1 % 8;
+    uint16_t raw1 = (uint16_t)qs[byte1] >> shift1;
+    if (shift1 > 5 && byte1 + 1 < 12)
+        raw1 |= (uint16_t)qs[byte1 + 1] << (8 - shift1);
+    uint8_t idx1 = (uint8_t)(raw1 & 0x07);
+
+    const float norm = __half2float(x->d);
+    v.x = dc_codebook_3bit[idx0] * norm;
+    v.y = dc_codebook_3bit[idx1] * norm;
+}
+
+// Dequantizes one pair of consecutive TURBO4_0 elements.
+//   vx:  packed block_turbo4_0 array
+//   ib:  block index into vx
+//   iqs: byte index within the block; the byte holds two 4-bit codes
+//   v:   receives codebook[index] * block norm (x = low nibble, y = high nibble)
+// Only a codebook lookup and norm scaling are performed here; no FWHT is applied.
+static __device__ __forceinline__ void dequantize_turbo4_0(
+    const void * vx, const int64_t ib, const int iqs, float2 & v)
+{
+    const block_turbo4_0 * blk = (const block_turbo4_0 *) vx + ib;
+
+    const float   norm = __half2float(blk->d);
+    const uint8_t byte = blk->qs[iqs];
+
+    const uint8_t lo_code = byte & 0x0F;
+    const uint8_t hi_code = (byte >> 4) & 0x0F;
+
+    v.x = dc_codebook_4bit[lo_code] * norm;
+    v.y = dc_codebook_4bit[hi_code] * norm;
+}
@@ -3,6 +3,7 @@
 #include "common.cuh"
 #include "convert.cuh"
 #include "vecdotq.cuh"
+#include "dequantize.cuh"

 #include <cstdint>

@@ -577,6 +578,59 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict
    }
 }

+// ============================================================
+// TurboQuant V-cache dequantize functions for flash attention
+// ============================================================
+
+// Flash-attention V dequantizer for TURBO3_0: converts `ne` consecutive elements starting
+// at element index i0 of vx and stores them to dst as T.
+// Values are produced two at a time by dequantize_turbo3_0 (codebook lookup scaled by the
+// block norm). The block index is computed once from i0, so all `ne` elements are read
+// from that single block.
+// NOTE(review): the pair index drops the low bit of i0, i.e. i0 is assumed even — confirm
+// against the callers.
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_turbo3_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
+    const int64_t ib  = i0 / QK_TURBO3;
+    const int     iqs = (int)(i0 % QK_TURBO3) / 2;
+
+    static_assert(ne % 2 == 0, "bad ne");
+    T * dst_t = (T *) dst;
+
+#pragma unroll
+    for (int l = 0; l < ne/2; ++l) {
+        float2 v;
+        dequantize_turbo3_0(vx, ib, iqs + l, v);
+        if constexpr (std::is_same_v<T, half>) {
+            dst_t[2*l + 0] = __float2half(v.x);
+            dst_t[2*l + 1] = __float2half(v.y);
+        } else {
+            dst_t[2*l + 0] = (T)v.x;
+            dst_t[2*l + 1] = (T)v.y;
+        }
+    }
+}
+
+// Flash-attention V dequantizer for TURBO4_0: converts `ne` consecutive elements starting
+// at element index i0 of vx and stores them to dst as T.
+// Values are produced two at a time by dequantize_turbo4_0 (codebook lookup scaled by the
+// block norm). The block index is computed once from i0, so all `ne` elements are read
+// from that single block.
+// NOTE(review): the pair index drops the low bit of i0, i.e. i0 is assumed even — confirm
+// against the callers.
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_turbo4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
+    const int64_t ib  = i0 / QK_TURBO4;
+    const int     iqs = (int)(i0 % QK_TURBO4) / 2;
+
+    static_assert(ne % 2 == 0, "bad ne");
+    T * dst_t = (T *) dst;
+
+#pragma unroll
+    for (int l = 0; l < ne/2; ++l) {
+        float2 v;
+        dequantize_turbo4_0(vx, ib, iqs + l, v);
+        if constexpr (std::is_same_v<T, half>) {
+            dst_t[2*l + 0] = __float2half(v.x);
+            dst_t[2*l + 1] = __float2half(v.y);
+        } else {
+            dst_t[2*l + 0] = (T)v.x;
+            dst_t[2*l + 1] = (T)v.y;
+        }
+    }
+}
+
 template <ggml_type type_K, int D, int nthreads>
 constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
    if constexpr (type_K == GGML_TYPE_F16) {
@@ -593,6 +647,12 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
        return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
    } else if constexpr (type_K == GGML_TYPE_BF16) {
        return vec_dot_fattn_vec_KQ_bf16<D, nthreads>;
+    // TurboQuant K-cache: Phase 1 - dequantize to FP16 before attention
+    // (use FP16 dot product after conversion in the dispatch layer)
+    } else if constexpr (type_K == GGML_TYPE_TURBO3_0) {
+        return vec_dot_fattn_vec_KQ_f16<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_TURBO4_0) {
+        return vec_dot_fattn_vec_KQ_f16<D, nthreads>;
    } else {
        static_assert(type_K == -1, "bad type");
        return nullptr;
@@ -615,6 +675,10 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
        return dequantize_V_q8_0<T, ne>;
    } else if constexpr (type_V == GGML_TYPE_BF16) {
        return dequantize_V_bf16<float, ne>;
+    } else if constexpr (type_V == GGML_TYPE_TURBO3_0) {
+        return dequantize_V_turbo3_0<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_TURBO4_0) {
+        return dequantize_V_turbo4_0<T, ne>;
    } else {
        static_assert(type_V == -1, "bad type");
        return nullptr;
@@ -75,17 +75,20 @@ static __global__ void flash_attn_ext_vec(
 #endif // GGML_USE_HIP

    constexpr int nthreads    = ggml_cuda_fattn_vec_get_nthreads_device();
-    constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q;
-    constexpr int nthreads_V  = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q;
+    constexpr bool K_is_fp_like = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16);
+    constexpr bool V_is_fp_like = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16);
+
+    constexpr int nthreads_KQ = K_is_fp_like ? 128 / cpy_nb : nthreads_KQ_q;
+    constexpr int nthreads_V  = V_is_fp_like ? 128 / cpy_nb : nthreads_V_q;

    static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
    static_assert(WARP_SIZE % nthreads_V  == 0, "bad nthreads_V");

-    constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4;
+    constexpr int V_rows_per_thread = V_is_fp_like ? 2*cpy_ne : 4;
    constexpr int V_cols_per_iter   = WARP_SIZE / nthreads_V;

    constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
-    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16;
+    constexpr bool Q_q8_1 = !K_is_fp_like;
 #ifdef V_DOT2_F32_F16_AVAILABLE
    constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half,  V_rows_per_thread>();
 #else
@@ -598,3 +601,12 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16)
+
+// TurboQuant extern declarations (homogeneous K/V only)
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+
+extern DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+extern DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
@@ -5,6 +5,7 @@
 #include "fattn-vec.cuh"
 #include "fattn-wmma-f16.cuh"
 #include "fattn.cuh"
+#include "convert.cuh"

 template <int DKQ, int DV, int ncols2>
 static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -273,6 +274,8 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t
    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_BF16)
    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_BF16)
    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16)
+
+    // TurboQuant: pre-dequantized to f16 before FA, no turbo vec cases needed
 #else
    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16,  GGML_TYPE_F16)
    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
@@ -292,6 +295,10 @@ enum best_fattn_kernel {
    BEST_FATTN_KERNEL_MMA_F16  = 400,
 };

+// Returns true when `type` is one of the TurboQuant KV-cache types (TURBO3_0 or TURBO4_0).
+static bool ggml_type_is_turbo(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_TURBO3_0:
+        case GGML_TYPE_TURBO4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const ggml_tensor * dst) {
 #ifndef FLASH_ATTN_AVAILABLE
    GGML_UNUSED(device); GGML_UNUSED(dst);
@@ -353,8 +360,13 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    }

 #ifndef GGML_CUDA_FA_ALL_QUANTS
-    if (K->type != V->type) {
-        return BEST_FATTN_KERNEL_NONE;
+    {
+        // Turbo types are pre-dequantized to f16, so treat them as f16 for type matching
+        const ggml_type eff_k = ggml_type_is_turbo(K->type) ? GGML_TYPE_F16 : K->type;
+        const ggml_type eff_v = ggml_type_is_turbo(V->type) ? GGML_TYPE_F16 : V->type;
+        if (eff_k != eff_v) {
+            return BEST_FATTN_KERNEL_NONE;
+        }
    }
 #endif // GGML_CUDA_FA_ALL_QUANTS

@@ -372,6 +384,10 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_BF16:
            break;
+        case GGML_TYPE_TURBO3_0:
+        case GGML_TYPE_TURBO4_0:
+            // Turbo types are handled via pre-dequantize to f16 before FA
+            break;
        default:
            return BEST_FATTN_KERNEL_NONE;
    }
@@ -485,8 +501,72 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    return BEST_FATTN_KERNEL_TILE;
 }

+// Pre-dequantize a turbo tensor to f16, returning a stack-allocated tensor copy.
+// The caller must keep pool_buf alive until after FA completes.
+//
+//   src:      turbo-typed tensor (may be permuted, but must be densely packed, see below)
+//   pool_buf: pool allocation that receives the f16 data; the returned tensor points into it
+//   stream:   CUDA stream the dequantize kernel is enqueued on
+//
+// Returns a by-value copy of *src retyped to F16, with data/nb[] pointing at the
+// dequantized buffer and the view fields cleared.
+static ggml_tensor turbo_pre_dequantize(
+        const ggml_tensor * src,
+        ggml_cuda_pool_alloc<half> & pool_buf,
+        cudaStream_t stream) {
+    const int64_t n_elements = ggml_nelements(src);
+
+    const size_t bs = ggml_blck_size(src->type);
+    const size_t ts = ggml_type_size(src->type);
+
+    // The dequantize below walks n_elements worth of blocks linearly from src->data, and the
+    // stride rescaling further down addresses the f16 buffer with the source's (scaled)
+    // strides. Both are only valid when the tensor's strided footprint is exactly its dense
+    // size, i.e. a contiguous tensor or a permutation of one. A view with gaps or overlapping
+    // strides would otherwise lead to out-of-bounds accesses, so fail loudly instead.
+    GGML_ASSERT(ggml_nbytes(src) == (size_t) n_elements / bs * ts);
+
+    pool_buf.alloc(n_elements);
+
+    to_fp16_cuda_t dequant = ggml_get_to_fp16_cuda(src->type);
+    GGML_ASSERT(dequant != nullptr);
+    dequant(src->data, pool_buf.ptr, n_elements, stream);
+
+    // Scale existing strides from turbo block layout to f16 element layout.
+    // This preserves any permutation (e.g. ggml_permute swapping dims 1 and 2).
+    // The dequantized f16 data is in the same physical order as the turbo data,
+    // so the stride relationships must be preserved, just rescaled.
+    ggml_tensor tmp = *src;
+    tmp.type = GGML_TYPE_F16;
+    tmp.data = pool_buf.ptr;
+    tmp.nb[0] = sizeof(half);
+    tmp.nb[1] = src->nb[1] * bs * sizeof(half) / ts;
+    tmp.nb[2] = src->nb[2] * bs * sizeof(half) / ts;
+    tmp.nb[3] = src->nb[3] * bs * sizeof(half) / ts;
+    // The copy owns its own buffer, so it must not be treated as a view of the source.
+    tmp.view_src  = nullptr;
+    tmp.view_offs = 0;
+
+    return tmp;
+}
+
 void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_set_device(ctx.device);
+
+    const ggml_tensor * K = dst->src[1];
+    const ggml_tensor * V = dst->src[2];
+
+    const bool k_is_turbo = ggml_type_is_turbo(K->type);
+    const bool v_is_turbo = V && ggml_type_is_turbo(V->type);
+
+    // Pre-dequantize turbo KV to f16 so standard FA kernels can handle them.
+    // Pool buffers must outlive the FA dispatch (RAII frees on scope exit).
+    ggml_cuda_pool_alloc<half> k_pool(ctx.pool());
+    ggml_cuda_pool_alloc<half> v_pool(ctx.pool());
+    ggml_tensor k_f16, v_f16;
+
+    cudaStream_t stream = ctx.stream();
+
+    // Save original src pointers
+    ggml_tensor * orig_k = dst->src[1];
+    ggml_tensor * orig_v = dst->src[2];
+
+    if (k_is_turbo) {
+        k_f16 = turbo_pre_dequantize(K, k_pool, stream);
+        dst->src[1] = &k_f16;
+    }
+    if (v_is_turbo) {
+        v_f16 = turbo_pre_dequantize(V, v_pool, stream);
+        dst->src[2] = &v_f16;
+    }
+
+
+    // Standard FA dispatch — now sees f16 tensors
    switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) {
        case BEST_FATTN_KERNEL_NONE:
            GGML_ABORT("fatal error");
@@ -503,6 +583,10 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
            ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);
            break;
    }
+
+    // Restore original src pointers
+    dst->src[1] = orig_k;
+    dst->src[2] = orig_v;
 }

 bool ggml_cuda_flash_attn_ext_supported(int device, const ggml_tensor * dst) {
@@ -4842,7 +4842,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            {
                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
                       op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
-                       op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
+                       op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
+                       op->type == GGML_TYPE_TURBO3_0 || op->type == GGML_TYPE_TURBO4_0) &&
                       op->src[0]->type == GGML_TYPE_F32 &&
                       (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
            } break;
@@ -1,8 +1,16 @@
 #include "set-rows.cuh"
 #include "cpy-utils.cuh"

+#include <cmath>
+
 typedef void (*set_rows_kernel_t)(const char * src, char * dst);

+// ============================================================
+// TurboQuant constants for set-rows FWHT
+// ============================================================
+#define TURBO_HEAD_DIM_SR 128
+#define TURBO_BLOCKS_PER_CHUNK_SR (TURBO_HEAD_DIM_SR / 32)  // 4
+
 // Generic quantized set_rows kernel template
 template <typename idx_t, typename block_type, int qk, void (*quantize_func)(const float *, block_type *)>
 static __global__ void k_set_rows_quant(const float * __restrict__ src0,
@@ -109,6 +117,398 @@ static void set_rows_cuda_quant(
    }
 }

+// ============================================================
+// TurboQuant specialized set-rows kernel with FWHT
+// ============================================================
+// Each CUDA block processes one 128-element chunk.
+// 128 threads per block, one thread per element in the chunk.
+// Steps:
+//   1. Each thread reads one float from the source row
+//   2. Cooperative norm computation via shared memory reduction
+//   3. Normalize the chunk
+//   4. FWHT butterfly in shared memory (7 stages for n=128)
+//   5. Each thread scalar-quantizes its element and packs into blocks
+
+// Device-side reconstruction levels that the turbo set-rows kernels search when quantizing (per the original note these duplicate the tables in cpy-utils.cuh, so keep both in sync)
+__device__ static const float sr_codebook_3bit[8] = {  // 8 levels addressed by a 3-bit index, listed in ascending order
+    -0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f,
+     0.0216041461f,  0.0665854520f,  0.1181396281f,  0.1883970748f
+};
+
+__device__ static const float sr_codebook_4bit[16] = {  // 16 levels addressed by a 4-bit index, listed in ascending order
+    -0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f,
+    -0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f,
+     0.0112761586f,  0.0341139667f,  0.0577250301f,  0.0827738972f,
+     0.1102295202f,  0.1417455465f,  0.1807794468f,  0.2376153882f
+};
+
+static __device__ uint8_t sr_nearest_codebook(float val, const float *codebook, int n) {  // returns the index of the entry of codebook[0..n) closest to val; codebook[0] is read unconditionally, so n must be >= 1
+    float best_dist = fabsf(val - codebook[0]);  // entry 0 is the initial candidate
+    uint8_t best_idx = 0;
+    for (int i = 1; i < n; i++) {  // linear scan over the remaining entries
+        float dist = fabsf(val - codebook[i]);
+        if (dist < best_dist) {  // strict '<': on an exact tie the lower index wins; a NaN val never satisfies this, so index 0 is returned
+            best_dist = dist;
+            best_idx = (uint8_t)i;  // callers in this file pass n = 8 or 16, so the index fits in a byte
+        }
+    }
+    return best_idx;
+}
+
+// Turbo3 set-rows kernel: each CUDA block takes one 128-element chunk of an f32 source row, computes its L2 norm, normalizes, applies the FWHT, maps every element to a 3-bit codebook index and writes four bit-packed turbo3 blocks into the destination row selected through src1.
+template <typename idx_t>
+static __global__ void k_set_rows_turbo3(
+        const float * __restrict__ src0,  // f32 source values
+        const idx_t * __restrict__ src1,  // destination row indices
+        block_turbo3_0 * __restrict__ dst,  // quantized destination buffer
+        const int64_t ne_total_chunks,  // number of 128-element chunks to process (one per CUDA block)
+        const int64_t ne10,  // ne10..ne13 are accepted but not used in the body (see GGML_UNUSED at the end)
+        const int64_t ne11,
+        const int64_t ne12,
+        const int64_t ne13,
+        const int64_t s01,  // s01..s03: src0 strides, added to a float pointer (i.e. counted in elements)
+        const int64_t s02,
+        const int64_t s03,
+        const int64_t s10,  // s10..s12: src1 strides, added to an idx_t pointer (i.e. counted in elements)
+        const int64_t s11,
+        const int64_t s12,
+        const int64_t s1,  // s1..s3: dst strides, added to a char pointer (i.e. counted in bytes)
+        const int64_t s2,
+        const int64_t s3,
+        const int64_t ne00,  // source row length; not read in the body, only ne00_fd is
+        const uint3   ne00_fd,  // ne*_fd: precomputed divisor data consumed by fast_div_modulo / fastmodulo
+        const uint3   ne01_fd,
+        const uint3   ne02_fd,
+        const uint3   ne11_fd,
+        const uint3   ne12_fd) {
+
+    __shared__ float smem[TURBO_HEAD_DIM_SR];  // working copy of the chunk, one slot per thread
+    __shared__ float reduction[TURBO_HEAD_DIM_SR];  // scratch: squared values for the norm, later reused as byte storage for the indices
+
+    const int64_t chunk_global = blockIdx.x;
+    const int tid = threadIdx.x;  // 0..127
+
+    if (chunk_global >= ne_total_chunks) return;  // depends only on blockIdx.x, so a whole block exits together and the barriers below are never reached by a partial block
+
+    // Map the global chunk index to i00 (element offset within a row) + row indices
+    // Each chunk covers 128 elements, so the chunk's base element = chunk_global * 128
+    const int64_t elem_base = chunk_global * TURBO_HEAD_DIM_SR;
+    uint32_t tmp = (uint32_t)elem_base;  // NOTE(review): narrows the flat element index to 32 bits; presumably the total element count always fits - confirm
+    uint2 div_mod;
+
+    div_mod = fast_div_modulo(tmp, ne00_fd);  // the code below uses .y as the remainder and .x as the quotient
+    const int64_t i00 = div_mod.y;  // offset within row (multiple of 128)
+    tmp = div_mod.x;
+
+    div_mod = fast_div_modulo(tmp, ne01_fd);
+    const int64_t i01 = div_mod.y;
+    tmp = div_mod.x;
+
+    div_mod = fast_div_modulo(tmp, ne02_fd);
+    const int64_t i02 = div_mod.y;
+    const int64_t i03 = div_mod.x;
+
+    const int64_t i12 = fastmodulo((uint32_t)i03, ne12_fd);  // presumably wraps the source batch index into src1's dim 2 (broadcast) - confirm
+    const int64_t i11 = fastmodulo((uint32_t)i02, ne11_fd);  // same for src1's dim 1
+    const int64_t i10 = i01;
+
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);  // destination row chosen by the index tensor
+
+    const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
+    const float val = src0_row[i00 + tid];  // one element per thread
+    smem[tid] = val;
+
+    // Step 1: Compute L2 norm via parallel reduction
+    reduction[tid] = val * val;
+    __syncthreads();
+
+    for (int s = 64; s > 0; s >>= 1) {  // halving tree: 128 partial sums fold down into reduction[0]
+        if (tid < s) {
+            reduction[tid] += reduction[tid + s];
+        }
+        __syncthreads();
+    }
+
+    float norm = sqrtf(reduction[0]);  // every thread reads the same total
+    float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;  // a (near-)zero chunk is scaled to all zeros instead of dividing by ~0
+
+    // Step 2: Normalize
+    smem[tid] *= inv_norm;
+    __syncthreads();
+
+    // Step 3: FWHT butterfly stages (7 stages for n=128)
+    for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) {
+        if (tid < 64) {  // 64 butterflies per stage; threads 64..127 only take part in the barrier
+            int group = tid / h;
+            int pos = tid % h;
+            int i = group * h * 2 + pos;  // lower element of this thread's pair; the partner sits h slots above
+            float a = smem[i];
+            float b = smem[i + h];
+            smem[i]     = a + b;
+            smem[i + h] = a - b;
+        }
+        __syncthreads();
+    }
+
+    // Apply 1/sqrt(128) normalization
+    const float fwht_scale = 0.08838834764831844f;
+    smem[tid] *= fwht_scale;
+    __syncthreads();
+
+    // Step 4: Scalar quantize and pack into turbo3 blocks
+    // Each thread quantizes its element
+    uint8_t my_idx = sr_nearest_codebook(smem[tid], sr_codebook_3bit, 8);
+
+    // We need to pack 32 indices per block cooperatively
+    // Use shared memory to collect indices, then pack
+    // Reuse reduction[] as uint8 storage (every thread read reduction[0] before the barrier that follows the normalize step, so the overwrite cannot race with that read)
+    ((uint8_t *)reduction)[tid] = my_idx;
+    __syncthreads();
+
+    // Compute destination block pointer
+    // dst layout: dst_row*s1 + i02*s2 + i03*s3 gives byte offset to row start
+    // Then add block offset for i00
+    block_turbo3_0 * dst_row_ptr = (block_turbo3_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
+    const int64_t dst_block_base = i00 / TURBO3_BLOCK_SIZE;  // index of the chunk's first block within the destination row
+
+    // Only 4 threads (one per block) do the packing
+    if (tid < TURBO_BLOCKS_PER_CHUNK_SR) {
+        const int blk = tid;
+        block_turbo3_0 * dst_block = dst_row_ptr + dst_block_base + blk;
+        const uint8_t * indices = ((const uint8_t *)reduction) + blk * 32;  // this block's 32 indices
+
+        // Store norm (the same chunk norm goes into each of the chunk's blocks)
+        dst_block->d = __float2half(norm);
+
+        // Pack 32 x 3-bit indices into 12 bytes
+        memset(dst_block->qs, 0, 12);
+        for (int j = 0; j < 32; j++) {
+            int bit_off = j * 3;
+            int byte_pos = bit_off / 8;
+            int shift = bit_off % 8;
+            dst_block->qs[byte_pos] |= (uint8_t)((indices[j] & 0x07) << shift);  // low part of the field; bits shifted past bit 7 are dropped by the cast
+            if (shift > 5 && byte_pos + 1 < 12) {  // shift 6 or 7: the field straddles a byte boundary, so spill its high bits into the next byte
+                dst_block->qs[byte_pos + 1] |= (uint8_t)((indices[j] & 0x07) >> (8 - shift));
+            }
+        }
+    }
+
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12);
+    GGML_UNUSED(ne13);
+}
+
+// Turbo4 set-rows kernel: each CUDA block takes one 128-element chunk of an f32 source row, computes its L2 norm, normalizes, applies the FWHT, maps every element to a 4-bit codebook index and writes four nibble-packed turbo4 blocks into the destination row selected through src1.
+template <typename idx_t>
+static __global__ void k_set_rows_turbo4(
+        const float * __restrict__ src0,  // f32 source values
+        const idx_t * __restrict__ src1,  // destination row indices
+        block_turbo4_0 * __restrict__ dst,  // quantized destination buffer
+        const int64_t ne_total_chunks,  // number of 128-element chunks to process (one per CUDA block)
+        const int64_t ne10,  // ne10..ne13 are accepted but not used in the body (see GGML_UNUSED at the end)
+        const int64_t ne11,
+        const int64_t ne12,
+        const int64_t ne13,
+        const int64_t s01,  // s01..s03: src0 strides, added to a float pointer (i.e. counted in elements)
+        const int64_t s02,
+        const int64_t s03,
+        const int64_t s10,  // s10..s12: src1 strides, added to an idx_t pointer (i.e. counted in elements)
+        const int64_t s11,
+        const int64_t s12,
+        const int64_t s1,  // s1..s3: dst strides, added to a char pointer (i.e. counted in bytes)
+        const int64_t s2,
+        const int64_t s3,
+        const int64_t ne00,  // source row length; not read in the body, only ne00_fd is
+        const uint3   ne00_fd,  // ne*_fd: precomputed divisor data consumed by fast_div_modulo / fastmodulo
+        const uint3   ne01_fd,
+        const uint3   ne02_fd,
+        const uint3   ne11_fd,
+        const uint3   ne12_fd) {
+
+    __shared__ float smem[TURBO_HEAD_DIM_SR];  // working copy of the chunk, one slot per thread
+    __shared__ float reduction[TURBO_HEAD_DIM_SR];  // scratch: squared values for the norm, later reused as byte storage for the indices
+
+    const int64_t chunk_global = blockIdx.x;
+    const int tid = threadIdx.x;  // 0..127
+
+    if (chunk_global >= ne_total_chunks) return;  // depends only on blockIdx.x, so a whole block exits together and the barriers below are never reached by a partial block
+
+    // Map the global chunk index to i00 (element offset within a row) + row indices
+    const int64_t elem_base = chunk_global * TURBO_HEAD_DIM_SR;
+    uint32_t tmp = (uint32_t)elem_base;  // NOTE(review): narrows the flat element index to 32 bits; presumably the total element count always fits - confirm
+    uint2 div_mod;
+
+    div_mod = fast_div_modulo(tmp, ne00_fd);  // the code below uses .y as the remainder and .x as the quotient
+    const int64_t i00 = div_mod.y;  // element offset of this chunk inside its row
+    tmp = div_mod.x;
+
+    div_mod = fast_div_modulo(tmp, ne01_fd);
+    const int64_t i01 = div_mod.y;
+    tmp = div_mod.x;
+
+    div_mod = fast_div_modulo(tmp, ne02_fd);
+    const int64_t i02 = div_mod.y;
+    const int64_t i03 = div_mod.x;
+
+    const int64_t i12 = fastmodulo((uint32_t)i03, ne12_fd);  // presumably wraps the source batch index into src1's dim 2 (broadcast) - confirm
+    const int64_t i11 = fastmodulo((uint32_t)i02, ne11_fd);  // same for src1's dim 1
+    const int64_t i10 = i01;
+
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);  // destination row chosen by the index tensor
+
+    const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
+    const float val = src0_row[i00 + tid];  // one element per thread
+    smem[tid] = val;
+
+    // Step 1: Compute L2 norm via parallel reduction
+    reduction[tid] = val * val;
+    __syncthreads();
+
+    for (int s = 64; s > 0; s >>= 1) {  // halving tree: 128 partial sums fold down into reduction[0]
+        if (tid < s) {
+            reduction[tid] += reduction[tid + s];
+        }
+        __syncthreads();
+    }
+
+    float norm = sqrtf(reduction[0]);  // every thread reads the same total
+    float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;  // a (near-)zero chunk is scaled to all zeros instead of dividing by ~0
+
+    // Step 2: Normalize
+    smem[tid] *= inv_norm;
+    __syncthreads();
+
+    // Step 3: FWHT butterfly stages (7 stages for n=128)
+    for (int h = 1; h < TURBO_HEAD_DIM_SR; h *= 2) {
+        if (tid < 64) {  // 64 butterflies per stage; threads 64..127 only take part in the barrier
+            int group = tid / h;
+            int pos = tid % h;
+            int i = group * h * 2 + pos;  // lower element of this thread's pair; the partner sits h slots above
+            float a = smem[i];
+            float b = smem[i + h];
+            smem[i]     = a + b;
+            smem[i + h] = a - b;
+        }
+        __syncthreads();
+    }
+
+    // Apply 1/sqrt(128) normalization
+    const float fwht_scale = 0.08838834764831844f;
+    smem[tid] *= fwht_scale;
+    __syncthreads();
+
+    // Step 4: Scalar quantize and pack into turbo4 blocks
+    uint8_t my_idx = sr_nearest_codebook(smem[tid], sr_codebook_4bit, 16);
+
+    // Collect indices in shared memory (reduction[] is reused as byte storage; every thread read reduction[0] before the barrier that follows the normalize step)
+    ((uint8_t *)reduction)[tid] = my_idx;
+    __syncthreads();
+
+    // Compute destination block pointer (byte offset to the row start, then block offset for i00)
+    block_turbo4_0 * dst_row_ptr = (block_turbo4_0 *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3);
+    const int64_t dst_block_base = i00 / TURBO4_BLOCK_SIZE;  // index of the chunk's first block within the destination row
+
+    // Only 4 threads (one per block) do the packing
+    if (tid < TURBO_BLOCKS_PER_CHUNK_SR) {
+        const int blk = tid;
+        block_turbo4_0 * dst_block = dst_row_ptr + dst_block_base + blk;
+        const uint8_t * indices = ((const uint8_t *)reduction) + blk * 32;  // this block's 32 indices
+
+        // Store norm (the same chunk norm goes into each of the chunk's blocks)
+        dst_block->d = __float2half(norm);
+
+        // Pack 32 x 4-bit indices into 16 bytes
+        for (int j = 0; j < TURBO4_BLOCK_SIZE / 2; j++) {
+            dst_block->qs[j] = (indices[2*j] & 0x0F) | ((indices[2*j + 1] & 0x0F) << 4);  // even index in the low nibble, odd index in the high nibble
+        }
+    }
+
+    GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12);
+    GGML_UNUSED(ne13);
+}
+
+// Dispatch functions for turbo set-rows: convert the byte strides, derive the chunk count and launch one 128-thread CUDA block per 128-element chunk
+template<typename idx_t>
+static void set_rows_cuda_turbo3(
+        const float * src0_d, const idx_t * src1_d, block_turbo3_0 * dst_d,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
+
+    GGML_ASSERT(ne00 % TURBO_HEAD_DIM_SR == 0);  // every source row must split into whole 128-element chunks
+    const int64_t ne_total_chunks = (ne00 * ne01 * ne02 * ne03) / TURBO_HEAD_DIM_SR;
+    const dim3 grid_size((int)ne_total_chunks);  // one CUDA block per chunk; NOTE(review): the count is narrowed to int here - confirm it cannot exceed the grid limit
+    const dim3 block_size(TURBO_HEAD_DIM_SR);  // one thread per element of the chunk
+
+    const int64_t s01 = nb01/sizeof(float);  // src0 strides: bytes -> float elements
+    const int64_t s02 = nb02/sizeof(float);
+    const int64_t s03 = nb03/sizeof(float);
+    const int64_t s10 = nb10/sizeof(idx_t);  // src1 strides: bytes -> index elements
+    const int64_t s11 = nb11/sizeof(idx_t);
+    const int64_t s12 = nb12/sizeof(idx_t);
+    const int64_t s1  = nb1;  // dst strides are passed through in bytes
+    const int64_t s2  = nb2;
+    const int64_t s3  = nb3;
+
+    if (ne_total_chunks > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {  // skip the launch for empty tensors; this also keeps zero out of the divisor setup below
+        const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);  // extents are narrowed to 32 bits for the precomputed divisors
+        const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
+        const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
+        const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
+        const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
+
+        k_set_rows_turbo3<idx_t><<<grid_size, block_size, 0, stream>>>(
+            src0_d, src1_d, dst_d, ne_total_chunks, ne10, ne11, ne12, ne13,
+            s01, s02, s03, s10, s11, s12, s1, s2, s3,
+            ne00, ne00_fd, ne01_fd, ne02_fd, ne11_fd, ne12_fd);
+    }
+}
+
+template<typename idx_t>
+static void set_rows_cuda_turbo4(  // turbo4 launcher: converts the byte strides, derives the chunk count and launches one 128-thread CUDA block per 128-element chunk
+        const float * src0_d, const idx_t * src1_d, block_turbo4_0 * dst_d,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
+
+    GGML_ASSERT(ne00 % TURBO_HEAD_DIM_SR == 0);  // every source row must split into whole 128-element chunks
+    const int64_t ne_total_chunks = (ne00 * ne01 * ne02 * ne03) / TURBO_HEAD_DIM_SR;
+    const dim3 grid_size((int)ne_total_chunks);  // one CUDA block per chunk; NOTE(review): the count is narrowed to int here - confirm it cannot exceed the grid limit
+    const dim3 block_size(TURBO_HEAD_DIM_SR);  // one thread per element of the chunk
+
+    const int64_t s01 = nb01/sizeof(float);  // src0 strides: bytes -> float elements
+    const int64_t s02 = nb02/sizeof(float);
+    const int64_t s03 = nb03/sizeof(float);
+    const int64_t s10 = nb10/sizeof(idx_t);  // src1 strides: bytes -> index elements
+    const int64_t s11 = nb11/sizeof(idx_t);
+    const int64_t s12 = nb12/sizeof(idx_t);
+    const int64_t s1  = nb1;  // dst strides are passed through in bytes
+    const int64_t s2  = nb2;
+    const int64_t s3  = nb3;
+
+    if (ne_total_chunks > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {  // skip the launch for empty tensors; this also keeps zero out of the divisor setup below
+        const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);  // extents are narrowed to 32 bits for the precomputed divisors
+        const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
+        const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
+        const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
+        const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
+
+        k_set_rows_turbo4<idx_t><<<grid_size, block_size, 0, stream>>>(
+            src0_d, src1_d, dst_d, ne_total_chunks, ne10, ne11, ne12, ne13,
+            s01, s02, s03, s10, s11, s12, s1, s2, s3,
+            ne00, ne00_fd, ne01_fd, ne02_fd, ne11_fd, ne12_fd);
+    }
+}
+
 template <typename src_t, typename idx_t, typename dst_t>
 static __global__ void k_set_rows(const src_t * __restrict__ src0,
                                  const idx_t * __restrict__ src1,
@@ -309,6 +709,28 @@ static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * s
            nb1, nb2, nb3,
            stream
        );
+    } else if (dst->type == GGML_TYPE_TURBO3_0) {
+        // FWHT-aware 128-thread kernels for correct TurboQuant encoding
+        set_rows_cuda_turbo3<idx_t>(
+            src0_d, src1_d, (block_turbo3_0*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else if (dst->type == GGML_TYPE_TURBO4_0) {
+        // FWHT-aware 128-thread kernels for correct TurboQuant encoding
+        set_rows_cuda_turbo4<idx_t>(
+            src0_d, src1_d, (block_turbo4_0*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
    } else {
        GGML_ABORT("unsupported type %s", ggml_type_name(dst->type));
    }
@@ -0,0 +1,7 @@
+// TurboQuant flash attention template instances: K and V both TURBO3_0, for the values 64, 128 and 256 of the macro's first argument (presumably the head size - confirm against fattn-vec.cuh)
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO3_0, GGML_TYPE_TURBO3_0);
@@ -0,0 +1,7 @@
+// TurboQuant flash attention template instances: K and V both TURBO4_0, for the values 64, 128 and 256 of the macro's first argument (presumably the head size - confirm against fattn-vec.cuh)
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_TURBO4_0, GGML_TYPE_TURBO4_0);
@@ -75,7 +75,9 @@ else()
        ../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu
        ../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu
        ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu
-        ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu)
+        ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo3_0-turbo3_0.cu
+        ../ggml-cuda/template-instances/fattn-vec-instance-turbo4_0-turbo4_0.cu)
 endif()

 ggml_add_backend_library(ggml-hip
@@ -494,6 +494,342 @@ void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_REST
    }
 }

+// ============================================================
+// TurboQuant codebook data (Lloyd-Max optimal for d=128 after WHT)
+// ============================================================
+
+static const float turbo_codebook_3bit[8] = {  // 8 reconstruction levels addressed by a 3-bit index, ascending
+    -0.1883972972f, -0.1181399059f, -0.0665857641f, -0.0216044751f,
+     0.0216041461f,  0.0665854520f,  0.1181396281f,  0.1883970748f
+};
+
+static const float turbo_codebook_4bit[16] = {  // 16 reconstruction levels addressed by a 4-bit index, ascending
+    -0.2376389871f, -0.1808080141f, -0.1417777640f, -0.1102646123f,
+    -0.0828112376f, -0.0577640422f, -0.0341540905f, -0.0113168380f,
+     0.0112761586f,  0.0341139667f,  0.0577250301f,  0.0827738972f,
+     0.1102295202f,  0.1417455465f,  0.1807794468f,  0.2376153882f
+};
+
+// ============================================================
+// TurboQuant helper: pack/unpack bit-packed indices
+// ============================================================
+
+static void turbo_pack3(const uint8_t *indices, uint8_t *out) {
+    // Pack 32 x 3-bit values into 12 bytes (96 bits); value i occupies bits [3*i, 3*i + 3) counted from bit 0 of out[0]
+    memset(out, 0, 12);  // fields are OR-ed in below, so the output must start cleared
+    for (int i = 0; i < 32; i++) {
+        int bit_off = i * 3;
+        int byte_idx = bit_off / 8;
+        int shift = bit_off % 8;
+        out[byte_idx] |= (uint8_t)((indices[i] & 0x07) << shift);  // low part of the field; bits shifted past bit 7 are dropped by the cast
+        if (shift > 5 && byte_idx + 1 < 12) {  // shift 6 or 7: the field straddles a byte boundary (the bound test is defensive: the last field starts at bit 93 with shift 5)
+            out[byte_idx + 1] |= (uint8_t)((indices[i] & 0x07) >> (8 - shift));  // remaining high bits go to the bottom of the next byte
+        }
+    }
+}
+
+static void turbo_unpack3(const uint8_t *packed, uint8_t *indices) {
+    // Unpack 12 bytes into 32 x 3-bit values (reads the bit layout written by turbo_pack3)
+    for (int i = 0; i < 32; i++) {
+        int bit_off = i * 3;
+        int byte_idx = bit_off / 8;
+        int shift = bit_off % 8;
+        uint16_t raw = (uint16_t)packed[byte_idx] >> shift;  // bits of the field that live in the first byte
+        if (shift > 5 && byte_idx + 1 < 12) {  // field continues in the next byte
+            raw |= (uint16_t)packed[byte_idx + 1] << (8 - shift);  // splice the next byte in directly above them
+        }
+        indices[i] = (uint8_t)(raw & 0x07);  // keep only the 3 field bits
+    }
+}
+
+static void turbo_pack4(const uint8_t *indices, uint8_t *out) {
+    // Pack 32 x 4-bit values into 16 bytes, two per byte
+    for (int i = 0; i < 16; i++) {
+        out[i] = (indices[2*i] & 0x0F) | ((indices[2*i + 1] & 0x0F) << 4);  // even index in the low nibble, odd index in the high nibble
+    }
+}
+
+static void turbo_unpack4(const uint8_t *packed, uint8_t *indices) {
+    // Unpack 16 bytes into 32 x 4-bit values (reads the nibble layout written by turbo_pack4)
+    for (int i = 0; i < 16; i++) {
+        indices[2*i]     = packed[i] & 0x0F;  // low nibble -> even index
+        indices[2*i + 1] = (packed[i] >> 4) & 0x0F;  // high nibble -> odd index
+    }
+}
+
+static uint8_t turbo_quantize_scalar(float val, const float *codebook, int n_codes) {
+    // Find nearest codebook entry (linear scan - codebook is sorted); returns its index. codebook[0] is read unconditionally, so n_codes must be >= 1.
+    float best_dist = fabsf(val - codebook[0]);  // entry 0 is the initial candidate
+    uint8_t best_idx = 0;
+    for (int i = 1; i < n_codes; i++) {
+        float dist = fabsf(val - codebook[i]);
+        if (dist < best_dist) {  // strict '<': on an exact tie the lower index wins; a NaN val never satisfies this, so index 0 is returned
+            best_dist = dist;
+            best_idx = (uint8_t)i;
+        }
+    }
+    return best_idx;
+}
+
+// ============================================================
+// TurboQuant FWHT (Fast Walsh-Hadamard Transform)
+// Self-inverse with 1/sqrt(n) normalization.
+// ============================================================
+
+#define TURBO_HEAD_DIM 128  // elements per chunk: the quantizers below compute one L2 norm and one FWHT per chunk of this length
+#define TURBO_BLOCKS_PER_CHUNK (TURBO_HEAD_DIM / 32)  // 4 blocks of 32 = 128
+
+// In-place Fast Walsh-Hadamard Transform of x[0..n), scaled by 1/sqrt(n).
+// With that scaling the transform is its own inverse, so the same routine is
+// the forward rotation when quantizing and the inverse when dequantizing.
+//   x: buffer of n floats, overwritten with the transform
+//   n: transform length; must be a power of two (0 is accepted and touches nothing)
+static void turbo_fwht_f32(float *x, int n) {
+    // Every stage pairs x[j] with x[j + h] for h = 1, 2, 4, ... < n. Unless n is a
+    // power of two the later stages index past x[n - 1] (n = 96 would touch
+    // x[96..127]), so reject such lengths instead of corrupting memory.
+    assert(n >= 0 && (n & (n - 1)) == 0);
+    // Butterfly sums
+    for (int h = 1; h < n; h *= 2) {
+        for (int i = 0; i < n; i += h * 2) {
+            for (int j = i; j < i + h; j++) {
+                float a = x[j];
+                float b = x[j + h];
+                x[j]     = a + b;
+                x[j + h] = a - b;
+            }
+        }
+    }
+    // Normalize by 1/sqrt(n)
+    float scale = 1.0f / sqrtf((float)n);
+    for (int i = 0; i < n; i++) {
+        x[i] *= scale;
+    }
+}
+
+// ============================================================
+// TurboQuant TURBO3_0 (3-bit, 3.5 bpw)
+// ============================================================
+
+void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT src, block_turbo3_0 * GGML_RESTRICT dst, int64_t k) {  // reference quantizer: k floats from src -> k / TURBO3_BLOCK_SIZE turbo3 blocks in dst, processed in chunks of TURBO_HEAD_DIM
+    assert(k % TURBO3_BLOCK_SIZE == 0);
+    assert(k % TURBO_HEAD_DIM == 0);  // norm and FWHT are per 128-element chunk, so partial chunks are not supported
+
+    float tmp[TURBO_HEAD_DIM];  // normalized, then rotated, copy of the current chunk
+    int64_t blocks_done = 0;  // running output block index
+
+    for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) {
+        // Step 1: Compute L2 norm of this head_dim chunk
+        float sum_sq = 0.0f;
+        for (int i = 0; i < TURBO_HEAD_DIM; i++) {
+            sum_sq += src[offset + i] * src[offset + i];
+        }
+        float norm = sqrtf(sum_sq);
+
+        // Step 2: Normalize the chunk
+        float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;  // a (near-)zero chunk becomes all zeros instead of dividing by ~0
+        for (int i = 0; i < TURBO_HEAD_DIM; i++) {
+            tmp[i] = src[offset + i] * inv_norm;
+        }
+
+        // Step 3: Apply FWHT rotation
+        turbo_fwht_f32(tmp, TURBO_HEAD_DIM);
+
+        // Step 4: Scalar quantize + pack, one block at a time
+        for (int blk = 0; blk < TURBO_BLOCKS_PER_CHUNK; blk++) {
+            uint8_t indices[32];  // sized for one block; assumes TURBO3_BLOCK_SIZE == 32
+            for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) {
+                indices[i] = turbo_quantize_scalar(tmp[blk * TURBO3_BLOCK_SIZE + i], turbo_codebook_3bit, 8);
+            }
+            // Store same norm in every block of this chunk
+            dst[blocks_done].d = GGML_FP32_TO_FP16(norm);
+            turbo_pack3(indices, dst[blocks_done].qs);
+            blocks_done++;
+        }
+    }
+}
+
+static void quantize_row_turbo3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t k) {  // void*-destination adapter around the reference quantizer; NOTE(review): no caller is visible in this part of the file - confirm it is referenced
+    quantize_row_turbo3_0_ref(src, (block_turbo3_0 *)dst, k);
+}
+
+void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {  // decodes k / TURBO3_BLOCK_SIZE turbo3 blocks from x into k floats in y
+    assert(k % TURBO3_BLOCK_SIZE == 0);
+    const int64_t num_blocks = k / TURBO3_BLOCK_SIZE;
+
+    // Pass 1: Unpack all blocks and look up centroids
+    for (int64_t b = 0; b < num_blocks; b++) {
+        uint8_t indices[32];
+        turbo_unpack3(x[b].qs, indices);
+        for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) {
+            y[b * TURBO3_BLOCK_SIZE + i] = turbo_codebook_3bit[indices[i]];  // y temporarily holds the rotated, unit-norm values
+        }
+    }
+
+    // Pass 2: Inverse FWHT per head_dim chunk, then scale by norm
+    for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) {
+        int chunk = TURBO_HEAD_DIM;
+        if (offset + chunk > k) chunk = (int)(k - offset);  // NOTE(review): a shortened tail is transformed with length `chunk`; turbo_fwht_f32 pairs x[j] with x[j + h], so a length that is not a power of two (e.g. 96) would index past the tail, and the quantizer only ever emits whole 128-element chunks - confirm callers always pass k % 128 == 0
+
+        // Inverse FWHT (self-inverse with 1/sqrt(n) normalization)
+        turbo_fwht_f32(y + offset, chunk);
+
+        // Read norm from the first block of this chunk
+        float norm = GGML_FP16_TO_FP32(x[offset / TURBO3_BLOCK_SIZE].d);
+        for (int i = 0; i < chunk; i++) {
+            y[offset + i] *= norm;  // restore the chunk's original scale
+        }
+    }
+}
+
+// ============================================================
+// TurboQuant TURBO4_0 (4-bit, 4.5 bpw)
+// ============================================================
+
+void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT src, block_turbo4_0 * GGML_RESTRICT dst, int64_t k) {  // reference quantizer: k floats from src -> k / TURBO4_BLOCK_SIZE turbo4 blocks in dst, processed in chunks of TURBO_HEAD_DIM
+    assert(k % TURBO4_BLOCK_SIZE == 0);
+    assert(k % TURBO_HEAD_DIM == 0);  // norm and FWHT are per 128-element chunk, so partial chunks are not supported
+
+    float tmp[TURBO_HEAD_DIM];  // normalized, then rotated, copy of the current chunk
+    int64_t blocks_done = 0;  // running output block index
+
+    for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) {
+        // Step 1: Compute L2 norm of this head_dim chunk
+        float sum_sq = 0.0f;
+        for (int i = 0; i < TURBO_HEAD_DIM; i++) {
+            sum_sq += src[offset + i] * src[offset + i];
+        }
+        float norm = sqrtf(sum_sq);
+
+        // Step 2: Normalize the chunk
+        float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;  // a (near-)zero chunk becomes all zeros instead of dividing by ~0
+        for (int i = 0; i < TURBO_HEAD_DIM; i++) {
+            tmp[i] = src[offset + i] * inv_norm;
+        }
+
+        // Step 3: Apply FWHT rotation
+        turbo_fwht_f32(tmp, TURBO_HEAD_DIM);
+
+        // Step 4: Scalar quantize + pack, one block at a time
+        for (int blk = 0; blk < TURBO_BLOCKS_PER_CHUNK; blk++) {
+            uint8_t indices[32];  // sized for one block; assumes TURBO4_BLOCK_SIZE == 32
+            for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) {
+                indices[i] = turbo_quantize_scalar(tmp[blk * TURBO4_BLOCK_SIZE + i], turbo_codebook_4bit, 16);
+            }
+            // Store same norm in every block of this chunk
+            dst[blocks_done].d = GGML_FP32_TO_FP16(norm);
+            turbo_pack4(indices, dst[blocks_done].qs);
+            blocks_done++;
+        }
+    }
+}
+
+static void quantize_row_turbo4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t k) {  // void*-destination adapter around the reference quantizer; NOTE(review): no caller is visible in this part of the file - confirm it is referenced
+    quantize_row_turbo4_0_ref(src, (block_turbo4_0 *)dst, k);
+}
+
+void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {  // decodes k / TURBO4_BLOCK_SIZE turbo4 blocks from x into k floats in y
+    assert(k % TURBO4_BLOCK_SIZE == 0);
+    const int64_t num_blocks = k / TURBO4_BLOCK_SIZE;
+
+    // Pass 1: Unpack all blocks and look up centroids
+    for (int64_t b = 0; b < num_blocks; b++) {
+        uint8_t indices[32];
+        turbo_unpack4(x[b].qs, indices);
+        for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) {
+            y[b * TURBO4_BLOCK_SIZE + i] = turbo_codebook_4bit[indices[i]];  // y temporarily holds the rotated, unit-norm values
+        }
+    }
+
+    // Pass 2: Inverse FWHT per head_dim chunk, then scale by norm
+    for (int64_t offset = 0; offset < k; offset += TURBO_HEAD_DIM) {
+        int chunk = TURBO_HEAD_DIM;
+        if (offset + chunk > k) chunk = (int)(k - offset);  // NOTE(review): a shortened tail is transformed with length `chunk`; turbo_fwht_f32 pairs x[j] with x[j + h], so a length that is not a power of two (e.g. 96) would index past the tail, and the quantizer only ever emits whole 128-element chunks - confirm callers always pass k % 128 == 0
+
+        // Inverse FWHT (self-inverse with 1/sqrt(n) normalization)
+        turbo_fwht_f32(y + offset, chunk);
+
+        // Read norm from the first block of this chunk
+        float norm = GGML_FP16_TO_FP32(x[offset / TURBO4_BLOCK_SIZE].d);
+        for (int i = 0; i < chunk; i++) {
+            y[offset + i] *= norm;  // restore the chunk's original scale
+        }
+    }
+}
+
+// ============================================================
+// TurboQuant vec_dot (for flash attention compatibility)
+// ============================================================
+
+void ggml_vec_dot_turbo3_0(int n, float * GGML_RESTRICT s, size_t bs,  // writes to *s the dot product of n dequantized turbo3 values (vx) with n floats (vy)
+                            const void * GGML_RESTRICT vx, size_t bx,
+                            const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const block_turbo3_0 *x = (const block_turbo3_0 *)vx;
+    const float *y = (const float *)vy;  // vy is read as plain f32 values
+
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);  // strides and row count are ignored: exactly one result is produced
+
+    // Dequantize x into temp buffer (includes inverse FWHT + norm scaling),
+    // then compute dot product with y.
+    float tmp[TURBO_HEAD_DIM];
+    float sum = 0.0f;
+
+    for (int64_t offset = 0; offset < n; offset += TURBO_HEAD_DIM) {
+        int chunk = TURBO_HEAD_DIM;
+        if (offset + chunk > n) chunk = (int)(n - offset);  // NOTE(review): a shortened tail whose length is not a power of two would make turbo_fwht_f32 pair elements beyond `chunk`, and a tail that is not a multiple of the block size leaves part of tmp unfilled - confirm n % 128 == 0 always holds
+        int64_t base_block = offset / TURBO3_BLOCK_SIZE;  // first block of this chunk
+
+        // Unpack + centroid lookup for this chunk
+        for (int blk = 0; blk < chunk / TURBO3_BLOCK_SIZE; blk++) {
+            uint8_t indices[32];
+            turbo_unpack3(x[base_block + blk].qs, indices);
+            for (int i = 0; i < TURBO3_BLOCK_SIZE; i++) {
+                tmp[blk * TURBO3_BLOCK_SIZE + i] = turbo_codebook_3bit[indices[i]];
+            }
+        }
+
+        // Inverse FWHT
+        turbo_fwht_f32(tmp, chunk);
+
+        // Scale by norm and accumulate dot product
+        float norm = GGML_FP16_TO_FP32(x[base_block].d);  // the norm is taken from the chunk's first block
+        for (int i = 0; i < chunk; i++) {
+            sum += tmp[i] * norm * y[offset + i];
+        }
+    }
+    *s = sum;
+}
+
+void ggml_vec_dot_turbo4_0(int n, float * GGML_RESTRICT s, size_t bs,  // writes to *s the dot product of n dequantized turbo4 values (vx) with n floats (vy)
+                            const void * GGML_RESTRICT vx, size_t bx,
+                            const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const block_turbo4_0 *x = (const block_turbo4_0 *)vx;
+    const float *y = (const float *)vy;  // vy is read as plain f32 values
+
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);  // strides and row count are ignored: exactly one result is produced
+
+    // Dequantize x into temp buffer (includes inverse FWHT + norm scaling),
+    // then compute dot product with y.
+    float tmp[TURBO_HEAD_DIM];
+    float sum = 0.0f;
+
+    for (int64_t offset = 0; offset < n; offset += TURBO_HEAD_DIM) {
+        int chunk = TURBO_HEAD_DIM;
+        if (offset + chunk > n) chunk = (int)(n - offset);  // NOTE(review): a shortened tail whose length is not a power of two would make turbo_fwht_f32 pair elements beyond `chunk`, and a tail that is not a multiple of the block size leaves part of tmp unfilled - confirm n % 128 == 0 always holds
+        int64_t base_block = offset / TURBO4_BLOCK_SIZE;  // first block of this chunk
+
+        // Unpack + centroid lookup for this chunk
+        for (int blk = 0; blk < chunk / TURBO4_BLOCK_SIZE; blk++) {
+            uint8_t indices[32];
+            turbo_unpack4(x[base_block + blk].qs, indices);
+            for (int i = 0; i < TURBO4_BLOCK_SIZE; i++) {
+                tmp[blk * TURBO4_BLOCK_SIZE + i] = turbo_codebook_4bit[indices[i]];
+            }
+        }
+
+        // Inverse FWHT
+        turbo_fwht_f32(tmp, chunk);
+
+        // Scale by norm and accumulate dot product
+        float norm = GGML_FP16_TO_FP32(x[base_block].d);  // the norm is taken from the chunk's first block
+        for (int i = 0; i < chunk; i++) {
+            sum += tmp[i] * norm * y[offset + i];
+        }
+    }
+    *s = sum;
+}
+
 //
 // 2-6 bit quantization in super-blocks
 //
@@ -5353,6 +5689,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb);
            } break;
+        case GGML_TYPE_TURBO3_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_turbo3_0, data, nb);
+            } break;
+        case GGML_TYPE_TURBO4_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_turbo4_0, data, nb);
+            } break;
        case GGML_TYPE_IQ1_S:
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
@@ -24,6 +24,9 @@ GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 *
 GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RESTRICT y, int64_t k);

+GGML_API void quantize_row_turbo3_0_ref(const float * GGML_RESTRICT x, block_turbo3_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * GGML_RESTRICT y, int64_t k);
+
 GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
@@ -51,6 +54,9 @@ GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GG
 GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

+GGML_API void dequantize_row_turbo3_0(const block_turbo3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
 GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -904,6 +904,22 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
        .type_size                = 0,
        .is_quantized             = false,
    },
+    [GGML_TYPE_TURBO3_0] = {
+        .type_name                = "turbo3",
+        .blck_size                = TURBO3_BLOCK_SIZE,
+        .type_size                = sizeof(block_turbo3_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_turbo3_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_turbo3_0_ref,
+    },
+    [GGML_TYPE_TURBO4_0] = {
+        .type_name                = "turbo4",
+        .blck_size                = TURBO4_BLOCK_SIZE,
+        .type_size                = sizeof(block_turbo4_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_turbo4_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_turbo4_0_ref,
+    },
 };

 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -132,6 +132,22 @@ llama_kv_cache::llama_kv_cache(
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

+        // TurboQuant requires head_dim=128 for the FWHT transform; K and V head sizes of layer il are checked separately
+        if (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0) {
+            const uint32_t n_embd_head_k = hparams.n_embd_head_k(il);
+            if (n_embd_head_k != 128) {
+                LLAMA_LOG_ERROR("%s: TurboQuant requires head_dim=128, got %d (layer %d)\n", __func__, n_embd_head_k, il); // NOTE(review): n_embd_head_k is uint32_t but formatted with %d
+                throw std::runtime_error("turbo types require head_dim=128");
+            }
+        }
+        if (type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0) { // NOTE(review): evaluated before has_v is computed below — confirm this should also apply when V is not cached
+            const uint32_t n_embd_head_v = hparams.n_embd_head_v(il);
+            if (n_embd_head_v != 128) {
+                LLAMA_LOG_ERROR("%s: TurboQuant requires head_dim=128, got %d (layer %d)\n", __func__, n_embd_head_v, il); // NOTE(review): same %d / uint32_t mismatch as the K branch
+                throw std::runtime_error("turbo types require head_dim=128");
+            }
+        }
+
        const bool has_k = true;
        const bool has_v = !is_mla;

@@ -254,6 +254,10 @@ if (NOT GGML_BACKEND_DL)
    llama_build_and_test(test-quantize-fns.cpp)
    llama_build_and_test(test-quantize-perf.cpp)
    llama_build_and_test(test-rope.cpp)
+
+    # TurboQuant CPU reference tests (FWHT, MSE, bitpack)
+    llama_build(test-turboquant.cpp)
+    llama_test(test-turboquant)
 endif()

 # libmtmd
@@ -0,0 +1,217 @@
+// TurboQuant CPU reference tests: FWHT self-inverse, roundtrip MSE, bit-packing
+//
+// Validates that the quantize/dequantize pipeline in ggml-quants.c produces
+// MSE*d values consistent with the paper (Zandieh et al., ICLR 2026).
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <vector>
+
+static constexpr int HEAD_DIM          = 128;                    // length of every test vector
+static constexpr int BLOCK_SIZE        = 32;                     // elements per quantized block; assumed equal to the turbo types' block size — TODO confirm
+static constexpr int BLOCKS_PER_CHUNK  = HEAD_DIM / BLOCK_SIZE;  // quantized blocks needed for one HEAD_DIM vector
+static constexpr int N_VECTORS         = 10000;                  // number of vectors in the roundtrip MSE tests
+static constexpr int N_ELEMENTS        = N_VECTORS * HEAD_DIM;   // total floats in the roundtrip MSE tests
+
+// Expected MSE*d ranges (paper: TQ3 ~0.034, TQ4 ~0.009 for d=128); each window brackets its paper value
+static constexpr float TQ3_MSE_D_MIN = 0.025f;
+static constexpr float TQ3_MSE_D_MAX = 0.045f;
+static constexpr float TQ4_MSE_D_MIN = 0.005f;
+static constexpr float TQ4_MSE_D_MAX = 0.015f;
+
+// ============================================================
+// FWHT reference (must match ggml-quants.c): in-place butterfly passes over x[0..n),
+// then a 1/sqrt(n) scale. n must be a power of two, otherwise x[j + h] indexes past the end.
+
+static void fwht_f32(float * x, int n) {
+    for (int h = 1; h < n; h *= 2) {              // h: distance between the two elements of a butterfly
+        for (int i = 0; i < n; i += h * 2) {      // i: start of each group of 2*h elements
+            for (int j = i; j < i + h; j++) {
+                float a = x[j];
+                float b = x[j + h];
+                x[j]     = a + b;
+                x[j + h] = a - b;
+            }
+        }
+    }
+    float scale = 1.0f / sqrtf((float)n);         // normalization applied once after all passes
+    for (int i = 0; i < n; i++) {
+        x[i] *= scale;
+    }
+}
+
+// ============================================================
+// Test 1: FWHT self-inverse (FWHT(FWHT(x)) == x). Exercises only the local fwht_f32 reference:
+// transforms a sine-generated vector twice; passes if max abs error < 1e-5. Returns 0 on pass, 1 on failure.
+
+static int test_fwht_self_inverse(void) {
+    printf("  FWHT self-inverse (d=%d)... ", HEAD_DIM);
+
+    float orig[HEAD_DIM];
+    float work[HEAD_DIM];
+
+    for (int i = 0; i < HEAD_DIM; i++) {
+        orig[i] = sinf((float)(i + 1) * 0.7f) * 2.0f;  // deterministic input, amplitude 2
+    }
+    memcpy(work, orig, sizeof(orig));  // orig stays untouched for the comparison below
+
+    fwht_f32(work, HEAD_DIM);
+    fwht_f32(work, HEAD_DIM);  // second application is expected to undo the first
+
+    float max_err = 0.0f;
+    for (int i = 0; i < HEAD_DIM; i++) {
+        float err = fabsf(work[i] - orig[i]);
+        if (err > max_err) { max_err = err; }
+    }
+
+    bool pass = max_err < 1e-5f;
+    printf("max_err=%.2e %s\n", max_err, pass ? "ok" : "FAILED");
+    return pass ? 0 : 1;
+}
+
+// ============================================================
+// Test 2 & 3: Roundtrip MSE for TQ3 and TQ4
+// generate_random_vectors: fills dst[0..n_elements) from a seeded LCG; same seed gives the same data.
+
+static void generate_random_vectors(float * dst, int n_elements, unsigned int seed) {
+    unsigned int state = seed;
+    for (int i = 0; i < n_elements; i++) {
+        state = state * 1664525u + 1013904223u;  // LCG step; wraps modulo the width of unsigned int
+        dst[i] = ((float)(state >> 8) / (float)(1 << 24)) * 2.0f - 1.0f;  // with 32-bit unsigned int: top 24 bits mapped to [-1, 1)
+    }
+}
+
+static int test_roundtrip_mse(ggml_type type, float mse_d_min, float mse_d_max) { // passes (returns 0) when the measured MSE*d lies in [mse_d_min, mse_d_max]; returns 1 otherwise
+    const char * name = ggml_type_name(type);
+    printf("  %s roundtrip MSE*d (n=%d, d=%d)... ", name, N_VECTORS, HEAD_DIM);
+
+    const ggml_type_traits * traits = ggml_get_type_traits(type);
+    assert(traits->from_float_ref != nullptr);
+    assert(traits->to_float != nullptr);
+
+    std::vector<float> src(N_ELEMENTS);
+    std::vector<float> dst(N_ELEMENTS);
+    size_t quant_size = (size_t)N_ELEMENTS / BLOCK_SIZE * ggml_type_size(type); // NOTE(review): relies on BLOCK_SIZE matching the type's block size — confirm, or derive it from the type
+    std::vector<uint8_t> quant(quant_size);
+
+    generate_random_vectors(src.data(), N_ELEMENTS, 42); // fixed seed keeps the measured MSE reproducible
+
+    traits->from_float_ref(src.data(), quant.data(), N_ELEMENTS); // quantize all vectors in one call
+    traits->to_float(quant.data(), dst.data(), N_ELEMENTS);       // reconstruct them as floats
+
+    // MSE*d = E[ ||x - x̃||² / ||x||² ] (normalized reconstruction error)
+    double total_nmse = 0.0;
+    for (int v = 0; v < N_VECTORS; v++) {
+        double err_sq = 0.0;   // squared reconstruction error of vector v
+        double norm_sq = 0.0;  // squared norm of the source vector v
+        for (int i = 0; i < HEAD_DIM; i++) {
+            double diff = (double)src[v * HEAD_DIM + i] - (double)dst[v * HEAD_DIM + i];
+            err_sq += diff * diff;
+            norm_sq += (double)src[v * HEAD_DIM + i] * (double)src[v * HEAD_DIM + i];
+        }
+        if (norm_sq > 1e-20) { // skip near-zero vectors to avoid dividing by ~0; they still count in N_VECTORS below
+            total_nmse += err_sq / norm_sq;
+        }
+    }
+    float mse_d = (float)(total_nmse / N_VECTORS);
+
+    bool pass = mse_d >= mse_d_min && mse_d <= mse_d_max;
+    printf("MSE*d=%.4f [%.3f..%.3f] %s\n", mse_d, mse_d_min, mse_d_max, pass ? "ok" : "FAILED");
+    return pass ? 0 : 1;
+}
+
+// ============================================================
+// Test 4: Bit-pack determinism and sanity
+// test_bitpack_deterministic: quantizes one cosine vector twice; passes (returns 0) if both outputs are byte-identical.
+
+static int test_bitpack_deterministic(ggml_type type) {
+    const char * name = ggml_type_name(type);
+    printf("  %s pack determinism... ", name);
+
+    const ggml_type_traits * traits = ggml_get_type_traits(type);
+
+    float src[HEAD_DIM];
+    for (int i = 0; i < HEAD_DIM; i++) {
+        src[i] = cosf((float)i * 0.31415f);  // deterministic input
+    }
+
+    size_t qsize = BLOCKS_PER_CHUNK * ggml_type_size(type);  // bytes for one HEAD_DIM vector, assuming BLOCK_SIZE elements per block
+    std::vector<uint8_t> q1(qsize);  // zero-initialized, so bytes the quantizer never writes still compare equal
+    std::vector<uint8_t> q2(qsize);
+
+    traits->from_float_ref(src, q1.data(), HEAD_DIM);
+    traits->from_float_ref(src, q2.data(), HEAD_DIM);
+
+    bool pass = memcmp(q1.data(), q2.data(), qsize) == 0;
+    printf("%s\n", pass ? "ok" : "FAILED (non-deterministic)");
+    return pass ? 0 : 1;
+}
+
+static int test_bitpack_sanity(ggml_type type) { // quantizes and dequantizes one cosine vector; returns 0 on pass, 1 on failure
+    const char * name = ggml_type_name(type);
+    printf("  %s dequantize sanity... ", name);
+
+    const ggml_type_traits * traits = ggml_get_type_traits(type);
+
+    float src[HEAD_DIM];
+    float dst[HEAD_DIM];
+    for (int i = 0; i < HEAD_DIM; i++) {
+        src[i] = cosf((float)i * 0.31415f);  // deterministic, non-constant input
+    }
+
+    size_t qsize = BLOCKS_PER_CHUNK * ggml_type_size(type);  // bytes for one HEAD_DIM vector, assuming BLOCK_SIZE elements per block
+    std::vector<uint8_t> q(qsize);
+
+    traits->from_float_ref(src, q.data(), HEAD_DIM);
+    traits->to_float(q.data(), dst, HEAD_DIM);
+
+    bool all_finite = true;
+    bool any_nonzero = false;
+    for (int i = 0; i < HEAD_DIM; i++) {
+        if (!std::isfinite(dst[i])) { all_finite = false; }   // any NaN/Inf in the output fails the test
+        if (fabsf(dst[i]) > 1e-10f) { any_nonzero = true; }   // an all-(near-)zero reconstruction fails the test
+    }
+
+    bool pass = all_finite && any_nonzero;
+    printf("finite=%s nonzero=%s %s\n",
+           all_finite ? "yes" : "NO",
+           any_nonzero ? "yes" : "NO",
+           pass ? "ok" : "FAILED");
+    return pass ? 0 : 1;
+}
+
+// ============================================================
+// Main: runs every test in order, adding each 0/1 result to n_fail, and prints a summary.
+// Exit status is 1 if any test failed and 0 otherwise.
+
+int main(void) {
+    printf("TurboQuant CPU reference tests\n");
+    printf("==============================\n\n");
+
+    int n_fail = 0;
+
+    printf("Test 1: FWHT self-inverse\n");
+    n_fail += test_fwht_self_inverse();
+
+    printf("\nTest 2: TQ3 roundtrip MSE\n");
+    n_fail += test_roundtrip_mse(GGML_TYPE_TURBO3_0, TQ3_MSE_D_MIN, TQ3_MSE_D_MAX);
+
+    printf("\nTest 3: TQ4 roundtrip MSE\n");
+    n_fail += test_roundtrip_mse(GGML_TYPE_TURBO4_0, TQ4_MSE_D_MIN, TQ4_MSE_D_MAX);
+
+    printf("\nTest 4: Bit-pack tests\n");
+    n_fail += test_bitpack_deterministic(GGML_TYPE_TURBO3_0);
+    n_fail += test_bitpack_deterministic(GGML_TYPE_TURBO4_0);
+    n_fail += test_bitpack_sanity(GGML_TYPE_TURBO3_0);
+    n_fail += test_bitpack_sanity(GGML_TYPE_TURBO4_0);
+
+    printf("\n==============================\n");
+    printf("%d/%d tests passed\n", 7 - n_fail, 7);  // 7 = number of test calls above; keep in sync when adding tests
+
+    return n_fail > 0 ? 1 : 0;
+}
@@ -483,6 +483,12 @@ static ggml_type ggml_type_from_name(const std::string & s) {
    if (s == "iq4_nl") {
        return GGML_TYPE_IQ4_NL;
    }
+    if (s == "turbo3") {
+        return GGML_TYPE_TURBO3_0;
+    }
+    if (s == "turbo4") {
+        return GGML_TYPE_TURBO4_0;
+    }

    return GGML_TYPE_COUNT;
 }