Compare commits

...

7 Commits

Author SHA1 Message Date
Xuan-Son Nguyen 9a96352729 test: fix IMROPE perf test case (#19465) 2026-02-10 14:37:50 +01:00
Alberto Cabrera Pérez c03a5a46f0 ggml-cpu: arm64: q6_K repack gemm and gemv (and generic) implementations (dotprod) (#19360)
* First working version of GEMM and GEMV

* interleave loads and compute

* Clang-format

* Added missing fallback. Removed tested TODO.

* Swap M and N to be consistent with the repack template convention
2026-02-10 10:47:45 +00:00
k4ss4n 6948adc90d ggml : use noexcept overload for is_regular_file in backend registration (#19452)
using noexcept std::filesystem::directory_entry::is_regular_file
overload prevents abnormal termination upon throwing an error
(as caused by symlinks to non-existent folders on linux)

Resolves: #18560
2026-02-10 10:57:48 +01:00
Piotr Wilkin (ilintar) 854b09f0d7 convert : move experts permutation from Qwen2MoeModel to Qwen3VLMoeTextModel (#19445)
* Add special case for Qwen3VLMoe

* Fix down path, remove arrows and checkmarks

* ws

* Moved to Qwen3VL

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-02-10 09:01:37 +01:00
Daniel Bevenius 66d403c480 tts : fix typos in README.md [no ci] (#19463) 2026-02-10 07:30:41 +01:00
Raul Torres f0bfe54f55 CANN: Remove unnecessary wrapper for gml_backend_buft_is_cann (#18968) 2026-02-10 14:19:30 +08:00
hipudding 52e38faf8c CANN: implement quantized MUL_MAT_ID for MoE models (#19228)
Implement ggml_cann_mul_mat_id_quant function to support quantized matrix
multiplication for Mixture of Experts (MoE) architectures on CANN backend.

Key features:
- Support Q4_0 and Q8_0 quantized weight formats
- Use IndexSelect to dynamically route expert-specific weights based on indices
- Leverage WeightQuantBatchMatmulV2 for efficient quantized computation
- Handle automatic F16 type conversion for hardware compatibility
- Support both per-expert and broadcast input modes

Implementation details:
- Extract expert weights and scales using CANN IndexSelect operation
- Process each batch and expert combination independently
- Create proper tensor views with correct stride for matmul operations
- Automatic input/output type casting to/from F16 as needed

Testing: All test cases passed for supported types (F32, F16, Q4_0, Q8_0).
2026-02-10 14:18:59 +08:00
10 changed files with 921 additions and 380 deletions
+43 -22
View File
@@ -4109,37 +4109,29 @@ class Qwen2MoeModel(TextModel):
# Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
mapped = f"{name}.weight" if not name.endswith(".weight") else name
# Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
# Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
# Need PyTorch: (128, 2048, 768) [reversed of GGML]
# So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
permuted = data_torch.permute(0, 2, 1).contiguous()
yield from super().modify_tensors(permuted, mapped, bid)
# HF: [n_expert, n_embd, n_ff] -> GGML: {n_ff, n_embd, n_expert}
yield from super().modify_tensors(data_torch, mapped, bid)
return
if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
if data_torch.ndim < 3 or data_torch.shape[-2] % 2 != 0:
raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
split_dim = data_torch.shape[-1] // 2
gate = data_torch[..., :split_dim].contiguous()
up = data_torch[..., split_dim:].contiguous()
# Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
# Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
# Need PyTorch: (128, 768, 2048) [reversed of GGML]
# So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
base_name = name.removesuffix(".weight")
base = base_name.rsplit('.', 1)[0]
mapped_gate = f"{base}.gate_proj.weight"
mapped_up = f"{base}.up_proj.weight"
perm_gate = gate.permute(0, 2, 1).contiguous()
perm_up = up.permute(0, 2, 1).contiguous()
yield from super().modify_tensors(perm_gate, mapped_gate, bid)
yield from super().modify_tensors(perm_up, mapped_up, bid)
# HF: [n_expert, 2*n_ff, n_embd] -> split on dim=-2
n_ff = data_torch.shape[-2] // 2
gate = data_torch[..., :n_ff, :].contiguous()
up = data_torch[..., n_ff:, :].contiguous()
# gate/up: [n_expert, n_ff, n_embd] -> GGML: {n_embd, n_ff, n_expert}
base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj")
mapped_gate = f"{base_name}.gate_proj.weight"
mapped_up = f"{base_name}.up_proj.weight"
yield from super().modify_tensors(gate, mapped_gate, bid)
yield from super().modify_tensors(up, mapped_up, bid)
return
if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
# skip visual tensors
return
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
@@ -4535,6 +4527,35 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):
if name.startswith("model.visual."):
return
# Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors
if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
name = name.replace("language_model.", "")
mapped = f"{name}.weight" if not name.endswith(".weight") else name
permuted = data_torch.permute(0, 2, 1).contiguous()
yield from ModelBase.modify_tensors(self, permuted, mapped, bid)
return
if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
name = name.replace("language_model.", "")
if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
split_dim = data_torch.shape[-1] // 2
gate = data_torch[..., :split_dim].contiguous()
up = data_torch[..., split_dim:].contiguous()
# Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
# Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
# Need PyTorch: (128, 768, 2048) [reversed of GGML]
# So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
base_name = name.removesuffix(".weight")
base = base_name.rsplit('.', 1)[0]
mapped_gate = f"{base}.gate_proj.weight"
mapped_up = f"{base}.up_proj.weight"
perm_gate = gate.permute(0, 2, 1).contiguous()
perm_up = up.permute(0, 2, 1).contiguous()
yield from ModelBase.modify_tensors(self, perm_gate, mapped_gate, bid)
yield from ModelBase.modify_tensors(self, perm_up, mapped_up, bid)
return
yield from super().modify_tensors(data_torch, name, bid)
+3 -2
View File
@@ -471,9 +471,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
int best_score = 0;
fs::path best_path;
std::error_code ec;
for (const auto & search_path : search_paths) {
if (std::error_code ec; !fs::exists(search_path, ec)) {
if (!fs::exists(search_path, ec)) {
if (ec) {
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
} else {
@@ -483,7 +484,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
}
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
if (entry.is_regular_file(ec)) {
auto filename = entry.path().filename();
auto ext = entry.path().extension();
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
+192 -99
View File
@@ -3286,130 +3286,223 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor
}
/**
* @brief Performs expert-specific matrix multiplication (MoE) with
* quantized precision using the CANN backend.
* @brief Performs quantized matrix multiplication for Mixture of Experts (MoE)
* models using the CANN backend.
*
* This function executes a matrix multiplication operation tailored for
* Mixture of Experts (MoE) models, where the input tensor is multiplied
* with expert-specific quantized weight matrices. It leverages the CANN
* backend to perform efficient low-precision computations and stores the
* quantized result in the destination tensor `dst`.
* This function implements MUL_MAT_ID operation for quantized weight matrices
* (Q4_0 and Q8_0 formats). It selects expert-specific weight matrices based on
* the provided expert indices, and computes matrix multiplication using CANN's
* WeightQuantBatchMatmulV2 operator.
*
* Quantization techniques reduce memory footprint and improve performance
* by using lower-bit representations (e.g., int8) instead of floating-point.
* This function is designed to work with such formats and may incorporate
* optimizations like identity-based fast paths or routing masks for sparse
* expert selection.
* The function performs the following steps:
* 1. Converts input/output tensors to F16 format if necessary
* 2. Uses IndexSelect to extract expert-specific weights and scales based on indices
* 3. Performs quantized matrix multiplication for each expert using WeightQuantBatchMatmulV2
* 4. Converts output back to the target type if needed
*
* @param ctx The context for executing CANN backend operations.
* @param dst The destination tensor where the quantized MoE multiplication result
* will be stored.
* Tensor shapes:
* - dst: [M, K, N, 1] - output tensor
* - src0: [D, M, A, 1] - quantized weight matrices (Q4_0 or Q8_0)
* - src1: [D, B, N, 1] - input activations (B = K for per-expert input, or B = 1 for broadcast)
* - ids: [K, N] - expert indices for routing
*
* @note This function assumes quantized data types and is designed for
* MoE architectures with potential sparse expert routing.
* @param ctx The CANN backend context for operation execution.
* @param dst The destination tensor where the multiplication result will be stored.
*
* @note Only Q4_0 and Q8_0 quantization formats are supported.
* @note The function handles automatic type conversion to/from F16 as needed by the hardware.
*/
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
// TODO: Use aclnnGroupedMatMul
//dst [M, K, N, 1]
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
ggml_tensor * ids = dst->src[2]; //ids [K, N]
// dst: [M, K, N, 1]
// src0: [D, M, A, 1] - quantized weights
// src1: [D, B, N, 1] - input activations, B = K or B = 1
// ids: [K, N] - expert indices
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
ggml_tensor * ids = dst->src[2];
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->ne[3] == 1);
GGML_ASSERT(src1->ne[3] == 1);
GGML_ASSERT(dst->ne[3] == 1);
GGML_ASSERT(src1->ne[2] == ids->ne[1]);
// copy index from npu to cpu
int64_t n_as = ne02; // A
int64_t n_ids = ids->ne[0]; // K
const int64_t n_batches = ids->ne[1];
const int64_t n_select_experts = ids->ne[0];
const enum ggml_type type = src0->type;
std::vector<char> ids_host(ggml_nbytes(ids));
ACL_CHECK(aclrtMemcpyAsync(ids_host.data(), ggml_nbytes(ids), ids->data, ggml_nbytes(ids),
ACL_MEMCPY_DEVICE_TO_HOST, ctx.stream()));
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
const int32_t group_size = QK8_0; // Both Q4_0 and Q8_0 use group size of 32
GGML_ASSERT(group_size == QK4_0);
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data;
// Calculate element size for quantized weights
const float weight_elem_size =
(type == GGML_TYPE_Q4_0) ? 0.5f :
(type == GGML_TYPE_Q8_0) ? 1.0f :
(GGML_ABORT("MUL_MAT_ID only supports Q4_0 and Q8_0"), 0.0f);
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
// Calculate scale offset in memory
const size_t weight_size = src0->ne[0] * src0->ne[1] * src0->ne[2] * weight_elem_size;
const size_t scale_elem_size = sizeof(uint16_t);
char * scale_data = (char *) src0->data + weight_size;
const enum ggml_type type = dst->src[0]->type;
float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
} else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t));
} else {
GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
}
// Allocate buffers for selected expert weights and scales
const size_t selected_weight_size = src0->ne[0] * src0->ne[1] * n_select_experts * weight_elem_size;
ggml_cann_pool_alloc selected_weight_alloc(ctx.pool(), selected_weight_size);
void * selected_weight_buffer = selected_weight_alloc.get();
// src0_row [D, M, 1, 1] weight without permute
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[0] = weight_elem_size;
src0_row.nb[1] = weight_elem_size * ne00;
src0_row.nb[2] = weight_elem_size * ne00;
src0_row.nb[3] = weight_elem_size * ne00;
size_t weight_stride = ne00 * ne01 * weight_elem_size;
size_t weight_size = weight_stride * ne02 * ne03;
const size_t selected_scale_size = (src0->ne[0] / group_size) * src0->ne[1] * n_select_experts * scale_elem_size;
ggml_cann_pool_alloc selected_scale_alloc(ctx.pool(), selected_scale_size);
void * selected_scale_buffer = selected_scale_alloc.get();
// scale [D, M, 1, 1] -> scale && permute
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
// Helper lambda to allocate and cast tensor to F16 if needed
constexpr size_t f16_elem_size = sizeof(uint16_t);
auto prepare_f16_buffer = [&](ggml_tensor * tensor, ggml_cann_pool_alloc & allocator,
bool need_cast = false) -> void * {
if (tensor->type == GGML_TYPE_F16) {
return tensor->data;
}
// src1_row [D, 1, 1, 1] -> input
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
size_t total_size = f16_elem_size;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
total_size *= tensor->ne[i];
}
void * buffer = allocator.alloc(total_size);
// dst_row [M, 1, 1, 1] -> out
dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
if (need_cast == false) {
return buffer;
}
//create weight for one row
ggml_cann_pool_alloc weight_allocator(ctx.pool());
void * weight_buffer = weight_allocator.alloc(nb02);
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
// expert index
int32_t i02 = *(int32_t *) (ids_host.data() + iid1 * ids->nb[1] + id * ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS] = { f16_elem_size };
for (int i = 0; i < GGML_MAX_DIMS; i++) {
ne[i] = tensor->ne[i];
if (i > 0) {
nb[i] = nb[i - 1] * ne[i - 1];
}
}
// If B = 1 (broadcast), always use 0; otherwise, use id.
int64_t i11 = (ne11 == 1 ? 0 : id);
int64_t i12 = iid1;
acl_tensor_ptr src_tensor = ggml_cann_create_tensor(tensor);
acl_tensor_ptr f16_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
aclnn_cast(ctx, src_tensor.get(), f16_tensor.get(), ACL_FLOAT16);
int64_t i1 = id;
int64_t i2 = i12;
return buffer;
};
void * src0_tmp_ptr = src0_original + i02 * weight_stride;
void * scale_tmp_ptr = src0_original + weight_size + i02 * scale_stride;
void * src1_tmp_ptr = src1_original + i11 * nb11 + i12 * nb12;
void * dst_tmp_ptr = dst_original + i1 * nb1 + i2 * nb2;
// Prepare input and output buffers
ggml_cann_pool_alloc input_alloc(ctx.pool());
void * input_buffer = prepare_f16_buffer(src1, input_alloc, true);
// mem cpy
ACL_CHECK(aclrtMemcpyAsync(weight_buffer, weight_stride, src0_tmp_ptr, weight_stride,
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
void * scale_buffer = (char *) weight_buffer + weight_stride;
ACL_CHECK(aclrtMemcpyAsync(scale_buffer, scale_stride, scale_tmp_ptr, scale_stride,
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
ggml_cann_pool_alloc output_alloc(ctx.pool());
void * output_buffer = prepare_f16_buffer(dst, output_alloc, false);
src0_row.data = weight_buffer;
src1_row.data = src1_tmp_ptr;
dst_row.data = dst_tmp_ptr;
dst_row.src[0] = &src0_row;
dst_row.src[1] = &src1_row;
// Process each batch
for (int64_t batch_idx = 0; batch_idx < n_batches; batch_idx++) {
// Create index tensor for current batch
const size_t index_offset = batch_idx * ids->nb[1];
acl_tensor_ptr batch_indices = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, index_offset);
ggml_cann_mul_mat(ctx, &dst_row);
// Select quantized weights using expert indices
// Q4_0 stores 2 values per byte, Q8_0 stores 1 value per byte
const int64_t weight_d = (type == GGML_TYPE_Q4_0) ? src0->ne[0] / 2 : src0->ne[0];
const int64_t weight_m = src0->ne[1];
const int64_t weight_n_experts = src0->ne[2];
int64_t weight_ne[3] = { weight_d, weight_m, weight_n_experts };
size_t weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t), weight_d * weight_m * sizeof(int8_t) };
acl_tensor_ptr all_weights =
ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, 3);
int64_t selected_weight_ne[3] = { weight_d, weight_m, n_select_experts };
size_t selected_weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t),
weight_d * weight_m * sizeof(int8_t) };
acl_tensor_ptr selected_weights = ggml_cann_create_tensor(selected_weight_buffer, ACL_INT8, sizeof(int8_t),
selected_weight_ne, selected_weight_nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_weights.get(), 0, batch_indices.get(), selected_weights.get());
// Select scales using the same expert indices
const int64_t scale_d = src0->ne[0] / group_size;
int64_t scale_ne[3] = { scale_d, weight_m, weight_n_experts };
size_t scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size, scale_d * weight_m * scale_elem_size };
acl_tensor_ptr all_scales =
ggml_cann_create_tensor(scale_data, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 3);
int64_t selected_scale_ne[3] = { scale_d, weight_m, n_select_experts };
size_t selected_scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size,
scale_d * weight_m * scale_elem_size };
acl_tensor_ptr selected_scales = ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size,
selected_scale_ne, selected_scale_nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_scales.get(), 0, batch_indices.get(), selected_scales.get());
// Process each expert for current batch
// IndexSelect output layout: [D, M, K] in contiguous format
// WeightQuantBatchMatmulV2 expects: [M, D] with row-major stride
for (int64_t expert_idx = 0; expert_idx < n_select_experts; expert_idx++) {
// Determine input offset: broadcast if src1->ne[1]==1, otherwise use per-expert input
const size_t input_offset =
(batch_idx * src1->ne[1] + (src1->ne[1] == 1 ? 0 : expert_idx)) * src1->ne[0] * f16_elem_size;
const size_t output_offset = (batch_idx * dst->ne[1] + expert_idx) * dst->ne[0] * f16_elem_size;
// Create weight view for current expert: [D, M, K] -> [M, D]
int64_t weight_view_ne[2] = { weight_m, src0->ne[0] };
float weight_view_nb[2] = { src0->ne[0] * weight_elem_size, weight_elem_size };
const size_t weight_view_offset = expert_idx * selected_weight_nb[2];
acl_tensor_ptr weight_view =
ggml_cann_create_tensor(selected_weight_buffer, ggml_cann_type_mapping(type), weight_elem_size,
weight_view_ne, weight_view_nb, 2, ACL_FORMAT_ND, weight_view_offset);
// Create scale view for current expert: [D, M, K] -> [M, D]
int64_t scale_view_ne[2] = { weight_m, scale_d };
size_t scale_view_nb[2] = { selected_scale_nb[1], selected_scale_nb[0] };
const size_t scale_view_offset = expert_idx * selected_scale_nb[2];
acl_tensor_ptr scale_view =
ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size, scale_view_ne,
scale_view_nb, 2, ACL_FORMAT_ND, scale_view_offset);
// Create input activation tensor [D, 1]
int64_t input_ne[2] = { src1->ne[0], 1 };
size_t input_nb[2] = { f16_elem_size, src1->ne[0] * f16_elem_size };
acl_tensor_ptr input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, f16_elem_size, input_ne,
input_nb, 2, ACL_FORMAT_ND, input_offset);
// Create output tensor [M, 1]
int64_t output_ne[2] = { dst->ne[0], 1 };
size_t output_nb[2] = { f16_elem_size, dst->ne[0] * f16_elem_size };
acl_tensor_ptr output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, output_ne,
output_nb, 2, ACL_FORMAT_ND, output_offset);
// Perform quantized matrix multiplication
GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, input_tensor.get(), weight_view.get(),
scale_view.get(), nullptr, nullptr, nullptr, nullptr, group_size,
output_tensor.get());
}
}
return;
// Cast output back to original type if we used a temporary F16 buffer
if (dst->type != GGML_TYPE_F16) {
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS] = { f16_elem_size };
for (int i = 0; i < GGML_MAX_DIMS; i++) {
ne[i] = dst->ne[i];
if (i > 0) {
nb[i] = nb[i - 1] * ne[i - 1];
}
}
acl_tensor_ptr f16_output =
ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
acl_tensor_ptr dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, f16_output.get(), dst_tensor.get(), ggml_cann_type_mapping(dst->type));
}
}
void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+37 -52
View File
@@ -794,19 +794,44 @@ struct ggml_backend_cann_buffer_context {
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
};
// cann buffer type
/**
* @brief Check if a buffer is a CANN buffer.
*
* This function checks if a given buffer is a CANN buffer by comparing its
* `get_name` function pointer to `ggml_backend_cann_buffer_get_name`.
*
* @param buffer The buffer to check.
* @return true if the buffer is a CANN buffer, false otherwise.
* @brief Structure representing context information for a specific backend
* buffer type.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
struct ggml_backend_cann_buffer_type_context {
int32_t device; /**< Device identifier associated with the buffer context. */
std::string name; /**< Name associated with the buffer context. */
};
static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_cann(buffer->buft);
/**
* @brief Retrieves the name associated with a CANN buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN buffer type context.
*
* @param buft Pointer to the buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
return buft_ctx->name.c_str();
}
/**
* @brief Checks if the backend buffer type is associated with the CANN backend.
*
* This function checks whether the provided backend buffer type is associated
* with the CANN backend based on the comparison of its name retrieval function
* pointer.
*
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the buffer type is associated with the CANN
* backend, otherwise false.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
}
/**
@@ -1271,7 +1296,7 @@ static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor * src,
ggml_tensor * dst) {
if (ggml_backend_buffer_is_cann(src->buffer)) {
if (ggml_backend_buft_is_cann(src->buffer->buft)) {
ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
@@ -1335,31 +1360,6 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .reset = */ NULL,
};
// cann buffer type
/**
* @brief Structure representing context information for a specific backend
* buffer type.
*/
struct ggml_backend_cann_buffer_type_context {
int32_t device; /**< Device identifier associated with the buffer context. */
std::string name; /**< Name associated with the buffer context. */
};
/**
* @brief Retrieves the name associated with a CANN buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN buffer type context.
*
* @param buft Pointer to the buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
return buft_ctx->name.c_str();
}
/**
* @brief Allocates a new CANN buffer of the specified type and size.
*
@@ -1997,7 +1997,7 @@ static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
if (!ggml_backend_buft_is_cann(src->buffer->buft) || !ggml_backend_buft_is_cann(dst->buffer->buft)) {
return false;
}
@@ -2523,21 +2523,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
GGML_UNUSED(dev);
}
/**
* @brief Checks if the backend buffer type is associated with the CANN backend.
*
* This function checks whether the provided backend buffer type is associated
* with the CANN backend based on the comparison of its name retrieval function
* pointer.
*
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the buffer type is associated with the CANN
* backend, otherwise false.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
}
/**
* @brief Records an event on the CANN backend stream.
*
+15 -1
View File
@@ -43,6 +43,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
@@ -55,7 +56,8 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
# define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
@@ -76,6 +78,7 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
@@ -84,6 +87,7 @@
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
@@ -107,6 +111,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
@@ -119,6 +124,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
@@ -143,6 +149,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
@@ -155,6 +162,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
@@ -186,6 +194,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
@@ -197,6 +206,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
@@ -227,6 +237,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
@@ -239,6 +250,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
@@ -271,6 +283,7 @@
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
@@ -283,6 +296,7 @@
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+395 -5
View File
@@ -1072,6 +1072,195 @@ void ggml_gemv_q5_K_8x8_q8_K(int n,
ggml_gemv_q5_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q6_K_8x4_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 4;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
constexpr int col_groups = ncols_interleaved / 4;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
const uint8x16_t mask_lo = vdupq_n_u8(0x03);
const uint8x16_t mask_hi = vdupq_n_u8(0x30);
// 1x8 tile = 2 x 4
float32x4_t acc_f32[2];
const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * GGML_RESTRICT q6_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int i = 0; i < col_groups; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
float32x4_t q6_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q6_ptr[b].d)); // d0 d1 d2 d3
float32x4_t q6_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q6_ptr[b].d + 4)); // d4 d5 d6 d7
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
float32x4_t sb_scale_0 = vmulq_f32(q6_d_0, q8_d);
float32x4_t sb_scale_1 = vmulq_f32(q6_d_1, q8_d);
int32x4_t acc[col_groups];
for (int i = 0; i < col_groups; i++) {
acc[i] = vdupq_n_s32(0);
}
// Load all 16 scales once and widen to int16 (Q6_K has 16 scales per block)
// Reused for bias and dequantization later
int16_t q6_scales[16 * 8];
for (int i = 0; i < 16; i++) {
int16x8_t scales = vmovl_s8(vld1_s8(q6_ptr[b].scales + i * 8));
vst1q_s16(q6_scales + i * 8, scales);
}
// Compute bias per column using q8 bsums and preloaded scales to skip the -32 shift
int32x4_t bias_lo = vdupq_n_s32(0);
int32x4_t bias_hi = vdupq_n_s32(0);
// Load bsums in chunks of 4 to process with vectorized operations
for (int i = 0; i < 16; i += 4) {
int16x4_t bsums_vec = vld1_s16(q8_ptr[b].bsums + i);
int16x4_t scales_lo_0 = vld1_s16(q6_scales + (i + 0) * 8);
int16x4_t scales_hi_0 = vld1_s16(q6_scales + (i + 0) * 8 + 4);
int16x4_t scales_lo_1 = vld1_s16(q6_scales + (i + 1) * 8);
int16x4_t scales_hi_1 = vld1_s16(q6_scales + (i + 1) * 8 + 4);
int16x4_t scales_lo_2 = vld1_s16(q6_scales + (i + 2) * 8);
int16x4_t scales_hi_2 = vld1_s16(q6_scales + (i + 2) * 8 + 4);
int16x4_t scales_lo_3 = vld1_s16(q6_scales + (i + 3) * 8);
int16x4_t scales_hi_3 = vld1_s16(q6_scales + (i + 3) * 8 + 4);
bias_lo = vmlal_lane_s16(bias_lo, scales_lo_0, bsums_vec, 0);
bias_hi = vmlal_lane_s16(bias_hi, scales_hi_0, bsums_vec, 0);
bias_lo = vmlal_lane_s16(bias_lo, scales_lo_1, bsums_vec, 1);
bias_hi = vmlal_lane_s16(bias_hi, scales_hi_1, bsums_vec, 1);
bias_lo = vmlal_lane_s16(bias_lo, scales_lo_2, bsums_vec, 2);
bias_hi = vmlal_lane_s16(bias_hi, scales_hi_2, bsums_vec, 2);
bias_lo = vmlal_lane_s16(bias_lo, scales_lo_3, bsums_vec, 3);
bias_hi = vmlal_lane_s16(bias_hi, scales_hi_3, bsums_vec, 3);
}
bias_lo = vshlq_n_s32(bias_lo, 5);
bias_hi = vshlq_n_s32(bias_hi, 5);
// Process two 128-value halves per superblock
for (int half = 0; half < 2; half++) {
const uint8_t * ql_base = q6_ptr[b].ql + half * 512;
const uint8_t * qh_base = q6_ptr[b].qh + half * 256;
// A subblock (sb) is a set of weights that share the scale
// Since q6_K scales are per 16 elements
// num sbs -> 256 elements / (16 elements/scale * 2 elements/byte * 2 halves)
for (int sb = 0; sb < QK_K / 64; sb++) {
const int8_t * q8_base_l = q8_ptr[b].qs + half * 128 + sb * 16;
const int8_t * q8_base_h = q8_base_l + 64;
// Load and duplicate q8 values (each register covers four interleaved columns of q6)
int8x16_t q8_l[4];
int8x16_t q8_h[4];
for (int i = 0; i < 4; i++) {
q8_l[i] = (int8x16_t) vld1q_dup_s32((const int32_t *) (q8_base_l + i * 4));
q8_h[i] = (int8x16_t) vld1q_dup_s32((const int32_t *) (q8_base_h + i * 4));
}
const int ql_off_base = sb * QK_K / 2;
const int qh_off_base = ql_off_base & 255; // wraps after 256 bytes
// Load 4 vectors at once (64 bytes each for ql_0, ql_1, qh_0, qh_1)
uint8x16x4_t q6_ql_0 = vld1q_u8_x4(ql_base + ql_off_base);
uint8x16x4_t q6_ql_1 = vld1q_u8_x4(ql_base + ql_off_base + 64);
uint8x16x4_t q6_qh_0 = vld1q_u8_x4(qh_base + qh_off_base);
uint8x16x4_t q6_qh_1 = vld1q_u8_x4(qh_base + qh_off_base + 64);
// Adjust qh for subblocks 2 and 3 (shift right by 2)
if (sb > 1) {
q6_qh_0.val[0] = vshrq_n_u8(q6_qh_0.val[0], 2);
q6_qh_0.val[1] = vshrq_n_u8(q6_qh_0.val[1], 2);
q6_qh_0.val[2] = vshrq_n_u8(q6_qh_0.val[2], 2);
q6_qh_0.val[3] = vshrq_n_u8(q6_qh_0.val[3], 2);
q6_qh_1.val[0] = vshrq_n_u8(q6_qh_1.val[0], 2);
q6_qh_1.val[1] = vshrq_n_u8(q6_qh_1.val[1], 2);
q6_qh_1.val[2] = vshrq_n_u8(q6_qh_1.val[2], 2);
q6_qh_1.val[3] = vshrq_n_u8(q6_qh_1.val[3], 2);
}
const uint8x16_t q6_ql[8] = { q6_ql_0.val[0], q6_ql_0.val[1], q6_ql_0.val[2], q6_ql_0.val[3],
q6_ql_1.val[0], q6_ql_1.val[1], q6_ql_1.val[2], q6_ql_1.val[3] };
const uint8x16_t q6_qh[8] = { q6_qh_0.val[0], q6_qh_0.val[1], q6_qh_0.val[2], q6_qh_0.val[3],
q6_qh_1.val[0], q6_qh_1.val[1], q6_qh_1.val[2], q6_qh_1.val[3] };
// Process column groups (0-3, 4-7)
for (int g = 0; g < col_groups; g++) {
int32x4_t sb_acc_l = vdupq_n_s32(0);
int32x4_t sb_acc_h = vdupq_n_s32(0);
for (int chunk = 0; chunk < 4; chunk++) {
const int idx = chunk * 2 + g;
const uint8x16_t q6_qs_l = q6_ql[idx];
const uint8x16_t q6_qs_h = q6_qh[idx];
// Extract high 2 bits for upper nibble reconstruction
const uint8x16_t q6_qs_hh = vandq_u8(q6_qs_h, mask_hi);
// q6 = (low4 | high2<<4), without -32 bias (handled via bsums)
const int8x16_t q6_l =
vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q6_qs_l, m4b), vandq_u8(q6_qs_h, mask_lo), 4));
const int8x16_t q6_h = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6_qs_l, 4), q6_qs_hh));
sb_acc_l = vdotq_s32(sb_acc_l, q6_l, q8_l[chunk]);
sb_acc_h = vdotq_s32(sb_acc_h, q6_h, q8_h[chunk]);
}
const int scale_idx_l = half * 8 + sb;
const int scale_idx_h = half * 8 + sb + 4;
const int32x4_t scale_vec_l = vmovl_s16(vld1_s16(q6_scales + scale_idx_l * 8 + g * 4));
const int32x4_t scale_vec_h = vmovl_s16(vld1_s16(q6_scales + scale_idx_h * 8 + g * 4));
acc[g] = vmlaq_s32(acc[g], sb_acc_l, scale_vec_l);
acc[g] = vmlaq_s32(acc[g], sb_acc_h, scale_vec_h);
}
}
} // for half
// Bias correction
acc[0] = vsubq_s32(acc[0], bias_lo);
acc[1] = vsubq_s32(acc[1], bias_hi);
// Apply superblock scale (no mins for q6_K)
// acc[g] has [c0, c1, c2, c3]
float32x4_t w_0123 = vmulq_f32(vcvtq_f32_s32(acc[0]), sb_scale_0);
float32x4_t w_4567 = vmulq_f32(vcvtq_f32_s32(acc[1]), sb_scale_1);
acc_f32[0] = vaddq_f32(acc_f32[0], w_0123);
acc_f32[1] = vaddq_f32(acc_f32[1], w_4567);
} // for b
int base = x * ncols_interleaved;
vst1q_f32(s + base, acc_f32[0]);
vst1q_f32(s + base + 4, acc_f32[1]);
} // for x
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemv_q6_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q6_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
@@ -1177,15 +1366,14 @@ void ggml_gemv_q6_K_8x8_q8_K(int n,
q8_h[i] = (int8x16_t) vld1q_dup_s64((const int64_t *) (q8_base_h + i * 8));
}
// TODO: Test other qh repack patterns to reduce loads
const int ql_off_base = sb * QK_K / 2;
const int qh_off_base = ql_off_base & 255; // wraps after 256 bytes
// Load 4 vectors at once (64 bytes each for ql_0, ql_1, qh_0, qh_1)
ggml_uint8x16x4_t q6_ql_0 = ggml_vld1q_u8_x4(ql_base + ql_off_base);
ggml_uint8x16x4_t q6_ql_1 = ggml_vld1q_u8_x4(ql_base + ql_off_base + 64);
ggml_uint8x16x4_t q6_qh_0 = ggml_vld1q_u8_x4(qh_base + qh_off_base);
ggml_uint8x16x4_t q6_qh_1 = ggml_vld1q_u8_x4(qh_base + qh_off_base + 64);
uint8x16x4_t q6_ql_0 = vld1q_u8_x4(ql_base + ql_off_base);
uint8x16x4_t q6_ql_1 = vld1q_u8_x4(ql_base + ql_off_base + 64);
uint8x16x4_t q6_qh_0 = vld1q_u8_x4(qh_base + qh_off_base);
uint8x16x4_t q6_qh_1 = vld1q_u8_x4(qh_base + qh_off_base + 64);
// Adjust qh for subblocks 2 and 3 (shift right by 2)
if (sb > 1) {
@@ -3474,6 +3662,208 @@ void ggml_gemm_q5_K_8x8_q8_K(int n,
ggml_gemm_q5_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q6_K_8x4_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 4;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
constexpr int q8_k_blocklen = 4;
constexpr int col_groups = ncols_interleaved / 4;
constexpr int acc_size = q8_k_blocklen * col_groups; // 4 rows, 2 column groups
const uint8x16_t m4b = vdupq_n_u8(0x0f);
const uint8x16_t mask_lo = vdupq_n_u8(0x03);
const uint8x16_t mask_hi = vdupq_n_u8(0x30);
const int8x16_t m32s = vdupq_n_s8(32);
float32x4_t acc_f32[acc_size];
for (int y = 0; y < nr / q8_k_blocklen; y++) {
const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * GGML_RESTRICT q6_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int i = 0; i < acc_size; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
float32x4_t q6_d_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q6_ptr[b].d));
float32x4_t q6_d_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q6_ptr[b].d + 4));
float32x4_t q8_d_0123 = vld1q_f32(q8_ptr[b].d);
float32x4_t sbd_scale_0123[q8_k_blocklen];
float32x4_t sbd_scale_4567[q8_k_blocklen];
sbd_scale_0123[0] = vmulq_laneq_f32(q6_d_0123, q8_d_0123, 0);
sbd_scale_4567[0] = vmulq_laneq_f32(q6_d_4567, q8_d_0123, 0);
sbd_scale_0123[1] = vmulq_laneq_f32(q6_d_0123, q8_d_0123, 1);
sbd_scale_4567[1] = vmulq_laneq_f32(q6_d_4567, q8_d_0123, 1);
sbd_scale_0123[2] = vmulq_laneq_f32(q6_d_0123, q8_d_0123, 2);
sbd_scale_4567[2] = vmulq_laneq_f32(q6_d_4567, q8_d_0123, 2);
sbd_scale_0123[3] = vmulq_laneq_f32(q6_d_0123, q8_d_0123, 3);
sbd_scale_4567[3] = vmulq_laneq_f32(q6_d_4567, q8_d_0123, 3);
int32x4_t acc_s32[acc_size];
for (int i = 0; i < acc_size; i++) {
acc_s32[i] = vdupq_n_s32(0);
}
int16_t q6_scales[8 * 16];
for (int i = 0; i < 16; i++) {
int16x8_t scales = vmovl_s8(vld1_s8(q6_ptr[b].scales + i * 8));
vst1q_s16(q6_scales + i * 8, scales);
}
for (int half = 0; half < 2; half++) {
const uint8_t * ql_base = q6_ptr[b].ql + half * 512;
const uint8_t * qh_base = q6_ptr[b].qh + half * 256;
for (int sb = 0; sb < QK_K / 64; sb++) {
int32x4_t acc_lo[acc_size];
int32x4_t acc_hi[acc_size];
for (int i = 0; i < acc_size; i++) {
acc_lo[i] = vdupq_n_s32(0);
acc_hi[i] = vdupq_n_s32(0);
}
const int8_t * q8_base_l = q8_ptr[b].qs + half * 512 + sb * 64;
const int8_t * q8_base_h = q8_ptr[b].qs + half * 512 + 256 + sb * 64;
// 4 rows * 16 elements per scale
// 4 reads of 16 bytes each
constexpr int reads_per_sb = 4;
int8x16_t q8_l[reads_per_sb];
int8x16_t q8_h[reads_per_sb];
for (int k = 0; k < reads_per_sb; k++) {
q8_l[k] = vld1q_s8(q8_base_l + 16 * k);
q8_h[k] = vld1q_s8(q8_base_h + 16 * k);
}
const int ql_off_base = sb * QK_K / 2;
const int qh_off_base = ql_off_base & 255;
uint8x16_t q6_ql_0123[reads_per_sb];
uint8x16_t q6_ql_4567[reads_per_sb];
uint8x16_t q6_qh_0123[reads_per_sb];
uint8x16_t q6_qh_4567[reads_per_sb];
for (int k = 0; k < reads_per_sb; k++) {
q6_ql_0123[k] = vld1q_u8(ql_base + ql_off_base + k * 32);
q6_ql_4567[k] = vld1q_u8(ql_base + ql_off_base + k * 32 + 16);
q6_qh_0123[k] = vld1q_u8(qh_base + qh_off_base + k * 32);
q6_qh_4567[k] = vld1q_u8(qh_base + qh_off_base + k * 32 + 16);
}
if (sb > 1) {
for (int k = 0; k < reads_per_sb; k++) {
q6_qh_0123[k] = vshrq_n_u8(q6_qh_0123[k], 2);
q6_qh_4567[k] = vshrq_n_u8(q6_qh_4567[k], 2);
}
}
for (int k = 0; k < reads_per_sb; k++) {
// q = (ql | qh) - 32
const uint8x16_t hbit_lo_0123 = vandq_u8(q6_qh_0123[k], mask_lo);
const uint8x16_t hbit_hi_0123 = vandq_u8(q6_qh_0123[k], mask_hi);
const uint8x16_t hbit_lo_4567 = vandq_u8(q6_qh_4567[k], mask_lo);
const uint8x16_t hbit_hi_4567 = vandq_u8(q6_qh_4567[k], mask_hi);
const int8x16_t q6_0123_lo = vsubq_s8(
vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q6_ql_0123[k], m4b), hbit_lo_0123, 4)), m32s);
const int8x16_t q6_0123_hi = vsubq_s8(
vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6_ql_0123[k], 4), hbit_hi_0123)), m32s);
acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q6_0123_lo, q8_l[k], 0); // 0..3 r0 c0123
acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q6_0123_lo, q8_l[k], 1); // 0..3 r1 c0123
acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q6_0123_lo, q8_l[k], 2); // 0..3 r2 c0123
acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q6_0123_lo, q8_l[k], 3); // 0..3 r3 c0123
acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q6_0123_hi, q8_h[k], 0); // 64..67 r0 c0123
acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q6_0123_hi, q8_h[k], 1); // 64..67 r1 c0123
acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q6_0123_hi, q8_h[k], 2); // 64..67 r2 c0123
acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q6_0123_hi, q8_h[k], 3); // 64..67 r3 c0123
const int8x16_t q6_4567_lo = vsubq_s8(
vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q6_ql_4567[k], m4b), hbit_lo_4567, 4)), m32s);
const int8x16_t q6_4567_hi = vsubq_s8(
vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6_ql_4567[k], 4), hbit_hi_4567)), m32s);
acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q6_4567_lo, q8_l[k], 0); // 0..3 r0 c4567
acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q6_4567_lo, q8_l[k], 1); // 0..3 r1 c4567
acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q6_4567_lo, q8_l[k], 2); // 0..3 r2 c4567
acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q6_4567_lo, q8_l[k], 3); // 0..3 r3 c4567
acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q6_4567_hi, q8_h[k], 0); // 64..67 r0 c4567
acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q6_4567_hi, q8_h[k], 1); // 64..67 r1 c4567
acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q6_4567_hi, q8_h[k], 2); // 64..67 r2 c4567
acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q6_4567_hi, q8_h[k], 3); // 64..67 r3 c4567
}
// Scale and bias
const int scale_idx_l = half * 8 + sb;
const int scale_idx_h = half * 8 + sb + 4;
for (int g = 0; g < col_groups; g++) {
const int16x4_t scales_l16 = vld1_s16(q6_scales + scale_idx_l * 8 + g * 4);
const int16x4_t scales_h16 = vld1_s16(q6_scales + scale_idx_h * 8 + g * 4);
const int32x4_t scale_vec_l = vmovl_s16(scales_l16);
const int32x4_t scale_vec_h = vmovl_s16(scales_h16);
const int acc_offset = g * q8_k_blocklen;
for (int row = 0; row < q8_k_blocklen; row++) {
const int idx = row * 2 + g;
acc_s32[idx] = vmlaq_s32(acc_s32[idx], acc_lo[acc_offset + row], scale_vec_l);
acc_s32[idx] = vmlaq_s32(acc_s32[idx], acc_hi[acc_offset + row], scale_vec_h);
}
}
}
}
// Finally we apply the superblock scales
for (int row = 0; row < q8_k_blocklen; row++) {
const int idx0 = 2 * row;
const int idx1 = 2 * row + 1;
const int32x4_t acc_0123 = acc_s32[idx0];
const int32x4_t acc_4567 = acc_s32[idx1];
acc_f32[idx0] = vmlaq_f32(acc_f32[idx0], vcvtq_f32_s32(acc_0123), sbd_scale_0123[row]);
acc_f32[idx1] = vmlaq_f32(acc_f32[idx1], vcvtq_f32_s32(acc_4567), sbd_scale_4567[row]);
}
} // for b
for (int i = 0; i < q8_k_blocklen; i++) {
int row = y * q8_k_blocklen + i;
for (int j = 0; j < 2; j++) {
int col = x * ncols_interleaved + j * 4;
int offset = row * bs + col;
vst1q_f32(s + offset, acc_f32[2 * i + j]);
}
}
} // for x
} // for y
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemm_q6_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q6_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
+229 -196
View File
@@ -256,6 +256,200 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTR
ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
}
template <int M, int N>
static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int blocklen = M;
constexpr int ncols_interleaved = N;
const int qk = QK_K;
const int nb = n / qk;
const int blocks_per_half = 64 / blocklen;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[8];
const block_q8_K * a_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0f;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
const int base_h = base_l + 64;
const int scale_idx_l = base_l / 16;
const int scale_idx_h = base_h / 16;
const int qh_shift_l = ((base_l % 128) / 32) * 2;
const int qh_shift_h = ((base_h % 128) / 32) * 2;
const int qh_half_l = (base_l / 128) * 32;
const int qh_half_h = (base_h / 128) * 32;
for (int j = 0; j < ncols_interleaved; j++) {
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
int sumi_l = 0;
int sumi_h = 0;
for (int i = 0; i < blocklen; i++) {
const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
const int qh_chunk_l = qh_idx_l / blocklen;
const int qh_pos_l = qh_idx_l % blocklen;
const int qh_offset_l = qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
const int qh_chunk_h = qh_idx_h / blocklen;
const int qh_pos_h = qh_idx_h % blocklen;
const int qh_offset_h = qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
const int q_l = ((hi_2_l << 4) | l_4) - 32;
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
const int8_t a_l = a_ptr[l].qs[base_l + i];
const int8_t a_h = a_ptr[l].qs[base_h + i];
sumi_l += q_l * a_l;
sumi_h += q_h * a_h;
}
sumf[j] +=
(sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j];
}
}
}
template <int M, int N>
static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int blocklen = M;
constexpr int ncols_interleaved = N;
const int qk = QK_K;
const int nb = n / qk;
const int blocks_per_half = 64 / blocklen;
const int q8_half_stride = 512;
const int q8_low_high_step = 256;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
float sumf[4][8];
for (int y = 0; y < nr / 4; y++) {
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0f;
}
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
const int base_h = base_l + 64;
const int scale_idx_l = base_l / 16;
const int scale_idx_h = base_h / 16;
const int qh_shift_l = ((base_l % 128) / 32) * 2;
const int qh_shift_h = ((base_h % 128) / 32) * 2;
const int qh_half_l = (base_l / 128) * 32;
const int qh_half_h = (base_h / 128) * 32;
const int q8_base = (k / blocks_per_half) * q8_half_stride + (k % blocks_per_half) * (blocklen * 4);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
int sumi_l = 0;
int sumi_h = 0;
for (int i = 0; i < blocklen; i++) {
const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
const int qh_chunk_l = qh_idx_l / blocklen;
const int qh_pos_l = qh_idx_l % blocklen;
const int qh_offset_l =
qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
const int qh_chunk_h = qh_idx_h / blocklen;
const int qh_pos_h = qh_idx_h % blocklen;
const int qh_offset_h =
qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
const int q_l = ((hi_2_l << 4) | l_4) - 32;
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
const int8_t q8_l = a_ptr[l].qs[q8_base + m * blocklen + i];
const int8_t q8_h = a_ptr[l].qs[q8_base + m * blocklen + i + q8_low_high_step];
sumi_l += q_l * q8_l;
sumi_h += q_h * q8_h;
}
sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
a_ptr[l].d[m];
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
}
}
}
}
}
extern "C" {
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -704,94 +898,12 @@ void ggml_gemv_q5_K_8x8_q8_K_generic(int n,
}
void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
ggml_gemv_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[8];
const block_q8_K * a_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0f;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < 16; k++) {
// k = 0.. 7 weights 0-63 low, 64-127 high
// k = 8..15 weights 128-191 low, 192-255 high
const int base_l = (k / 8) * 128 + (k % 8) * 8;
const int base_h = base_l + 64;
const int scale_idx_l = base_l / 16;
const int scale_idx_h = base_h / 16;
// Bit shift cycles 0,2,4,6 for each 32-value group within a 128-value half
const int qh_shift_l = ((base_l % 128) / 32) * 2;
const int qh_shift_h = ((base_h % 128) / 32) * 2;
// qh_half: offset to the correct 32-byte half (0 or 32)
const int qh_half_l = (base_l / 128) * 32;
const int qh_half_h = (base_h / 128) * 32;
for (int j = 0; j < ncols_interleaved; j++) {
// Interleaved scales
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * 8 + j];
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * 8 + j];
int sumi_l = 0;
int sumi_h = 0;
for (int i = 0; i < blocklen; i++) {
const int ql_pos = k * 64 + j * 8 + i;
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
// qh indexing with 8-byte interleaving (like q5_K)
const int qh_byte_l = qh_half_l + ((base_l + i) % 32);
const int qh_chunk_l = qh_byte_l / 8;
const int qh_pos_l = qh_byte_l % 8;
const int qh_offset_l = qh_chunk_l * 64 + j * 8 + qh_pos_l;
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
const int qh_byte_h = qh_half_h + ((base_h + i) % 32);
const int qh_chunk_h = qh_byte_h / 8;
const int qh_pos_h = qh_byte_h % 8;
const int qh_offset_h = qh_chunk_h * 64 + j * 8 + qh_pos_h;
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
const int q_l = ((hi_2_l << 4) | l_4) - 32;
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
const int8_t a_l = a_ptr[l].qs[base_l + i];
const int8_t a_h = a_ptr[l].qs[base_h + i];
sumi_l += q_l * a_l;
sumi_h += q_h * a_h;
}
sumf[j] +=
(sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j];
}
}
ggml_gemv_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1485,109 +1597,12 @@ void ggml_gemm_q5_K_8x8_q8_K_generic(int n,
}
}
void ggml_gemm_q6_K_8x8_q8_K_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
ggml_gemm_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
}
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
float sumf[4][8];
for (int y = 0; y < nr / 4; y++) {
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0f;
}
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < 16; k++) {
// k = 0.. 7 weights 0-63 low, 64-127 high
// k = 8..15 weights 128-191 low, 192-255 high
const int base_l = (k / 8) * 128 + (k % 8) * 8;
const int base_h = base_l + 64;
const int scale_idx_l = base_l / 16;
const int scale_idx_h = base_h / 16;
// Bit shift cycles 0,2,4,6 for each 32-value group within a 128-value half
const int qh_shift_l = ((base_l % 128) / 32) * 2;
const int qh_shift_h = ((base_h % 128) / 32) * 2;
// qh_half: offset to the correct 32-byte half (0 or 32)
const int qh_half_l = (base_l / 128) * 32;
const int qh_half_h = (base_h / 128) * 32;
// Activation base indices for q8_Kx4 interleaved format
// Layout: 128-value halves (k/8), then 8-value sub-blocks (k%8) with stride 32
const int q8_base = (k / 8) * 512 + (k % 8) * 32;
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
// Interleaved scales
const int8_t scale_l = b_ptr[l].scales[scale_idx_l * 8 + j];
const int8_t scale_h = b_ptr[l].scales[scale_idx_h * 8 + j];
int sumi_l = 0;
int sumi_h = 0;
for (int i = 0; i < blocklen; i++) {
const int ql_pos = k * 64 + j * 8 + i;
const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
const int qh_chunk_l = qh_idx_l / 8;
const int qh_pos_l = qh_idx_l % 8;
const int qh_offset_l = qh_chunk_l * 64 + j * 8 + qh_pos_l;
const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
const int qh_chunk_h = qh_idx_h / 8;
const int qh_pos_h = qh_idx_h % 8;
const int qh_offset_h = qh_chunk_h * 64 + j * 8 + qh_pos_h;
const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
const int q_l = ((hi_2_l << 4) | l_4) - 32;
const int q_h = ((hi_2_h << 4) | hi_4) - 32;
const int8_t q8_l = a_ptr[l].qs[q8_base + m * 8 + i];
const int8_t q8_h = a_ptr[l].qs[q8_base + m * 8 + i + 256];
sumi_l += q_l * q8_l;
sumi_h += q_h * q8_h;
}
sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
a_ptr[l].d[m];
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
}
}
}
}
void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
ggml_gemm_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -2097,18 +2112,18 @@ static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_in
}
const int end_ls = QK_K * 4 / blck_size_interleave;
// Interleave Q6_K quants by taking 8 bytes at a time
// Interleave Q6_K quants by taking blck_size_interleave bytes at a time
for (int i = 0; i < end_ls; ++i) {
int src_id = i % n_blocks;
int src_offset = (i / n_blocks) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elem_ls;
memcpy(&elem_ls, &in[src_id].ql[src_offset], sizeof(uint64_t));
memcpy(&out.ql[dst_offset], &elem_ls, sizeof(uint64_t));
memcpy(&elem_ls, &in[src_id].ql[src_offset], blck_size_interleave);
memcpy(&out.ql[dst_offset], &elem_ls, blck_size_interleave);
}
// Interleave high bits using same 8-byte pattern as low bits
// Interleave high bits using same chunk size as low bits
const int end_hs = end_ls / 2;
for (int i = 0; i < end_hs; ++i) {
int src_id = i % n_blocks;
@@ -2116,8 +2131,8 @@ static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_in
int dst_offset = i * blck_size_interleave;
uint64_t elem_hs;
memcpy(&elem_hs, &in[src_id].qh[src_offset], sizeof(uint64_t));
memcpy(&out.qh[dst_offset], &elem_hs, sizeof(uint64_t));
memcpy(&elem_hs, &in[src_id].qh[src_offset], blck_size_interleave);
memcpy(&out.qh[dst_offset], &elem_hs, blck_size_interleave);
}
// The below logic is designed so as to unpack and rearrange scales in Q6_K
@@ -2262,7 +2277,7 @@ static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q6_K);
GGML_ASSERT(interleave_block == 8);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
constexpr int nrows_interleaved = 8;
block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data;
@@ -2511,6 +2526,10 @@ template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * da
return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
}
template <> int repack<block_q6_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q6_K_to_q6_K_8_bl(t, 4, data, data_size);
}
template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size);
}
@@ -2575,6 +2594,10 @@ template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
@@ -2634,6 +2657,10 @@ template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
@@ -3043,6 +3070,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
// instance for Q6_K
static const ggml::cpu::repack::tensor_traits<block_q6_K, 4, 8, GGML_TYPE_Q8_K> q6_K_8x4_q8_K;
static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K;
// instance for Q2
@@ -3107,6 +3135,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &q6_K_8x8_q8_K;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
if (cur->ne[1] % 8 == 0) {
return &q6_K_8x4_q8_K;
}
}
} else if (cur->type == GGML_TYPE_IQ4_NL) {
if (ggml_cpu_has_avx2()) {
if (cur->ne[1] % 8 == 0) {
+4
View File
@@ -112,6 +112,7 @@ void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -122,6 +123,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -142,6 +144,7 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -152,6 +155,7 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+1 -1
View File
@@ -8523,7 +8523,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_rope(type, { 80, 32, 512, 1}, 20, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (stablelm)
test_cases.emplace_back(new test_rope(type, { 64, 8, 512, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (falcon 40B)
test_cases.emplace_back(new test_rope(type, {128, 12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B)
test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B)
test_cases.emplace_back(new test_rope(type, {128, 12, 512, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B)
test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
}
}
+2 -2
View File
@@ -34,7 +34,7 @@ $ build/bin/llama-quantize models/outetts-0.2-0.5B-f16.gguf \
```
The quantized model will be `models/outetts-0.2-0.5B-q8_0.gguf`.
Next we do something simlar for the audio decoder. First download or checkout
Next we do something similar for the audio decoder. First download or checkout
the model for the voice decoder:
```console
$ pushd models
@@ -42,7 +42,7 @@ $ git clone --branch main --single-branch --depth 1 https://huggingface.co/novat
$ cd WavTokenizer-large-speech-75token && git lfs install && git lfs pull
$ popd
```
This model file is PyTorch checkpoint (.ckpt) and we first need to convert it to
This model file is a PyTorch checkpoint (.ckpt) and we first need to convert it to
huggingface format:
```console
(venv) python tools/tts/convert_pt_to_hf.py \