Compare commits

...

8 Commits

Author SHA1 Message Date
R0CKSTAR 3025b621d1 llama-bench: rename DB table name from test to llama_bench (#15003)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2025-08-02 17:20:40 +08:00
Jeff Bolz ec0b18802c vulkan: Support ne[3]>1 in noncontig matrix-vector multiply (#15015) 2025-08-02 10:48:30 +02:00
Douglas Hanley 339bd0268c model : support Qwen3-Embedding (#15023) 2025-08-02 10:44:50 +02:00
Johannes Gäßler f906275537 server: enable token array inputs for OAI API (#15001) 2025-08-02 10:12:41 +02:00
Jeff Bolz a9f7541ec2 vulkan: optimizations for direct convolution (#14933)
* vulkan: optimizations for direct convolution

- Empirically choose a better tile size. Reducing BS_K/BS_NPQ helps fill
  the GPU. The new size should be amenable to using coopmat, too.
- Fix shmem bank conflicts. 16B padding should work with coopmat.
- Some explicit loop unrolling.
- Skip math/stores work for parts of the tile that are OOB.
- Apply fastdiv opt.
- Disable shuffles for NV.

* Three tiles sizes for CONV_2D, and a heuristic to choose

* reallow collectives for pre-Turing

* make SHMEM_PAD a spec constant

* fixes for intel perf - no shmem padding, placeholder shader core count

* shader variants with/without unrolling

* 0cc4m's fixes for AMD perf

Co-authored-by: 0cc4m <picard12@live.de>

---------

Co-authored-by: 0cc4m <picard12@live.de>
2025-08-02 09:57:04 +02:00
Johannes Gäßler 9c35706b98 CUDA: fix MMQ nwarps for AMD with warp_size==32 (#15014) 2025-08-01 20:47:32 +02:00
l-austenfeld c76b420e4c vendor : update vendored copy of google/minja (#15011)
* vendor : update vendored copy of google/minja

Signed-off-by: Lennart Austenfeld <l.austenfeld@googlemail.com>

* Re-remove trailing whitespace

Signed-off-by: Lennart Austenfeld <l.austenfeld@googlemail.com>

* Remove another trailing whitespace

Signed-off-by: Lennart Austenfeld <l.austenfeld@googlemail.com>

---------

Signed-off-by: Lennart Austenfeld <l.austenfeld@googlemail.com>
2025-08-01 16:59:06 +02:00
stevenkuang 0f5ccd6fd1 model : add hunyuan dense (#14878)
* support hunyuan_v1_dense

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

* update hunyuan_moe to hunyuan_v1_moe

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

* fix rope alpha assert and bos token

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

* add blank line

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

* Revert "update hunyuan_moe to hunyuan_v1_moe"

This reverts commit aa973ca219.

* use hunyuan_dense instead of hunyuan_v1_dense

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

* fix hunyuan_moe chat template

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

* remove leftover code

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

* update hunyuan dense chat template

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

* fix hunyuan dense vocab and chat template

Signed-off-by: stevenkuang <stevenkuang@tencent.com>

---------

Signed-off-by: stevenkuang <stevenkuang@tencent.com>
2025-08-01 15:31:12 +02:00
22 changed files with 673 additions and 164 deletions
+98 -8
View File
@@ -684,6 +684,9 @@ class TextModel(ModelBase):
if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
# ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
res = "hunyuan"
if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
# ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
res = "hunyuan-dense"
if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
# ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
res = "falcon-h1"
@@ -846,6 +849,9 @@ class TextModel(ModelBase):
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
res = "exaone4"
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-8B
res = "qwen2"
if res is None:
logger.warning("\n")
@@ -7553,11 +7559,6 @@ class FalconH1Model(Mamba2Model):
class HunYuanMoEModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# For handling tied embeddings
self._tok_embd = None
def set_vocab(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
@@ -7651,9 +7652,6 @@ class HunYuanMoEModel(TextModel):
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "model.embed_tokens.weight":
self._tok_embd = data_torch.clone()
if name == "lm_head.weight":
if self.hparams.get("tie_word_embeddings", False):
logger.info("Skipping tied output layer 'lm_head.weight'")
@@ -7698,6 +7696,98 @@ class HunYuanMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
# 1. Get the pre-tokenizer identifier hash
tokpre = self.get_vocab_base_pre(tokenizer)
# 2. Reverse-engineer the merges list from mergeable_ranks
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
if len(merged) == 2:
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# 3. Generate the tokens and toktypes lists
vocab_size = self.hparams["vocab_size"]
assert tokenizer.vocab_size == vocab_size
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.UNUSED)
else:
token = reverse_vocab[i]
tokens.append(token)
if i in special_tokens.values():
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.NORMAL)
# 4. Write all vocab-related fields to the GGUF writer
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_token_merges(merges)
# 5. Add special tokens and chat templates
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
# FIX for BOS token: Overwrite incorrect id read from config.json
if self.hparams['hidden_size'] == 4096:
self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
# Rope
rope_scaling = hparams.get("rope_scaling", {})
if rope_scaling.get("type") == "dynamic":
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
alpha = rope_scaling.get("alpha", 50)
base = hparams.get("rope_theta", 10000.0)
dim = hparams["head_dim"]
scaled_base = base * (alpha ** (dim / (dim - 2)))
self.gguf_writer.add_rope_freq_base(scaled_base)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(1)
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "lm_head.weight":
if self.hparams.get("tie_word_embeddings", False):
logger.info("Skipping tied output layer 'lm_head.weight'")
return []
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.SMOLLM3
+1
View File
@@ -140,6 +140,7 @@ pre_computed_hashes = [
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
{"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
+7 -11
View File
@@ -251,25 +251,21 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
#endif // AMD_MFMA_AVAILABLE
#if defined(GGML_USE_HIP)
static int mmq_get_nwarps_host(const int cc) {
return amd_mfma_available(cc) ? 8 : 4;
static int mmq_get_nwarps_host(const int cc, const int warp_size) {
return amd_mfma_available(cc) ? 8 : 256/warp_size;
}
#else
static int mmq_get_nwarps_host(const int /*cc*/) {
return 8;
static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) {
return 256/warp_size;
}
#endif // (GGML_USE_HIP)
static constexpr __device__ int mmq_get_nwarps_device() {
#if defined(GGML_USE_HIP)
#if defined(AMD_MFMA_AVAILABLE)
return 8;
#else
return 4;
return 256/ggml_cuda_get_physical_warp_size();
#endif // AMD_MFMA_AVAILABLE
#else
return 8;
#endif // defined(GGML_USE_HIP)
}
// ------------------------------------------------------------
@@ -3472,7 +3468,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int cc = ggml_cuda_info().devices[id].cc;
const int nsm = ggml_cuda_info().devices[id].nsm;
const int warp_size = ggml_cuda_info().devices[id].warp_size;
const int nwarps = mmq_get_nwarps_host(cc);
const int nwarps = mmq_get_nwarps_host(cc, warp_size);
const int mmq_y = get_mmq_y_host(cc);
const dim3 block_dims(warp_size, nwarps, 1);
@@ -3559,7 +3555,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
const int cc = ggml_cuda_info().devices[id].cc;
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
const int warp_size = ggml_cuda_info().devices[id].warp_size;
const int nwarps = mmq_get_nwarps_host(cc);
const int nwarps = mmq_get_nwarps_host(cc, warp_size);
const int mmq_x_max = get_mmq_x_max_host(cc);
const int mmq_y = get_mmq_y_host(cc);
+184 -73
View File
@@ -222,6 +222,7 @@ enum vk_device_architecture {
AMD_RDNA2,
AMD_RDNA3,
INTEL_XE2,
NVIDIA_PRE_TURING,
};
// HSK x HSV
@@ -315,10 +316,33 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
return vk_device_architecture::INTEL_XE2;
}
} else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
bool cooperative_matrix = false;
// Detect "pre-turing" based on lack of coopmat support.
for (const auto& properties : ext_props) {
if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
cooperative_matrix = true;
break;
}
}
if (!cooperative_matrix) {
return vk_device_architecture::NVIDIA_PRE_TURING;
}
}
return vk_device_architecture::OTHER;
}
enum vk_conv_shapes {
CONV_SHAPE_128x128,
CONV_SHAPE_64x32,
CONV_SHAPE_32x256,
CONV_SHAPE_COUNT,
};
struct vk_device_struct {
std::recursive_mutex mutex;
@@ -483,8 +507,8 @@ struct vk_device_struct {
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
vk_pipeline pipeline_opt_step_adamw_f32;
vk_pipeline pipeline_conv2d_f32;
vk_pipeline pipeline_conv2d_f16_f32;
vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
vk_pipeline pipeline_conv2d_dw_whcn_f32;
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
@@ -908,8 +932,22 @@ struct vk_op_conv2d_push_constants {
uint32_t nb1;
uint32_t nb2;
uint32_t nb3;
// init_fastdiv_values constants for dividing by KW, KW*KH, OW, OW*OH
uint32_t KWmp; uint32_t KWL;
uint32_t KWKHmp; uint32_t KWKHL;
uint32_t OWmp; uint32_t OWL;
uint32_t OWOHmp; uint32_t OWOHL;
};
template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
// Compute magic values to divide by KW, KW*KH, OW, OW*OH
init_fastdiv_values(p.KW, p.KWmp, p.KWL);
init_fastdiv_values(p.KW*p.KH, p.KWKHmp, p.KWKHL);
init_fastdiv_values(p.OW, p.OWmp, p.OWL);
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
}
struct vk_op_conv2d_dw_push_constants {
uint32_t ne;
uint32_t batches;
@@ -2847,7 +2885,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
}
}
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 9 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 12 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
@@ -3048,48 +3086,89 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
// conv2d
uint32_t conv2d_WG_SIZE = 256;
uint32_t conv2d_BS_K = 128;
uint32_t conv2d_BS_CRS = 16;
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
if (device->subgroup_shuffle &&
device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316
use_collectives = 1;
conv2d_BS_CRS = std::min(
device->subgroup_size,
conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
}
uint32_t conv2d_BS_NPQ = 128;
uint32_t conv2d_TS_K = 8;
uint32_t conv2d_shmem_req =
(conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
conv2d_BS_CRS = 8;
if (use_collectives) {
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
}
}
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
uint32_t conv2d_WG_SIZE = 256;
uint32_t conv2d_BS_K = 128;
uint32_t conv2d_BS_CRS = 16;
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
uint32_t conv2d_BS_NPQ = 128;
uint32_t conv2d_TS_K = 8;
uint32_t conv2d_SHMEM_PAD = 4;
bool conv2d_UNROLL = true;
if (use_collectives) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
} else {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
false);
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
false);
if (device->vendor_id == VK_VENDOR_ID_INTEL) {
conv2d_SHMEM_PAD = 0;
conv2d_UNROLL = false;
} else if (device->vendor_id == VK_VENDOR_ID_AMD) {
conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
}
switch (s) {
default:
case CONV_SHAPE_128x128:
conv2d_BS_K = 128;
conv2d_BS_NPQ = 128;
conv2d_BS_CRS = 16;
if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
conv2d_UNROLL = false;
}
break;
case CONV_SHAPE_64x32:
conv2d_BS_K = 64;
conv2d_BS_NPQ = 32;
conv2d_BS_CRS = 32;
conv2d_TS_K = 4;
break;
case CONV_SHAPE_32x256:
conv2d_BS_K = 32;
conv2d_BS_NPQ = 256;
conv2d_BS_CRS = 16;
break;
}
// Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA ||
device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD ||
device->architecture == vk_device_architecture::AMD_GCN;
if (device->subgroup_shuffle &&
device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316.
allow_collectives_nv &&
allow_collectives_amd) {
use_collectives = 1;
conv2d_BS_CRS = std::min(
device->subgroup_size,
conv2d_BS_CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
}
uint32_t conv2d_shmem_req =
(conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
conv2d_BS_CRS = 8;
if (use_collectives) {
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
}
}
std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
if (conv2d_UNROLL) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_unroll_len, conv2d_f16_f32_unroll_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
} else {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
}
}
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@@ -5742,7 +5821,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
const uint64_t ne00 = src0->ne[0];
const uint64_t ne01 = src0->ne[1];
const uint64_t ne02 = src0->ne[2];
// const uint64_t ne03 = src0->ne[3];
const uint64_t ne03 = src0->ne[3];
const uint64_t nb01 = src0->nb[1];
const uint64_t nb02 = src0->nb[2];
@@ -5754,7 +5833,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
const uint64_t ne12 = src1->ne[2];
// const uint64_t ne13 = src1->ne[3];
const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t));
const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float));
const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float));
GGML_ASSERT(ne11 == 1);
GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
@@ -5770,7 +5854,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
src1_uma = d_Qy != nullptr;
}
const uint64_t d_ne = ne01 * ne11 * ne12;
const uint64_t d_ne = ne01 * ne11 * ne12 * ne03;
const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
@@ -5805,10 +5889,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
// compute
const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
const std::array<uint32_t, 12> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 };
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
}
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -6641,6 +6725,34 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
}
static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
// src0 - kernel: [KW, KH, Cin, Cout]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[3];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
int64_t NPQ = N * OW * OH;
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
return elements;
}
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
switch (op) {
case GGML_OP_GET_ROWS:
@@ -6970,10 +7082,30 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
case GGML_OP_CONV_2D:
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
auto elements = ggml_vk_get_conv_elements(dst);
vk_conv_shapes shape;
uint32_t tiles[CONV_SHAPE_COUNT];
for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
tiles[i] = CEIL_DIV(elements[0], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[0]) * CEIL_DIV(elements[1], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[1]);
}
// We can't query number of shader cores on Intel, use 32 as a placeholder
// so small convolutions will still choose a smaller tile.
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
shape = CONV_SHAPE_128x128;
} else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
shape = CONV_SHAPE_32x256;
} else {
shape = CONV_SHAPE_64x32;
}
if (src0->type == GGML_TYPE_F32) {
return ctx->device->pipeline_conv2d_f32;
return ctx->device->pipeline_conv2d_f32[shape];
} else if (src0->type == GGML_TYPE_F16) {
return ctx->device->pipeline_conv2d_f16_f32;
return ctx->device->pipeline_conv2d_f16_f32[shape];
}
}
return nullptr;
@@ -7301,29 +7433,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
} break;
case GGML_OP_CONV_2D:
{
// src0 - kernel: [KW, KH, Cin, Cout]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[3];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
int64_t NPQ = N * OW * OH;
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
}
break;
elements = ggml_vk_get_conv_elements(dst);
} break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV:
@@ -1,14 +1,13 @@
#version 450
#extension GL_EXT_control_flow_attributes : enable
#ifdef USE_COLLECTIVES
# extension GL_KHR_shader_subgroup_shuffle : enable
#endif
#include "types.comp"
// Make spec constant
#define SHMEM_PAD 0
// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
layout(binding = 0) readonly buffer A {
A_TYPE knl_data[];
@@ -56,6 +55,12 @@ layout(push_constant) uniform parameter {
uint32_t nb1;
uint32_t nb2;
uint32_t nb3;
// fastdiv helper values
uint32_t KWmp; uint32_t KWL;
uint32_t KWKHmp; uint32_t KWKHL;
uint32_t OWmp; uint32_t OWL;
uint32_t OWOHmp; uint32_t OWOHL;
}
p;
@@ -68,6 +73,7 @@ layout(constant_id = 3) const uint BS_NPQ = 128;
// Thread-tile sizes
layout(constant_id = 4) const uint TS_K = 8;
layout(constant_id = 5) const uint use_collectives = 1;
layout(constant_id = 6) const uint SHMEM_PAD = 4;
uint32_t tid = gl_LocalInvocationID.x;
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
@@ -131,6 +137,14 @@ uint32_t Br = tid / BS_NPQ;
uint32_t Bc = tid % BS_NPQ;
const uint32_t BrpWg = WG_SIZE / BS_NPQ;
// see init_fastdiv_values in ggml-vulkan.cpp
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
// msbs = mulhi(n, mp)
umulExtended(n, mp, msbs, lsbs);
return (msbs + n) >> L;
}
void main() {
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
@@ -151,9 +165,9 @@ void main() {
uint32_t cached_KW_idx;
if (use_collectives == 1) {
cached_CRS_idx = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID;
cached_Cin_idx = cached_CRS_idx / (p.KW * p.KH);
cached_Cin_idx = fastdiv(cached_CRS_idx, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH);
cached_KH_idx = cached_CRS_remainder / p.KW;
cached_KH_idx = fastdiv(cached_CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
cached_KW_idx = cached_CRS_remainder - cached_KH_idx * p.KW;
CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
@@ -162,16 +176,16 @@ void main() {
KW_idx_a = subgroupShuffle(cached_KW_idx, Ac);
} else {
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
Cin_idx_a = CRS_idx_a / (p.KW * p.KH);
Cin_idx_a = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
KH_idx_a = CRS_remainder / p.KW;
KH_idx_a = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
}
#else
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
Cin_idx_a = CRS_idx_a / (p.KW * p.KH);
Cin_idx_a = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH); / (p.KW * p.KH);
CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
KH_idx_a = CRS_remainder / p.KW;
KH_idx_a = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
#endif
@@ -188,13 +202,13 @@ void main() {
Ash[B_ly * Ash_stride + B_lx] = val;
}
/* Load input to B_block: (BS_CRS x BS_NPQ) */
for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
uint32_t B_ly = r_offset + Br; /* Row index of B block */
uint32_t B_lx = Bc;
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
uint32_t N_idx = NPQ_idx / (p.OH * p.OW);
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW;
uint32_t OH_idx = NPQ_remainder / p.OW;
uint32_t OH_idx = fastdiv(NPQ_remainder, p.OWmp, p.OWL); // divide by p.OW;
uint32_t OW_idx = NPQ_remainder - OH_idx * p.OW;
uint32_t CRS_idx_b;
@@ -209,16 +223,16 @@ void main() {
KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br);
} else {
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
Cin_idx_b = CRS_idx_b / (p.KW * p.KH);
Cin_idx_b = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
KH_idx_b = CRS_remainder / p.KW;
KH_idx_b = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
}
#else
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
Cin_idx_b = CRS_idx_b / (p.KW * p.KH);
Cin_idx_b = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
KH_idx_b = CRS_remainder / p.KW;
KH_idx_b = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
#endif
@@ -233,32 +247,36 @@ void main() {
Bsh[B_ly * Bsh_stride + B_lx] = val;
}
barrier();
for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
}
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
}
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
if (T_y * TS_K < K) {
UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
}
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
}
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
}
}
}
}
barrier();
}
/* Save C* */
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
uint32_t N_idx = NPQ_idx / (p.OH * p.OW);
uint32_t OH_idx = (NPQ_idx - N_idx * p.OH * p.OW) / p.OW;
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
if (K_idx < K && NPQ_idx < NPQ) {
dst_data[dst_idx] = regC[T_ly][T_lx];
if (T_y * TS_K < K) {
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
if (K_idx < K && NPQ_idx < NPQ) {
dst_data[dst_idx] = regC[T_ly][T_lx];
}
}
}
}
@@ -26,6 +26,9 @@ layout (push_constant) uniform parameter
uint ne12;
uint b_offset;
uint d_offset;
uint nb03;
uint nb13;
uint nb23;
} p;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
@@ -34,6 +37,7 @@ void main() {
const uint tid = gl_LocalInvocationID.x;
const uint row_x = gl_GlobalInvocationID.y;
const uint channel = gl_GlobalInvocationID.z;
const uint i3 = gl_WorkGroupID.x;
const uint channel_x = channel / p.channel_x_divisor;
const uint channel_y = channel % p.ne12;
@@ -41,7 +45,7 @@ void main() {
const uint nrows_dst = p.nrows_x;
const uint row_dst = row_x;
const uint idst = channel*nrows_dst + row_dst;
const uint idst = i3*p.nb23 + channel*nrows_dst + row_dst;
FLOAT_TYPE temp = 0.0f;
@@ -58,8 +62,8 @@ void main() {
const uint row_y = col_x;
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
const uint iy = channel_y*p.channel_stride_y + row_y;
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
const vec4 av4 = vec4(data_a_v4[ix / 4]);
const vec4 bv4 = vec4(data_b_v4[iy / 4]);
@@ -74,8 +78,8 @@ void main() {
const uint row_y = col_x;
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
const uint iy = channel_y*p.channel_stride_y + row_y;
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
const vec4 av4 = vec4(data_a_v4[ix / 4]);
const vec4 bv4 = vec4(data_b_v4[iy / 4]);
@@ -91,8 +95,8 @@ void main() {
const uint row_y = col_x;
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
const uint iy = channel_y*p.channel_stride_y + row_y;
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
@@ -655,8 +655,11 @@ void process_shaders() {
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
string_to_spv("conv2d_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
string_to_spv("conv2d_f16_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
+18
View File
@@ -376,6 +376,7 @@ class MODEL_ARCH(IntEnum):
ERNIE4_5 = auto()
ERNIE4_5_MOE = auto()
HUNYUAN_MOE = auto()
HUNYUAN_DENSE = auto()
SMOLLM3 = auto()
LFM2 = auto()
DREAM = auto()
@@ -697,6 +698,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
MODEL_ARCH.FALCON_H1: "falcon-h1",
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
MODEL_ARCH.SMOLLM3: "smollm3",
MODEL_ARCH.LFM2: "lfm2",
MODEL_ARCH.DREAM: "dream",
@@ -2471,6 +2473,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
],
MODEL_ARCH.HUNYUAN_DENSE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.SMOLLM3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
+14 -2
View File
@@ -33,6 +33,7 @@ class TensorNameMap:
"language_model.model.embed_tokens", # llama4
"encoder", # neobert
"model.transformer.wte", # llada
"embed_tokens", # qwen3-embedding
),
# Token type embeddings
@@ -143,6 +144,7 @@ class TensorNameMap:
"transformer_encoder.{bid}.attention_norm", # neobert
"model.layers.{bid}.operator_norm", # lfm2
"model.transformer.blocks.{bid}.attn_norm", # llada
"layers.{bid}.input_layernorm", # qwen3-embedding
),
# Attention norm 2
@@ -188,6 +190,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.attention.q_proj", # exaone
"model.layers.{bid}.self_attn.q_proj", # llama4
"model.transformer.blocks.{bid}.q_proj", # llada
"layers.{bid}.self_attn.q_proj", # qwen3-embedding
),
# Attention key
@@ -205,6 +208,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.attention.k_proj", # exaone
"model.layers.{bid}.self_attn.k_proj", # llama4
"model.transformer.blocks.{bid}.k_proj", # llada
"layers.{bid}.self_attn.k_proj", # qwen3-embedding
),
# Attention value
@@ -221,6 +225,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.attention.v_proj", # exaone
"model.layers.{bid}.self_attn.v_proj", # llama4
"model.transformer.blocks.{bid}.v_proj", # llada
"layers.{bid}.self_attn.v_proj", # qwen3-embedding
),
# Attention output
@@ -254,6 +259,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.o_proj", # llama4
"transformer_encoder.{bid}.wo", # neobert
"model.transformer.blocks.{bid}.attn_out", # llada
"layers.{bid}.self_attn.o_proj", # qwen3-embedding
),
# Attention output norm
@@ -300,6 +306,7 @@ class TensorNameMap:
"transformer_encoder.{bid}.ffn_norm", # neobert
"model.layers.layers.{bid}.pre_mlp_norm", # plamo2
"model.transformer.blocks.{bid}.ff_norm", # llada
"layers.{bid}.post_attention_layernorm", # qwen3-embedding
),
# Post feed-forward norm
@@ -373,7 +380,8 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
"transformer_encoder.{bid}.ffn.w12", # neobert
"model.layers.{bid}.block_sparse_moe.up", # smallthinker
"model.transformer.blocks.{bid}.up_proj", # llada
"model.transformer.blocks.{bid}.up_proj", # llada
"layers.{bid}.mlp.up_proj", # qwen3-embedding
),
MODEL_TENSOR.FFN_UP_EXP: (
@@ -416,6 +424,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
"model.layers.{bid}.block_sparse_moe.gate", # smallthinker
"model.transformer.blocks.{bid}.ff_proj", # llada
"layers.{bid}.mlp.gate_proj", # qwen3-embedding
),
MODEL_TENSOR.FFN_GATE_EXP: (
@@ -465,7 +474,8 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
"transformer_encoder.{bid}.ffn.w3", # neobert
"model.layers.{bid}.block_sparse_moe.down", # smallthinker
"model.transformer.blocks.{bid}.ff_out", # llada
"model.transformer.blocks.{bid}.ff_out", # llada
"layers.{bid}.mlp.down_proj", # qwen3-embedding
),
MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -497,6 +507,7 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm
"model.layers.layers.{bid}.mixer.q", # plamo2
"layers.{bid}.self_attn.q_norm", # qwen3-embedding
),
MODEL_TENSOR.ATTN_K_NORM: (
@@ -508,6 +519,7 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm
"model.layers.layers.{bid}.mixer.k", # plamo2
"layers.{bid}.self_attn.k_norm", # qwen3-embedding
),
MODEL_TENSOR.ROPE_FREQS: (
+5 -5
View File
@@ -326,7 +326,7 @@ class LlamaBenchDataSQLite3(LlamaBenchData):
# Set table name and schema based on tool
if self.tool == "llama-bench":
self.table_name = "test"
self.table_name = "llama_bench"
db_fields = LLAMA_BENCH_DB_FIELDS
db_types = LLAMA_BENCH_DB_TYPES
elif self.tool == "test-backend-ops":
@@ -409,8 +409,8 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
# Tool selection logic
if tool is None:
if "test" in table_names:
self.table_name = "test"
if "llama_bench" in table_names:
self.table_name = "llama_bench"
self.tool = "llama-bench"
elif "test_backend_ops" in table_names:
self.table_name = "test_backend_ops"
@@ -418,8 +418,8 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
else:
raise RuntimeError(f"No suitable table found in database. Available tables: {table_names}")
elif tool == "llama-bench":
if "test" in table_names:
self.table_name = "test"
if "llama_bench" in table_names:
self.table_name = "llama_bench"
self.tool = "llama-bench"
else:
raise RuntimeError(f"Table 'test' not found for tool 'llama-bench'. Available tables: {table_names}")
+21
View File
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
{ LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
{ LLM_ARCH_SMOLLM3, "smollm3" },
{ LLM_ARCH_LFM2, "lfm2" },
{ LLM_ARCH_DREAM, "dream" },
@@ -1897,6 +1898,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_HUNYUAN_DENSE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_SMOLLM3,
{
+1
View File
@@ -89,6 +89,7 @@ enum llm_arch {
LLM_ARCH_ERNIE4_5,
LLM_ARCH_ERNIE4_5_MOE,
LLM_ARCH_HUNYUAN_MOE,
LLM_ARCH_HUNYUAN_DENSE,
LLM_ARCH_SMOLLM3,
LLM_ARCH_LFM2,
LLM_ARCH_DREAM,
+20 -1
View File
@@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
};
@@ -193,6 +194,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_DOTS1;
} else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
} else if (tmpl_contains("<hy_place▁holder▁no▁2>") && tmpl_contains("<hy_place▁holder▁no▁3>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
return LLM_CHAT_TEMPLATE_KIMI_K2;
}
@@ -698,11 +701,27 @@ int32_t llm_chat_apply_template(
if (role == "system") {
ss << "<|startoftext|>" << message->content << "<|extra_4|>";
} else if (role == "assistant") {
ss << "<|startoftext|>" << message->content << "<|eos|>";
ss << message->content << "<|eos|>";
} else {
ss << "<|startoftext|>" << message->content << "<|extra_0|>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
// tencent/Hunyuan-4B-Instruct
for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);
if (i == 0) {
if (role == "system") {
ss << chat[i]->content << "<hy_place▁holder▁no▁3>";
}
}
if (role == "assistant") {
ss << "<hy_Assistant>" << chat[i]->content << "<hy_place▁holder▁no▁2>";
} else if (role == "user") {
ss << "<hy_User>" << chat[i]->content << "<hy_Assistant>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
// moonshotai/Kimi-K2-Instruct
for (auto message : chat) {
+1
View File
@@ -46,6 +46,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SMOLVLM,
LLM_CHAT_TEMPLATE_DOTS1,
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
+189
View File
@@ -899,6 +899,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_QWEN3:
{
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1760,6 +1761,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_HUNYUAN_DENSE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_embd) {
case 1024: type = LLM_TYPE_0_5B; break;
case 2048: type = LLM_TYPE_1_8B; break;
case 3072: type = LLM_TYPE_4B; break;
case 4096: type = LLM_TYPE_7B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_SMOLLM3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5195,6 +5208,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
}
} break;
case LLM_ARCH_HUNYUAN_DENSE:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
case LLM_ARCH_SMOLLM3:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -16930,6 +16976,144 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
}
};
struct llm_build_hunyuan_dense : public llm_graph_context {
llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = build_norm(Kcur,
model.layers[il].attn_k_norm, nullptr,
LLM_NORM_RMS, il);
cb(Kcur, "Kcur_norm", il);
Qcur = build_norm(Qcur,
model.layers[il].attn_q_norm, nullptr,
LLM_NORM_RMS, il);
cb(Qcur, "Qcur_norm", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
// feed-forward network (non-MoE)
ggml_tensor * cur_mlp = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur_mlp, "ffn_out", il);
cur = ggml_add(ctx0, cur_mlp, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
struct llm_build_smollm3 : public llm_graph_context {
llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17797,6 +17981,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
} break;
case LLM_ARCH_HUNYUAN_DENSE:
{
llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
} break;
case LLM_ARCH_SMOLLM3:
{
llm = std::make_unique<llm_build_smollm3>(*this, params);
@@ -18016,6 +18204,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_MINICPM3:
case LLM_ARCH_DOTS1:
case LLM_ARCH_HUNYUAN_MOE:
case LLM_ARCH_HUNYUAN_DENSE:
case LLM_ARCH_LFM2:
case LLM_ARCH_SMALLTHINKER:
return LLAMA_ROPE_TYPE_NEOX;
+5
View File
@@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
regex_exprs = {
"\\p{N}{1,3}",
"[一-龥぀-ゟ゠-ヿ]+",
@@ -1964,6 +1965,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "hunyuan") {
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
clean_spaces = false;
} else if (
tokenizer_pre == "hunyuan-dense") {
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
clean_spaces = false;
} else if (
tokenizer_pre == "kimi-k2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+1
View File
@@ -46,6 +46,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
};
struct LLM_KV;
+9 -7
View File
@@ -5592,13 +5592,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
for (auto bs : {1,2,4,8}) {
for (auto nr : {1,4}) {
for (uint32_t m = 0; m < 2; ++m) {
for (uint32_t k = 0; k < 2; ++k) {
for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true));
for (auto bs2 : {1,3}) {
for (auto bs : {1,2,4,8}) {
for (auto nr : {1,4}) {
for (uint32_t m = 0; m < 2; ++m) {
for (uint32_t k = 0; k < 2; ++k) {
for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, bs2}, {nr, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, true));
}
}
}
}
+2 -2
View File
@@ -1738,7 +1738,7 @@ struct sql_printer : public printer {
void print_header(const cmd_params & params) override {
std::vector<std::string> fields = test::get_fields();
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
fprintf(fout, "CREATE TABLE IF NOT EXISTS llama_bench (\n");
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
i < fields.size() - 1 ? "," : "");
@@ -1749,7 +1749,7 @@ struct sql_printer : public printer {
}
void print_test(const test & t) override {
fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
fprintf(fout, "INSERT INTO llama_bench (%s) ", join(test::get_fields(), ", ").c_str());
fprintf(fout, "VALUES (");
std::vector<std::string> values = t.get_values();
for (size_t i = 0; i < values.size(); i++) {
-3
View File
@@ -4249,9 +4249,6 @@ int main(int argc, char ** argv) {
// process prompt
std::vector<server_tokens> inputs;
if (oaicompat && !prompt.is_string()) {
throw std::runtime_error("prompt must be a string");
}
if (oaicompat && has_mtmd) {
// multimodal
+7 -5
View File
@@ -162,10 +162,15 @@ class chat_template {
}), false);
caps_.supports_tools = contains(out, "some_tool");
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
json j_null;
auto make_tool_calls_msg = [&](const json & tool_calls) {
return json {
{"role", "assistant"},
{"content", nullptr},
{"content", caps_.requires_non_null_content? "" : j_null},
{"tool_calls", tool_calls},
};
};
@@ -195,9 +200,6 @@ class chat_template {
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
if (caps_.supports_tool_calls) {
auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
@@ -234,7 +236,7 @@ class chat_template {
};
const json tool_call_msg {
{"role", "assistant"},
{"content", nullptr},
{"content", caps_.requires_non_null_content ? "" : j_null},
{"tool_calls", json::array({
{
// TODO: detect if requires numerical id or fixed length == 6 like Nemo
+21 -3
View File
@@ -1355,8 +1355,13 @@ public:
case Op::Gt: return l > r;
case Op::Le: return l <= r;
case Op::Ge: return l >= r;
case Op::In: return (r.is_array() || r.is_object()) && r.contains(l);
case Op::NotIn: return !(r.is_array() && r.contains(l));
case Op::In: return (((r.is_array() || r.is_object()) && r.contains(l)) ||
(l.is_string() && r.is_string() &&
r.to_str().find(l.to_str()) != std::string::npos));
case Op::NotIn:
return !(((r.is_array() || r.is_object()) && r.contains(l)) ||
(l.is_string() && r.is_string() &&
r.to_str().find(l.to_str()) != std::string::npos));
default: break;
}
throw std::runtime_error("Unknown binary operator");
@@ -1552,6 +1557,19 @@ public:
else res[i] = std::tolower(res[i]);
}
return res;
} else if (method->get_name() == "replace") {
vargs.expectArgs("replace method", {2, 3}, {0, 0});
auto before = vargs.args[0].get<std::string>();
auto after = vargs.args[1].get<std::string>();
auto count = vargs.args.size() == 3 ? vargs.args[2].get<int64_t>()
: str.length();
size_t start_pos = 0;
while ((start_pos = str.find(before, start_pos)) != std::string::npos &&
count-- > 0) {
str.replace(start_pos, before.length(), after);
start_pos += after.length();
}
return str;
}
}
throw std::runtime_error("Unknown method: " + method->get_name());
@@ -2128,7 +2146,7 @@ private:
}
}
if ((has_first_colon || has_second_colon) && (start || end || step)) {
if ((has_first_colon || has_second_colon)) {
index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
} else {
index = std::move(start);