ui : fix llama-ui-embed crash when no asset dir is given (#24597 )

Add arch support for cohere2-MoE (#24260 )
* Add arch support for cohere2-MoE * Removed redundant gating_func checks * Changed ffn lookup to prefer prefix_dense_intermediate_size * Renamed arch to cohere2moe * Removed redundant lmhead check and chat template changes * Removed lm_head.weight check from modify tensors, load output tensor not required, fallback to token_embd.weight * Changed to (routed+shared)*0.5 for shared expert combined avg * fixed sliding_window_pattern issue and pattern * Fixed transformers crash 'first_k_dense_replace' error * Remove comment * Removed cohere2-moe as a tokenizer type and kept as tiny_aya. Renamed North-Mini-Code-1.0. * Fixed MTP fail, changed to use iSWA * Fixed remaining todos: cohere2moe renamed, changed swa parsing to use get_key_or_arr, removed extra get_arr use * Force metadata usage Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Remove Cohere2 checkpoint comment Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Remove MTP comment Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Regenerate cohere2moe tokenizer hash * Add cohere2moe to Llama Model Saver supported list * Check for zerobios tensors and add support for Command to use LayerNorm * Map expert_selection_fn to sigmoid in base.py instead of command.py * use bools for foundnorm/foundnormrms Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-06-14 01:36:47 +02:00 · 2026-06-13 17:53:30 -05:00 · 2026-06-13 19:49:00 +02:00 · 2026-06-13 18:28:40 +02:00 · 2026-06-13 16:57:27 +02:00 · 2026-06-13 16:56:59 +02:00
55 changed files with 8389 additions and 5058 deletions
@@ -761,9 +761,9 @@ value member_expression::execute_impl(context & ctx) {

        if (is_stmt<slice_expression>(this->property)) {
            auto s = cast_stmt<slice_expression>(this->property);
-            value start_val = s->start_expr ? s->start_expr->execute(ctx) : mk_val<value_int>(0);
-            value stop_val  = s->stop_expr  ? s->stop_expr->execute(ctx)  : mk_val<value_int>(arr_size);
            value step_val  = s->step_expr  ? s->step_expr->execute(ctx)  : mk_val<value_int>(1);
+            value start_val = s->start_expr ? s->start_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val<value_int>(arr_size - 1) : mk_val<value_int>(0));
+            value stop_val  = s->stop_expr  ? s->stop_expr->execute(ctx)  : (step_val->as_int() < 0 ? mk_val<value_int>(-1) : mk_val<value_int>(arr_size));

            // translate to function call: obj.slice(start, stop, step)
            JJ_DEBUG("Member expression is a slice: start %s, stop %s, step %s",
@@ -90,14 +90,14 @@ static T slice(const T & array, int64_t start, int64_t stop, int64_t step = 1) {
            stop_val = std::min(stop_val, len);
        }
    } else {
-        start_val = len - 1;
+        start_val = start;
        if (start_val < 0) {
-            start_val = std::max(len + start_val, (int64_t)-1);
+            start_val = std::max(len + start_val, (int64_t)0);
        } else {
            start_val = std::min(start_val, len - 1);
        }

-        stop_val = -1;
+        stop_val = stop;
        if (stop_val < -1) {
            stop_val = std::max(len + stop_val, (int64_t)-1);
        } else {
@@ -673,6 +673,9 @@ const func_builtins & value_string_t::get_builtins() const {
            std::string str = val_input->as_string().str();
            // FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
            std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
+            if (delim.empty()) {
+                throw raised_exception("empty separator");
+            }
            int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
            auto result = mk_val<value_array>();
            size_t pos = 0;
@@ -697,6 +700,9 @@ const func_builtins & value_string_t::get_builtins() const {
            std::string str = val_input->as_string().str();
            // FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
            std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
+            if (delim.empty()) {
+                throw raised_exception("empty separator");
+            }
            int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
            auto result = mk_val<value_array>();
            size_t pos = 0;
@@ -722,10 +728,23 @@ const func_builtins & value_string_t::get_builtins() const {
            if (count > 0) {
                throw not_implemented_exception("String replace with count argument not implemented");
            }
-            size_t pos = 0;
-            while ((pos = str.find(old_str, pos)) != std::string::npos) {
-                str.replace(pos, old_str.length(), new_str);
-                pos += new_str.length();
+            if (old_str != new_str) {
+                size_t pos = 0;
+                if (old_str.empty()) {
+                    std::string new_res;
+                    new_res.reserve(str.length() + new_str.length() * (str.length() + 1));
+                    new_res += new_str;
+                    for (const char c : str) {
+                        new_res.push_back(c);
+                        new_res += new_str;
+                    }
+                    str = new_res;
+                } else {
+                    while ((pos = str.find(old_str, pos)) != std::string::npos) {
+                        str.replace(pos, old_str.length(), new_str);
+                        pos += new_str.length();
+                    }
+                }
            }
            auto res = mk_val<value_string>(str);
            res->val_str.mark_input_based_on(args.get_pos(0)->val_str);
@@ -40,6 +40,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "ChatGLMModel": "chatglm",
    "CodeShellForCausalLM": "codeshell",
    "CogVLMForCausalLM": "cogvlm",
+    "Cohere2MoeForCausalLM": "command_r",
    "Cohere2ForCausalLM": "command_r",
    "CohereForCausalLM": "command_r",
    "DbrxForCausalLM": "dbrx",
@@ -1195,7 +1195,7 @@ class TextModel(ModelBase):
            self.gguf_writer.add_embedding_length(n_embd)
            logger.info(f"gguf: embedding length = {n_embd}")

-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["prefix_dense_intermediate_size", "intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
            logger.info(f"gguf: feed forward length = {n_ff}")

@@ -1280,7 +1280,7 @@ class TextModel(ModelBase):
            self.gguf_writer.add_expert_group_used_count(n_group_used)
            logger.info(f"gguf: expert groups used count = {n_group_used}")

-        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func", "expert_selection_fn"], optional=True)) is not None:
            if score_func == "sigmoid":
                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
            elif score_func == "softmax":
@@ -1495,6 +1495,9 @@ class TextModel(ModelBase):
        if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
            # ref: https://huggingface.co/CohereLabs/tiny-aya-base
            res = "tiny_aya"
+        if chkhsh == "52df12b4c8d4176e7481aab4b6e8454d1fd0a210a04a574f6d4e067d10e23c3e":
+            # ref: https://huggingface.co/CohereLabs/North-Mini-Code-1.0
+            res = "cohere2moe"
        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
            res = "qwen2"
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import re
 from typing import Iterable, TYPE_CHECKING

 import torch
@@ -55,3 +56,122 @@ class Cohere2Model(TextModel):
            return

        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Cohere2MoeForCausalLM")
+class Cohere2MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.COHERE2MOE
+    _n_main_layers: int | None = None
+    _expert_tensor_re = re.compile(
+        r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight"
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if (n_nextn := int(self.hparams.get("num_nextn_predict_layers", 0) or 0)) > 0 and not self.no_mtp:
+            self.block_count += n_nextn
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)]
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        expert_intermediate_size = hparams["intermediate_size"]
+        mlp_layer_types = hparams.get("mlp_layer_types")
+        n_dense_lead = hparams.get("first_k_dense_replace", 0)
+        if mlp_layer_types is not None:
+            n_dense_lead = next((i for i, t in enumerate(mlp_layer_types) if t != "dense"), len(mlp_layer_types))
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+        self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
+        self.gguf_writer.add_leading_dense_block_count(n_dense_lead)
+        self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False))
+        if (num_shared_experts := hparams.get("num_shared_experts", 0)) > 0:
+            if hparams.get("shared_expert_combination_strategy", "average") != "average":
+                raise ValueError("Cohere2 MoE only supports average shared expert combination")
+            self.gguf_writer.add_expert_shared_count(num_shared_experts)
+            self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts)
+        if (n_nextn := hparams.get("num_nextn_predict_layers", 0)) > 0 and not self.no_mtp:
+            self.gguf_writer.add_nextn_predict_layers(n_nextn)
+        self.gguf_writer.add_rope_dimension_count(hparams["head_dim"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+    def index_tensors(self, remote_hf_model_id: str | None = None):
+        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
+        self._n_main_layers = hparams.get("num_hidden_layers")
+        type(self)._n_main_layers = self._n_main_layers
+        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)
+
+    @classmethod
+    def filter_tensors(cls, item):
+        if (titem := super().filter_tensors(item)) is None:
+            return None
+        name, gen = titem
+
+        if cls._n_main_layers is not None:
+            is_mtp = (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None and int(m.group(1)) >= cls._n_main_layers
+            if is_mtp and cls.no_mtp:
+                return None
+            if cls.mtp_only and not is_mtp and name not in (
+                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
+            ):
+                return None
+
+        return name, gen
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".bias"):
+            if torch.any(data_torch != 0):
+                raise ValueError(f"Bias tensor {name!r} is not zero.")
+            logger.debug(f"Skipping bias tensor {name!r}.")
+            return
+
+        if (m := self._expert_tensor_re.fullmatch(name)) is not None:
+            n_experts = self.hparams["num_experts"]
+            layer_idx = int(m.group(1))
+            assert bid is None or bid == layer_idx
+
+            self._experts[layer_idx][name] = data_torch
+
+            expected = {
+                f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
+                for xid in range(n_experts)
+                for w_name in ("down_proj", "gate_proj", "up_proj")
+            }
+            if expected.issubset(self._experts[layer_idx]):
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[layer_idx][ename])
+                        del self._experts[layer_idx][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{layer_idx}.mlp.experts.{w_name}.weight"
+
+                    yield from super().modify_tensors(data_torch, merged_name, layer_idx)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        experts = [k for d in self._experts for k in d.keys()]
+        if len(experts) > 0:
+            raise ValueError(f"Unprocessed experts: {experts}")
@@ -100,6 +100,7 @@ models = [
    {"name": "refact",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "tiny_aya",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
+    {"name": "cohere2moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/North-Mini-Code-1.0", },
    {"name": "qwen2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -14,15 +14,15 @@ Legend:

 | Operation | BLAS | CANN | CPU | CUDA | MTL | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
 |-----------|------|------|------|------|------|------|------|------|------|------|------|
-|                              ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
-|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
@@ -41,25 +41,25 @@ Legend:
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
 |                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
+|                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             GELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             GELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -68,9 +68,9 @@ Legend:
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-|                 MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                 MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
-|                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
@@ -79,27 +79,27 @@ Legend:
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                         SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                              SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
@@ -107,16 +107,16 @@ Legend:
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
-|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             TANH | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                             TANH | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
@@ -833,6 +833,7 @@ struct vk_device_struct {

    // [src/dst 0=fp32,1=fp16]
    vk_pipeline pipeline_exp[2];
+    vk_pipeline pipeline_expm1[2];
    vk_pipeline pipeline_elu[2];
    vk_pipeline pipeline_gelu[2];
    vk_pipeline pipeline_gelu_erf[2];
@@ -1202,30 +1203,35 @@ struct vk_op_glu_push_constants {
    uint32_t mode;  // 0: default, 1: swapped, 2: split
    float alpha; // for swiglu_oai
    float limit;
+    uint32_t nb00;
    uint32_t nb01;
    uint32_t nb02;
    uint32_t nb03;
-    uint32_t ne01;
-    uint32_t ne02;
+    uint32_t nb10;
    uint32_t nb11;
    uint32_t nb12;
    uint32_t nb13;
-    uint32_t ne11;
-    uint32_t ne12;
+    uint32_t nb20;
+    uint32_t nb21;
+    uint32_t nb22;
+    uint32_t nb23;
+    uint32_t ne21;
+    uint32_t ne22;
+    uint32_t misalign_offsets;
+    uint32_t ne2_012mp; uint32_t ne2_012L;
+    uint32_t ne2_01mp;  uint32_t ne2_01L;
+    uint32_t ne2_0mp;   uint32_t ne2_0L;
 };
+static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128");

 struct vk_op_unary_push_constants {
    uint32_t ne;
    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
    uint32_t misalign_offsets;
-    float param1; float param2;
-    uint32_t ne0_012mp; uint32_t ne0_012L;
-    uint32_t ne0_01mp;  uint32_t ne0_01L;
-    uint32_t ne0_0mp;   uint32_t ne0_0L;
-    uint32_t ne1_012mp; uint32_t ne1_012L;
-    uint32_t ne1_01mp;  uint32_t ne1_01L;
-    uint32_t ne1_0mp;   uint32_t ne1_0L;
+    float param1; float param2; float param3; float param4;
+    uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls;
+    uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls;
 };
 static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");

@@ -1330,6 +1336,10 @@ static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L)
    mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
 }

+static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) {
+    return L0 | (L1 << 8) | (L2 << 16);
+}
+
 template <typename T> void init_pushconst_fastdiv(T &p) {
    GGML_UNUSED(p);
    static_assert(!std::is_const<T>::value, "unexpected type");
@@ -1337,12 +1347,29 @@ template <typename T> void init_pushconst_fastdiv(T &p) {

 template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) {
    // Compute magic values to divide by these six numbers.
-    init_fastdiv_values(p.ne02*p.ne01*p.ne00,  p.ne0_012mp,    p.ne0_012L);
-    init_fastdiv_values(p.ne01*p.ne00,         p.ne0_01mp,     p.ne0_01L);
-    init_fastdiv_values(p.ne00,                p.ne0_0mp,      p.ne0_0L);
-    init_fastdiv_values(p.ne12*p.ne11*p.ne10,  p.ne1_012mp,    p.ne1_012L);
-    init_fastdiv_values(p.ne11*p.ne10,         p.ne1_01mp,     p.ne1_01L);
-    init_fastdiv_values(p.ne10,                p.ne1_0mp,      p.ne1_0L);
+    uint32_t ne0_012L;
+    uint32_t ne0_01L;
+    uint32_t ne0_0L;
+    uint32_t ne1_012L;
+    uint32_t ne1_01L;
+    uint32_t ne1_0L;
+
+    init_fastdiv_values(p.ne02*p.ne01*p.ne00,  p.ne0_012mp,    ne0_012L);
+    init_fastdiv_values(p.ne01*p.ne00,         p.ne0_01mp,     ne0_01L);
+    init_fastdiv_values(p.ne00,                p.ne0_0mp,      ne0_0L);
+    init_fastdiv_values(p.ne12*p.ne11*p.ne10,  p.ne1_012mp,    ne1_012L);
+    init_fastdiv_values(p.ne11*p.ne10,         p.ne1_01mp,     ne1_01L);
+    init_fastdiv_values(p.ne10,                p.ne1_0mp,      ne1_0L);
+
+    p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L);
+    p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L);
+}
+
+template <> void init_pushconst_fastdiv(vk_op_glu_push_constants &p) {
+    // GLU linearizes over dst, then uses dst coordinates for src0/src1.
+    init_fastdiv_values(p.ne22*p.ne21*p.ne20,  p.ne2_012mp,    p.ne2_012L);
+    init_fastdiv_values(p.ne21*p.ne20,         p.ne2_01mp,     p.ne2_01L);
+    init_fastdiv_values(p.ne20,                p.ne2_0mp,      p.ne2_0L);
 }

 struct vk_op_binary_push_constants {
@@ -5006,8 +5033,8 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
    ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

 #define CREATE_UNARY(name)  \
-    ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);  \
-    ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);  \
+    ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

    CREATE_UNARY(elu)
    CREATE_UNARY(gelu)
@@ -5030,6 +5057,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
    CREATE_UNARY(trunc)
    CREATE_UNARY(sgn)
    CREATE_UNARY(exp)
+    CREATE_UNARY(expm1)
 #undef CREATE_UNARY

    ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
@@ -8192,7 +8220,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) {
    VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
-    const int tensor_type_size = ggml_type_size(tensor->type);

    const uint32_t ne = ggml_nelements(tensor);
    std::array<uint32_t, 3> elements;
@@ -8205,14 +8232,11 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
        elements = { ne, 1, 1 };
    }

-    vk_op_unary_push_constants pc = {
-        (uint32_t)ne,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3],                       1                   , (uint32_t)tensor->ne[0]                   , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    };
+    vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne);
+    pc.nb10 = 1;
+    pc.nb11 = (uint32_t)tensor->ne[0];
+    pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]);
+    pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]);
    init_pushconst_fastdiv(pc);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
    ggml_vk_sync_buffers(ctx, subctx);
@@ -8226,7 +8250,6 @@ static void ggml_vk_cpy_to_strided(
        uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13) {
    VK_LOG_DEBUG("ggml_vk_cpy_to_strided((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
    std::cerr << "dst_nb=(" << nb10 << ", " << nb11 << ", " << nb12 << ", " << nb13 << "), buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
-    const int tensor_type_size = ggml_type_size(tensor->type);

    const uint32_t ne = ggml_nelements(tensor);
    std::array<uint32_t, 3> elements;
@@ -8239,14 +8262,11 @@ static void ggml_vk_cpy_to_strided(
        elements = { ne, 1, 1 };
    }

-    vk_op_unary_push_constants pc = {
-        (uint32_t)ne,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], nb10, nb11, nb12, nb13,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    };
+    vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne);
+    pc.nb10 = nb10;
+    pc.nb11 = nb11;
+    pc.nb12 = nb12;
+    pc.nb13 = nb13;
    init_pushconst_fastdiv(pc);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
    ggml_vk_sync_buffers(ctx, subctx);
@@ -10451,6 +10471,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
        switch (ggml_get_unary_op(dst)) {
            case GGML_UNARY_OP_EXP:
                return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16];
+            case GGML_UNARY_OP_EXPM1:
+                return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16];
            case GGML_UNARY_OP_ELU:
                return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16];
            case GGML_UNARY_OP_SILU:
@@ -10849,6 +10871,21 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
    GGML_UNUSED(src3);
 }

+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset;
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
+
+    GGML_ASSERT(a_offset < (1u << 8));
+    GGML_ASSERT(b_offset < (1u << 8));
+    GGML_ASSERT(d_offset < (1u << 8));
+
+    p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
+
+    GGML_UNUSED(src2);
+    GGML_UNUSED(src3);
+}
+
 template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
@@ -12198,17 +12235,17 @@ static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, c
 }

 static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f });
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst));
 }

 static void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY,
-        {
-            (uint32_t)ggml_nelements(src0), 0,
-            op_params[1], op_params[2], op_params[3], op_params[4]
-        }
-    );
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
+    p.param1 = op_params[1];
+    p.param2 = op_params[2];
+    p.param3 = op_params[3];
+    p.param4 = op_params[4];
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p));
 }

 static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -12228,6 +12265,9 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
    }

    const uint32_t mode = split ? 2 : (swapped ? 1 : 0);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size;
+    const uint32_t dst_type_size  = ggml_type_size(dst->type);

    ggml_vk_op_f32<vk_op_glu_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU,
        {
@@ -12237,16 +12277,22 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
            mode,
            alpha,
            limit,
-            (uint32_t)(src0->nb[1] / src0->nb[0]),
-            (uint32_t)(src0->nb[2] / src0->nb[0]),
-            (uint32_t)(src0->nb[3] / src0->nb[0]),
-            (uint32_t)src0->ne[1],
-            (uint32_t)src0->ne[2],
-            (uint32_t)(dst->nb[1] / dst->nb[0]),
-            (uint32_t)(dst->nb[2] / dst->nb[0]),
-            (uint32_t)(dst->nb[3] / dst->nb[0]),
+            (uint32_t)(src0->nb[0] / src0_type_size),
+            (uint32_t)(src0->nb[1] / src0_type_size),
+            (uint32_t)(src0->nb[2] / src0_type_size),
+            (uint32_t)(src0->nb[3] / src0_type_size),
+            (uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size),
+            (uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size),
+            (uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size),
+            (uint32_t)((split ? src1->nb[3] : src0->nb[3]) / src1_type_size),
+            (uint32_t)(dst->nb[0] / dst_type_size),
+            (uint32_t)(dst->nb[1] / dst_type_size),
+            (uint32_t)(dst->nb[2] / dst_type_size),
+            (uint32_t)(dst->nb[3] / dst_type_size),
            (uint32_t)dst->ne[1],
-            (uint32_t)dst->ne[2]
+            (uint32_t)dst->ne[2],
+            0,
+            0, 0, 0, 0, 0, 0,
        });
 }

@@ -14249,6 +14295,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
        switch (ggml_get_unary_op(node)) {
        case GGML_UNARY_OP_ELU:
        case GGML_UNARY_OP_EXP:
+        case GGML_UNARY_OP_EXPM1:
        case GGML_UNARY_OP_SILU:
        case GGML_UNARY_OP_GELU:
        case GGML_UNARY_OP_GELU_ERF:
@@ -16638,6 +16685,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_EXPM1:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_ERF:
@@ -16658,8 +16706,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                case GGML_UNARY_OP_FLOOR:
                case GGML_UNARY_OP_TRUNC:
                case GGML_UNARY_OP_SGN:
-                    return ggml_is_contiguous(op->src[0]) &&
-                           (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
+                    return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                           (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
                           (op->src[0]->type == op->type);
                default:
@@ -16675,7 +16722,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                case GGML_GLU_OP_GEGLU_QUICK:
                    return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                           (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
-                           (op->src[0]->type == op->type);
+                           (op->src[0]->type == op->type) &&
+                           (!op->src[1] || op->src[1]->type == op->src[0]->type);
                default:
                    return false;
            }
@@ -17805,6 +17853,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
            case GGML_UNARY_OP_EXP:
                tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
                break;
+            case GGML_UNARY_OP_EXPM1:
+                tensor_clone = ggml_expm1(ggml_ctx, src_clone[0]);
+                break;
            case GGML_UNARY_OP_ELU:
                tensor_clone = ggml_elu(ggml_ctx, src_clone[0]);
                break;
@@ -1,21 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    data_d[i] = D_TYPE(abs(float(data_a[i])));
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(ceil(x));
-}
@@ -12,11 +12,11 @@ void main() {
        return;
    }

-    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;

    if (i10 == i11) {
@@ -1,27 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    float x = float(data_a[i]);
-
-    if (x < 0.0f) {
-        x = exp(x) - 1;
-    }
-
-    data_d[i] = D_TYPE(x);
-}
@@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-    data_d[i] = D_TYPE(exp(float(data_a[i])));
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(floor(x));
-}
@@ -1,25 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float xi = float(data_a[i]);
-    const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
-    data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
-}
@@ -1,39 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
-    // ref: https://www.johndcook.com/blog/python_erf/
-    const float p_erf  = 0.3275911f;
-    const float a1_erf = 0.254829592f;
-    const float a2_erf = -0.284496736f;
-    const float a3_erf = 1.421413741f;
-    const float a4_erf = -1.453152027f;
-    const float a5_erf = 1.061405429f;
-
-    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float a = float(data_a[i]);
-    const float a_div_sqr2 = a * SQRT_2_INV;
-    const float sign_x = sign(a_div_sqr2);
-    const float x = abs(a_div_sqr2);
-    const float t = 1.0f / (1.0f + p_erf * x);
-    const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
-    const float erf_approx = sign_x * y;
-
-    data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx));
-}
@@ -1,23 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const float GELU_QUICK_COEF = -1.702f;
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
-}
@@ -7,14 +7,12 @@ layout (push_constant) uniform parameter
    uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
    uint misalign_offsets;
-    float param1; float param2;
+    float param1; float param2; float param3; float param4;

-    uint ne0_012mp; uint ne0_012L;
-    uint ne0_01mp;  uint ne0_01L;
-    uint ne0_0mp;   uint ne0_0L;
-    uint ne1_012mp; uint ne1_012L;
-    uint ne1_01mp;  uint ne1_01L;
-    uint ne1_0mp;   uint ne1_0L;
+    // The three L values are packed as bytes to keep this layout under the 128B
+    // push constant limit while still leaving room for four float parameters.
+    uint ne0_012mp; uint ne0_01mp;  uint ne0_0mp;  uint ne0_Ls;
+    uint ne1_012mp; uint ne1_01mp;  uint ne1_0mp;  uint ne1_Ls;
 } p;

 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -42,42 +40,46 @@ uint fastdiv(uint n, uint mp, uint L) {
    return (msbs + n) >> L;
 }

+uint fastdiv_L(uint packed, uint slot) {
+    return (packed >> (slot * 8)) & 0x3Fu;
+}
+
 uint src0_idx(uint idx) {
-    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
+    const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0));
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1));
    const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2));
    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
 }

 uint dst_idx(uint idx) {
-    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
 }

 uint src0_idx_quant(uint idx, uint qk) {
-    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
+    const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0));
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1));
    const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2));
    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00;
 }

 uint dst_idx_quant(uint idx, uint qk) {
-    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10;
 }
@@ -15,14 +15,33 @@ layout (push_constant) uniform parameter
    uint mode;
    float alpha;
    float limit;
+    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
-    uint ne01;
-    uint ne02;
+    uint nb10;
    uint nb11;
    uint nb12;
    uint nb13;
-    uint ne11;
-    uint ne12;
+    uint nb20;
+    uint nb21;
+    uint nb22;
+    uint nb23;
+    uint ne21;
+    uint ne22;
+    uint misalign_offsets;
+    uint ne2_012mp; uint ne2_012L;
+    uint ne2_01mp;  uint ne2_01L;
+    uint ne2_0mp;   uint ne2_0L;
 } p;
+
+uint get_aoffset() { return p.misalign_offsets >> 16; }
+uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
+uint get_doffset() { return p.misalign_offsets & 0xFF; }
+
+// see init_fastdiv_values in ggml-vulkan.cpp
+uint fastdiv(uint n, uint mp, uint L) {
+    uint msbs, lsbs;
+    umulExtended(n, mp, msbs, lsbs);
+    return (msbs + n) >> L;
+}
@@ -5,35 +5,31 @@ void main() {
        return;
    }

-    const uint row = i / p.ne20;
-    const uint col = i - row * p.ne20;
+    const uint i23 = fastdiv(i, p.ne2_012mp, p.ne2_012L);
+    const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20;
+    const uint i22 = fastdiv(i - i23_offset, p.ne2_01mp, p.ne2_01L);
+    const uint i22_offset = i22*p.ne21*p.ne20;
+    const uint i21 = fastdiv(i - i23_offset - i22_offset, p.ne2_0mp, p.ne2_0L);
+    const uint i20 = i - i23_offset - i22_offset - i21*p.ne20;

-    const uint i3 = row / (p.ne01 * p.ne02);
-    const uint i2 = (row % (p.ne01 * p.ne02)) / p.ne01;
-    const uint i1 = row % p.ne01;
-    const uint src_idx = i3 * p.nb03 + i2 * p.nb02 + i1 * p.nb01 + col;
-
-    const uint dst_i3 = row / (p.ne11 * p.ne12);
-    const uint dst_i2 = (row % (p.ne11 * p.ne12)) / p.ne11;
-    const uint dst_i1 = row % p.ne11;
-    const uint dst_idx = dst_i3 * p.nb13 + dst_i2 * p.nb12 + dst_i1 * p.nb11 + col;
+    const uint src_idx_a = get_aoffset() + i23 * p.nb03 + i22 * p.nb02 + i21 * p.nb01 + i20 * p.nb00;
+    const uint src_idx_b = get_boffset() + i23 * p.nb13 + i22 * p.nb12 + i21 * p.nb11 + i20 * p.nb10;
+    const uint dst_idx = get_doffset() + i23 * p.nb23 + i22 * p.nb22 + i21 * p.nb21 + i20 * p.nb20;

    if (p.mode == 0) {
        // Default
-        const uint offset = p.ne00 / 2;
-        const uint idx = src_idx;
+        const uint offset = (p.ne00 / 2) * p.nb00;
+        const uint idx = src_idx_a;

        data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset])));
    } else if (p.mode == 1) {
        // Swapped
-        const uint offset = p.ne00 / 2;
-        const uint idx = src_idx;
+        const uint offset = (p.ne00 / 2) * p.nb00;
+        const uint idx = src_idx_a;

        data_d[dst_idx] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx])));
    } else {
        // Split
-        const uint idx = src_idx;
-
-        data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx])));
+        data_d[dst_idx] = D_TYPE(op(float(data_a[src_idx_a]), float(data_b[src_idx_b])));
    }
 }
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)));
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)));
-}
@@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-    data_d[i] = D_TYPE(-float(data_a[i]));
-}
@@ -1,21 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    data_d[i] = D_TYPE(max(float(data_a[i]), 0));
-}
@@ -13,11 +13,11 @@ void main() {
    }

    // Destination multi-index (inlined dst_idx)
-    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
-    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i12_offset = i12*p.ne11*p.ne10;
-    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
    const uint d_idx = i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;

@@ -20,11 +20,11 @@ void main() {
        return;
    }

-    const uint i3 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i3 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0));
    const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
-    const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1));
    const uint i2_offset = i2*p.ne11*p.ne10;
-    const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2));
    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;

    const uint p1 = floatBitsToUint(p.param1);
@@ -1,29 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    float result;
-    // Round halfway cases away from zero as roundf does.
-    if (x >= 0.0) {
-        result = floor(x + 0.5);
-    } else {
-        result = ceil(x - 0.5);
-    }
-    data_d[i] = D_TYPE(result);
-}
@@ -1,21 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    data_d[i] = D_TYPE(sign(float(data_a[i])));
-}
@@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-    data_d[i] = D_TYPE(1. / (1 + exp(-1. * float(data_a[i]))));
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float xi = float(data_a[i]);
-    data_d[i] = D_TYPE(xi / (1.0f + exp(-xi)));
-}
@@ -1,23 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    const float result = (x > 20.0f) ? x : log(1.0f + exp(x));
-    data_d[i] = D_TYPE(result);
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(x >= 0.0f ? 1.0f : 0.0f);
-}
@@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-    data_d[i] = D_TYPE(1. - 2. / (exp(2.*float(data_a[i])) + 1.));
-}
@@ -17,11 +17,11 @@ void main() {
        return;
    }

-    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
+    const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0));
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
-    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1));
    const uint i02_offset = i02*p.ne01*p.ne00;
-    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2));
    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;

    int param = floatBitsToInt(p.param1);
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(trunc(x));
-}
@@ -0,0 +1,144 @@
+#version 450
+
+#include "types.glsl"
+#include "generic_unary_head.glsl"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+float op_abs(float x) {
+    return abs(x);
+}
+
+float op_sgn(float x) {
+    return sign(x);
+}
+
+float op_neg(float x) {
+    return -x;
+}
+
+float op_step(float x) {
+    return x >= 0.0f ? 1.0f : 0.0f;
+}
+
+float op_tanh(float x) {
+    return 1.0f - 2.0f / (exp(2.0f*x) + 1.0f);
+}
+
+float op_elu(float x) {
+    return x < 0.0f ? exp(x) - 1.0f : x;
+}
+
+float op_relu(float x) {
+    return max(x, 0.0f);
+}
+
+float op_sigmoid(float x) {
+    return 1.0f / (1.0f + exp(-x));
+}
+
+float op_gelu(float x) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const float val = SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x);
+    return 0.5f*x*(2.0f - 2.0f / (exp(2.0f * val) + 1.0f));
+}
+
+float op_gelu_quick(float x) {
+    const float GELU_QUICK_COEF = -1.702f;
+    return x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)));
+}
+
+float op_silu(float x) {
+    return x / (1.0f + exp(-x));
+}
+
+float op_hardswish(float x) {
+    return x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f));
+}
+
+float op_hardsigmoid(float x) {
+    return min(1.0f, max(0.0f, (x + 3.0f) / 6.0f));
+}
+
+float op_exp(float x) {
+    return exp(x);
+}
+
+float op_expm1(float x) {
+    // exp(x) - 1 loses many ulps to cancellation near zero.  Use a degree-6
+    // Taylor expansion for |x| <= 1/4: the omitted x^7/5040 term is < 1.3e-8,
+    // about 0.5 ulp at expm1(0.25), and a host-side f32 model stays within
+    // 2 ulps over the interval.  The first native exp(x)-1 values outside the
+    // cutoff are about 1 ulp for +0.25 and 2 ulps for -0.25.
+    if (abs(x) <= 0.25f) {
+        return x * (1.0f + x * (0.5f + x * ((1.0f/6.0f) + x * ((1.0f/24.0f) + x * ((1.0f/120.0f) + x * (1.0f/720.0f))))));
+    }
+    return exp(x) - 1.0f;
+}
+
+float op_softplus(float x) {
+    return (x > 20.0f) ? x : log(1.0f + exp(x));
+}
+
+float op_gelu_erf(float a) {
+    // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
+    const float p_erf  = 0.3275911f;
+    const float a1_erf = 0.254829592f;
+    const float a2_erf = -0.284496736f;
+    const float a3_erf = 1.421413741f;
+    const float a4_erf = -1.453152027f;
+    const float a5_erf = 1.061405429f;
+
+    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
+    const float a_div_sqr2 = a * SQRT_2_INV;
+    const float sign_x = sign(a_div_sqr2);
+    const float x = abs(a_div_sqr2);
+    const float t = 1.0f / (1.0f + p_erf * x);
+    const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
+    return 0.5f * a * (1.0f + sign_x * y);
+}
+
+float op_xielu(float x) {
+    const float alpha_n = p.param1;
+    const float alpha_p = p.param2;
+    const float beta = p.param3;
+    const float eps = p.param4;
+
+    if (x > 0.0f) {
+        return alpha_p * x * x + beta * x;
+    }
+
+    const float min_x_eps = min(x, eps);
+    return (op_expm1(min_x_eps) - x) * alpha_n + beta * x;
+}
+
+float op_floor(float x) {
+    return floor(x);
+}
+
+float op_ceil(float x) {
+    return ceil(x);
+}
+
+float op_round(float x) {
+    // Round halfway cases away from zero as roundf does.
+    return x >= 0.0f ? floor(x + 0.5f) : ceil(x - 0.5f);
+}
+
+float op_trunc(float x) {
+    return trunc(x);
+}
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint a_idx = get_aoffset() + src0_idx(idx);
+    const uint d_idx = get_doffset() + dst_idx(idx);
+
+    data_d[d_idx] = D_TYPE(OP(float(data_a[a_idx])));
+}
@@ -868,47 +868,49 @@ void process_shaders() {

    string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});

-    string_to_spv("exp_f16",        "exp.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("exp_f32",        "exp.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("exp_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_exp"}});
+    string_to_spv("exp_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_exp"}});
+    string_to_spv("expm1_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_expm1"}});
+    string_to_spv("expm1_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_expm1"}});

    string_to_spv("log_f16",        "log.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("log_f32",        "log.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("gelu_f16",       "gelu.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("gelu_f32",       "gelu.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("gelu_erf_f16",   "gelu_erf.comp",    {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("gelu_erf_f32",   "gelu_erf.comp",    {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("gelu_quick_f16", "gelu_quick.comp",  {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("gelu_quick_f32", "gelu_quick.comp",  {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("silu_f16",       "silu.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("silu_f32",       "silu.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("relu_f16",       "relu.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("relu_f32",       "relu.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("neg_f16",        "neg.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("neg_f32",        "neg.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("tanh_f16",       "tanh.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("tanh_f32",       "tanh.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("sigmoid_f16",    "sigmoid.comp",     {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("sigmoid_f32",    "sigmoid.comp",     {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("hardsigmoid_f16","hardsigmoid.comp", {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("hardsigmoid_f32","hardsigmoid.comp", {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("hardswish_f16",  "hardswish.comp",   {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("hardswish_f32",  "hardswish.comp",   {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("abs_f16",        "abs.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("abs_f32",        "abs.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("elu_f16",        "elu.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("elu_f32",        "elu.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("xielu_f16",      "xielu.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("xielu_f32",      "xielu.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("sgn_f16",        "sgn.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("sgn_f32",        "sgn.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("gelu_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_gelu"}});
+    string_to_spv("gelu_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_gelu"}});
+    string_to_spv("gelu_erf_f16",   "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_gelu_erf"}});
+    string_to_spv("gelu_erf_f32",   "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_gelu_erf"}});
+    string_to_spv("gelu_quick_f16", "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_gelu_quick"}});
+    string_to_spv("gelu_quick_f32", "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_gelu_quick"}});
+    string_to_spv("silu_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_silu"}});
+    string_to_spv("silu_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_silu"}});
+    string_to_spv("relu_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_relu"}});
+    string_to_spv("relu_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_relu"}});
+    string_to_spv("neg_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_neg"}});
+    string_to_spv("neg_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_neg"}});
+    string_to_spv("tanh_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_tanh"}});
+    string_to_spv("tanh_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_tanh"}});
+    string_to_spv("sigmoid_f16",    "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_sigmoid"}});
+    string_to_spv("sigmoid_f32",    "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_sigmoid"}});
+    string_to_spv("hardsigmoid_f16","unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_hardsigmoid"}});
+    string_to_spv("hardsigmoid_f32","unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_hardsigmoid"}});
+    string_to_spv("hardswish_f16",  "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_hardswish"}});
+    string_to_spv("hardswish_f32",  "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_hardswish"}});
+    string_to_spv("abs_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_abs"}});
+    string_to_spv("abs_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_abs"}});
+    string_to_spv("elu_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_elu"}});
+    string_to_spv("elu_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_elu"}});
+    string_to_spv("xielu_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_xielu"}});
+    string_to_spv("xielu_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_xielu"}});
+    string_to_spv("sgn_f16",        "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_sgn"}});
+    string_to_spv("sgn_f32",        "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_sgn"}});

    string_to_spv("tri_f16",        "tri.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("tri_f32",        "tri.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
    string_to_spv("diag_f16",       "diag.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("diag_f32",       "diag.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});

-    string_to_spv("softplus_f16",   "softplus.comp",    {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("softplus_f32",   "softplus.comp",    {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("softplus_f16",   "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_softplus"}});
+    string_to_spv("softplus_f32",   "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_softplus"}});

    string_to_spv("add1_f16_f16",   "add1.comp",        {{"A_TYPE", "float16_t"},   {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
    string_to_spv("add1_f16_f32",   "add1.comp",        {{"A_TYPE", "float16_t"},   {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
@@ -916,16 +918,16 @@ void process_shaders() {
    string_to_spv("arange_f32",     "arange.comp",      {{"A_TYPE", "float"},       {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    string_to_spv("fill_f32",       "fill.comp",        {{"D_TYPE", "float"},       {"FLOAT_TYPE", "float"}});
    string_to_spv("fill_f16",       "fill.comp",        {{"D_TYPE", "float16_t"},   {"FLOAT_TYPE", "float"}});
-    string_to_spv("step_f16",       "step.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("step_f32",       "step.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("round_f16",      "round.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("round_f32",      "round.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("ceil_f16",       "ceil.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("ceil_f32",       "ceil.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("floor_f16",      "floor.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("floor_f32",      "floor.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("trunc_f16",      "trunc.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("trunc_f32",      "trunc.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("step_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_step"}});
+    string_to_spv("step_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_step"}});
+    string_to_spv("round_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_round"}});
+    string_to_spv("round_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_round"}});
+    string_to_spv("ceil_f16",       "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_ceil"}});
+    string_to_spv("ceil_f32",       "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_ceil"}});
+    string_to_spv("floor_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_floor"}});
+    string_to_spv("floor_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_floor"}});
+    string_to_spv("trunc_f16",      "unary.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}, {"OP", "op_trunc"}});
+    string_to_spv("trunc_f32",      "unary.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"},     {"OP", "op_trunc"}});

    string_to_spv("geglu_f16",      "geglu.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("geglu_f32",      "geglu.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
@@ -1,35 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    float x = float(data_a[i]);
-
-    float alpha_n = p.param1;
-    float alpha_p = p.param2;
-    float beta = p.param3;
-    float eps = p.param4;
-
-    if (x > 0.0f) {
-        x = alpha_p * x * x + beta * x;
-    } else {
-        const float min_x_eps = min(x, eps);
-        x = (exp(min_x_eps) - 1 - x) * alpha_n + beta * x;
-    }
-
-    data_d[i] = D_TYPE(x);
-}
@@ -457,6 +457,7 @@ class MODEL_ARCH(IntEnum):
    XVERSE           = auto()
    COMMAND_R        = auto()
    COHERE2          = auto()
+    COHERE2MOE       = auto()
    DBRX             = auto()
    OLMO             = auto()
    OLMO2            = auto()
@@ -1012,6 +1013,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.XVERSE:           "xverse",
    MODEL_ARCH.COMMAND_R:        "command-r",
    MODEL_ARCH.COHERE2:          "cohere2",
+    MODEL_ARCH.COHERE2MOE:       "cohere2moe",
    MODEL_ARCH.DBRX:             "dbrx",
    MODEL_ARCH.OLMO:             "olmo",
    MODEL_ARCH.OLMO2:            "olmo2",
@@ -2872,6 +2874,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.COHERE2MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_GATE_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
+    ],
    MODEL_ARCH.DBRX: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -16,23 +16,7 @@ set(HF_VERSION        "" CACHE STRING "Version to download (empty = resolve from
 set(HF_ENABLED        "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)")
 set(BUILD_UI          "" CACHE STRING "Build UI via npm (ON/OFF)")
 set(LLAMA_UI_EMBED    "" CACHE STRING "Path to llama-ui-embed helper")
-
-# IMPORTANT: When adding PWA assets, sync:
-#   - tools/ui/src/lib/constants/pwa.ts   (APPLE_DEVICES, PWA_MANIFEST)
-#
-# The HTTP server registers routes and public endpoints for every embedded asset.
-set(REQUIRED_ASSETS
-    index.html
-    loading.html
-    manifest.webmanifest
-    sw.js
-    build.json
-    # post-build.js flattens and dehashes these to fixed names in the dist dir
-    bundle.js
-    bundle.css
-    workbox.js
-    version.json
-)
+set(LLAMA_UI_GZIP     "" CACHE STRING "Apply gzip compress to assets to save bandwidth")

 set(DIST_DIR     "${UI_BINARY_DIR}/dist")
 set(SRC_DIST_DIR "${UI_SOURCE_DIR}/dist")
@@ -40,22 +24,10 @@ set(STAMP_FILE   "${UI_BINARY_DIR}/.ui-stamp")
 set(UI_CPP       "${UI_BINARY_DIR}/ui.cpp")
 set(UI_H         "${UI_BINARY_DIR}/ui.h")

-function(assets_present dir out_var)
-    set(present TRUE)
-    foreach(asset ${REQUIRED_ASSETS})
-        if(NOT EXISTS "${dir}/${asset}")
-            set(present FALSE)
-            break()
-        endif()
-    endforeach()
-    set(${out_var} ${present} PARENT_SCOPE)
-endfunction()
-
 function(npm_build_should_skip out_var)
    set(${out_var} FALSE PARENT_SCOPE)

-    assets_present("${DIST_DIR}" present)
-    if(NOT present)
+    if(NOT EXISTS "${DIST_DIR}/index.html")
        return()
    endif()

@@ -162,8 +134,7 @@ function(npm_build out_var)
        return()
    endif()

-    assets_present("${DIST_DIR}" present)
-    if(NOT present)
+    if(NOT EXISTS "${DIST_DIR}/index.html")
        message(STATUS "UI: npm build finished but assets missing in ${DIST_DIR}")
        return()
    endif()
@@ -242,8 +213,7 @@ function(hf_download version out_var out_resolved)

        file(ARCHIVE_EXTRACT INPUT "${archive}" DESTINATION "${DIST_DIR}")

-        assets_present("${DIST_DIR}" present)
-        if(NOT present)
+        if(NOT EXISTS "${DIST_DIR}/index.html")
            message(STATUS "UI: archive from ${resolved} is missing required assets")
            continue()
        endif()
@@ -256,11 +226,35 @@ function(hf_download version out_var out_resolved)
 endfunction()

 function(emit_files dist_dir)
-    assets_present("${dist_dir}" present)
+    # If gzip is requested, compress every asset into a parallel _gzip/ tree
+    # the structure stays the same; for ex: /abc/def --> /_gzip/abc/def
+    # embed.cpp will check for _gzip and will pick it up
+    if(LLAMA_UI_GZIP AND EXISTS "${dist_dir}/index.html")
+        find_program(GZIP_EXECUTABLE gzip)
+        if(NOT GZIP_EXECUTABLE)
+            message(WARNING "UI: LLAMA_UI_GZIP requested but gzip not found, embedding uncompressed")
+        else()
+            set(gzip_dir "${dist_dir}/_gzip")
+            file(REMOVE_RECURSE "${gzip_dir}")
+            file(GLOB_RECURSE all_files RELATIVE "${dist_dir}" "${dist_dir}/*")
+            foreach(f ${all_files})
+                get_filename_component(dst_dir "${gzip_dir}/${f}" DIRECTORY)
+                file(MAKE_DIRECTORY "${dst_dir}")
+                execute_process(
+                    COMMAND "${GZIP_EXECUTABLE}" -c "${dist_dir}/${f}"
+                    OUTPUT_FILE "${gzip_dir}/${f}"
+                    RESULT_VARIABLE gz_rc
+                )
+                if(NOT gz_rc EQUAL 0)
+                    message(FATAL_ERROR "UI: gzip failed for ${f}")
+                endif()
+            endforeach()
+            message(STATUS "UI: gzip compression applied (${gzip_dir})")
+        endif()
+    endif()

    set(args "${UI_CPP}" "${UI_H}")
-    if(present)
-        # llama-ui-embed embeds every top-level file in dist_dir
+    if(EXISTS "${dist_dir}/index.html")
        list(APPEND args "${dist_dir}")
    endif()

@@ -276,8 +270,7 @@ endfunction()
 # ---------------------------------------------------------------------------
 # 1. Priority 1: pre-built assets supplied in tools/ui/dist
 # ---------------------------------------------------------------------------
-assets_present("${SRC_DIST_DIR}" SRC_OK)
-if(SRC_OK)
+if(EXISTS "${SRC_DIST_DIR}/index.html")
    message(STATUS "UI: using pre-built assets from ${SRC_DIST_DIR}")
    emit_files("${SRC_DIST_DIR}")
    return()
@@ -312,7 +305,10 @@ if(NOT provisioned AND HF_ENABLED)
        endif()
    endif()

-    assets_present("${DIST_DIR}" have_assets)
+    set(have_assets FALSE)
+    if(EXISTS "${DIST_DIR}/index.html")
+        set(have_assets TRUE)
+    endif()
    if(stamp_ok AND have_assets)
        message(STATUS "UI: HF stamp '${stamped}' matches version, skipping HF fetch")
        set(provisioned TRUE)
@@ -332,8 +328,7 @@ endif()
 # 4. Fallback: warn about stale or missing assets, then emit whatever we have
 # ---------------------------------------------------------------------------
 if(NOT provisioned)
-    assets_present("${DIST_DIR}" have_assets)
-    if(have_assets)
+    if(EXISTS "${DIST_DIR}/index.html")
        message(WARNING "UI: provisioning failed; embedding stale assets from ${DIST_DIR}")
    else()
        message(WARNING "UI: no assets available - building without an embedded UI. "
@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_XVERSE,           "xverse"           },
    { LLM_ARCH_COMMAND_R,        "command-r"        },
    { LLM_ARCH_COHERE2,          "cohere2"          },
+    { LLM_ARCH_COHERE2MOE,       "cohere2moe"       },
    { LLM_ARCH_DBRX,             "dbrx"             },
    { LLM_ARCH_OLMO,             "olmo"             },
    { LLM_ARCH_OLMO2,            "olmo2"            },
@@ -71,6 +71,7 @@ enum llm_arch {
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_COHERE2,
+    LLM_ARCH_COHERE2MOE,
    LLM_ARCH_DBRX,
    LLM_ARCH_OLMO,
    LLM_ARCH_OLMO2,
@@ -18,6 +18,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) {
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_COHERE2:
+        case LLM_ARCH_COHERE2MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_T5:
@@ -157,6 +157,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
            return new llama_model_command_r(params);
        case LLM_ARCH_COHERE2:
            return new llama_model_cohere2(params);
+        case LLM_ARCH_COHERE2MOE:
+            return new llama_model_cohere2moe(params);
        case LLM_ARCH_DBRX:
            return new llama_model_dbrx(params);
        case LLM_ARCH_OLMO:
@@ -1467,9 +1469,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
    }
    ml.done_getting_tensors();

+    // Tied NVFP4 output is valid when no separate LM-head scale tensors are present.
+    // If sidecar scales exist, the output weight must be an actual output tensor.
    GGML_ASSERT(!(output && tok_embd &&
            strcmp(output->name, tok_embd->name) == 0 &&
-            output->type == GGML_TYPE_NVFP4));
+            output->type == GGML_TYPE_NVFP4 &&
+            (output_s || output_in_s)));
    // populate tensors_by_name
    for (auto & [_, ctx_ptr] : ml.ctx_map) {
        for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
@@ -1844,6 +1849,7 @@ void llama_model::print_info() const {
        }

        if (arch == LLM_ARCH_MELLUM ||
+                arch == LLM_ARCH_COHERE2MOE ||
                arch == LLM_ARCH_QWEN3MOE ||
                arch == LLM_ARCH_OPENAI_MOE ||
                arch == LLM_ARCH_QWEN3VLMOE ||
@@ -2389,6 +2395,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
+        case LLM_ARCH_COHERE2MOE:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
@@ -122,9 +122,9 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par
        // feed-forward network
        {
            cur = build_ffn(ffn_inp,
-                    model.layers[il].ffn_up, NULL, NULL,
-                    model.layers[il].ffn_gate, NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL,
+                    model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+                    model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
+                    model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        }
@@ -0,0 +1,443 @@
+#include "models.h"
+
+void llama_model_cohere2moe::load_arch_hparams(llama_model_loader & ml) {
+    const bool found_norm     = ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps,     false);
+    const bool found_norm_rms = ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+    if (!found_norm && !found_norm_rms) {
+        throw std::runtime_error("missing Cohere2 MoE norm epsilon");
+    }
+    if (!found_norm_rms) {
+        hparams.f_norm_rms_eps = 0.0f;
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+    ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer");
+
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+    }
+
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    uint32_t swa_period = 4;
+    if (ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false)) {
+        hparams.set_swa_pattern(swa_period, true);
+    } else {
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
+    }
+
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
+    switch (hparams.n_layer()) {
+        case 49: type = LLM_TYPE_30B_A3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_cohere2moe::load_arch_tensors(llama_model_loader & ml) {
+    LLAMA_LOAD_LOCALS;
+
+    const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr);
+    // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP
+    // tensors live in a separate file. Mark MTP tensors NOT_REQUIRED so the
+    // trunk loads cleanly.
+    const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight";
+    const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr);
+    const int trunk_flags = mtp_only  ? TENSOR_NOT_REQUIRED : 0;
+    const int mtp_flags   = trunk_only ? TENSOR_NOT_REQUIRED : 0;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+    }
+
+    if (n_expert == 0) {
+        throw std::runtime_error("n_expert must be > 0 for Cohere2Moe");
+    }
+    if (n_expert_used == 0) {
+        throw std::runtime_error("n_expert_used must be > 0 for Cohere2Moe");
+    }
+
+    auto load_block_trunk = [&](int i, int flags) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, flags);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+        if (static_cast<uint32_t>(i) < hparams.n_layer_dense_lead) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
+        } else {
+            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff;
+
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, flags);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+            create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, flags);
+
+            if (hparams.n_expert_shared > 0) {
+                const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp * hparams.n_expert_shared;
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), { n_embd, n_ff_shexp }, flags);
+            }
+        }
+    };
+
+    auto load_block_mtp = [&](int i, int flags) {
+        auto & layer = layers[i];
+
+        // MTP block looks like a full-attention Cohere2 MoE decoder block.
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, flags);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff;
+
+        // Routed experts
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, flags);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, flags);
+
+        if (hparams.n_expert_shared > 0) {
+            const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp * hparams.n_expert_shared;
+
+            // Shared experts
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), { n_embd, n_ff_shexp }, flags);
+        }
+
+        // NextN-specific tensors that define the MTP block.
+        layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ,          "weight", i), { 2 * n_embd, n_embd }, flags);
+        layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,            "weight", i), { n_embd },              flags);
+        layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,            "weight", i), { n_embd },              flags);
+        layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), { n_embd, n_vocab },     TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab },     TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd },              TENSOR_NOT_REQUIRED);
+    };
+
+    for (int i = 0; i < n_layer; ++i) {
+        load_block_trunk(i, trunk_flags);
+    }
+    // MTP/NextN layers are loaded as extra decoder blocks.
+    for (int i = n_layer; i < n_layer_all; ++i) {
+        load_block_mtp(i, mtp_flags);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_cohere2moe::build_arch_graph(const llm_graph_params & params) const {
+    if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
+        return std::make_unique<graph_mtp>(*this, params);
+    }
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_cohere2moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);
+
+    const llm_norm_type cohere2moe_norm_type = hparams.f_norm_rms_eps == 0.0f ? LLM_NORM : LLM_NORM_RMS;
+    const float f_logit_scale = hparams.f_logit_scale;
+    ggml_tensor * cur;
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv_iswa();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
+    for (int il = 0; il < n_layer; ++il) {
+        const bool is_swa = hparams.is_swa(il);
+        // Dense-prefix full-attention layers use RoPE; later layers follow the SWA pattern.
+        const bool force_rope = static_cast<uint32_t>(il) < hparams.n_layer_dense_lead;
+
+        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, cohere2moe_norm_type, il);
+        cb(cur, "attn_norm", il);
+
+        ggml_tensor * ffn_inp = cur;
+
+        {
+            const auto & layer = model.layers[il];
+
+            auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur,
+                    n_embd_head, n_head, n_head_kv, il);
+
+            if (is_swa || force_rope) {
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    layer.wo, layer.wo_b, layer.wo_s,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+                    1.0f / sqrtf(float(n_embd_head)), il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
+            cur     = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpL    = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+        }
+
+        ggml_tensor * attn_out = cur;
+
+        const auto & layer = model.layers[il];
+
+        if (layer.ffn_gate_inp == nullptr) {
+            cur = build_ffn(ffn_inp,
+                    layer.ffn_up,   nullptr, layer.ffn_up_s,
+                    layer.ffn_gate, nullptr, layer.ffn_gate_s,
+                    layer.ffn_down, nullptr, layer.ffn_down_s,
+                    nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            cur = build_moe_ffn(ffn_inp,
+                    layer.ffn_gate_inp,
+                    layer.ffn_up_exps,
+                    layer.ffn_gate_exps,
+                    layer.ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
+                    hparams.expert_weights_scale,
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il,
+                    nullptr, layer.ffn_gate_up_exps,
+                    layer.ffn_up_exps_s,
+                    layer.ffn_gate_exps_s,
+                    layer.ffn_down_exps_s);
+            cb(cur, "ffn_moe_out", il);
+
+            if (layer.ffn_up_shexp) {
+                ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                        layer.ffn_up_shexp,   nullptr, layer.ffn_up_shexp_s,
+                        layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s,
+                        layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s,
+                        nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, cur, ffn_shexp);
+                cur = ggml_scale(ctx0, cur, 0.5f);
+                cb(cur, "ffn_out", il);
+            }
+        }
+
+        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx0, cur, attn_out);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        inpL = cur;
+    }
+
+    cur = inpL;
+    cur = build_norm(cur, model.output_norm, nullptr, cohere2moe_norm_type, -1);
+
+    cb(cur, "h_nextn", -1);
+    res->t_h_nextn = cur;
+
+    if (!cparams.embeddings_nextn_masked && inp_out_ids) {
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+
+    if (f_logit_scale) {
+        cur = ggml_scale(ctx0, cur, f_logit_scale);
+    }
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+llama_model_cohere2moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    GGML_ASSERT(hparams.n_layer_nextn > 0 && "COHERE2MOE MTP requires n_layer_nextn > 0");
+    GGML_ASSERT(hparams.n_layer_nextn == 1 && "COHERE2MOE MTP currently only supports a single MTP block");
+
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);
+
+    const int il = hparams.n_layer();
+    const auto & layer = model.layers[il];
+    GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
+    GGML_ASSERT(layer.nextn.enorm   && "MTP block missing nextn.enorm");
+    GGML_ASSERT(layer.nextn.hnorm   && "MTP block missing nextn.hnorm");
+    GGML_ASSERT(layer.ffn_gate_inp  && "MTP block missing ffn_gate_inp");
+
+    const llm_norm_type cohere2moe_norm_type = hparams.f_norm_rms_eps == 0.0f ? LLM_NORM : LLM_NORM_RMS;
+
+    // TODO: extract in a common llm_graph_context::build_inp_embd_h()
+    auto inp = std::make_unique<llm_graph_input_embd_h>(hparams.n_embd);
+
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->tokens);
+
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens);
+    ggml_set_input(inp->embd);
+
+    // TODO: make static using `ggml_build_forward_select()`
+    //       see llm_graph_context::build_inp_embd() for reference
+    ggml_tensor * tok_embd;
+    if (ubatch.token) {
+        ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
+        tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+    } else {
+        tok_embd = inp->embd;
+    }
+    cb(tok_embd, "mtp_tok_embd", il);
+
+    inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->h);
+    ggml_set_name(inp->h, "mtp_h_input");
+
+    ggml_tensor * h_embd = inp->h;
+
+    res->add_input(std::move(inp));
+
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn = build_attn_inp_kv_iswa();
+
+    ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, cohere2moe_norm_type, il);
+    cb(h_norm, "mtp_hnorm", il);
+
+    ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, cohere2moe_norm_type, il);
+    cb(e_norm, "mtp_enorm", il);
+
+    ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
+    cb(concat, "mtp_concat", il);
+
+    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
+    cb(cur, "mtp_eh_proj", il);
+
+    ggml_tensor * inpL = cur;
+
+    cur = build_norm(cur, layer.attn_norm, nullptr, cohere2moe_norm_type, il);
+    cb(cur, "mtp_attn_norm", il);
+    ggml_tensor * ffn_inp = cur;
+
+    auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur, n_embd_head, n_head, n_head_kv, il);
+    ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+    Qcur = ggml_rope_ext(
+            ctx0, Qcur, inp_pos, rope_factors,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+    Kcur = ggml_rope_ext(
+            ctx0, Kcur, inp_pos, rope_factors,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow);
+
+    cb(Qcur, "mtp_Qcur", il);
+    cb(Kcur, "mtp_Kcur", il);
+    cb(Vcur, "mtp_Vcur", il);
+
+    cur = build_attn(inp_attn,
+            layer.wo, layer.wo_b, layer.wo_s,
+            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+            1.0f / sqrtf(float(n_embd_head)), il);
+    cb(cur, "mtp_attn_out", il);
+
+    ggml_tensor * attn_out = cur;
+
+    cur = build_moe_ffn(ffn_inp,
+            layer.ffn_gate_inp,
+            layer.ffn_up_exps,
+            layer.ffn_gate_exps,
+            layer.ffn_down_exps,
+            nullptr,
+            n_expert, n_expert_used,
+            LLM_FFN_SILU, hparams.expert_weights_norm,
+            hparams.expert_weights_scale,
+            (llama_expert_gating_func_type) hparams.expert_gating_func,
+            il,
+            nullptr, layer.ffn_gate_up_exps,
+            layer.ffn_up_exps_s,
+            layer.ffn_gate_exps_s,
+            layer.ffn_down_exps_s);
+    cb(cur, "mtp_ffn_moe_out", il);
+
+    if (layer.ffn_up_shexp) {
+        ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                layer.ffn_up_shexp,   nullptr, layer.ffn_up_shexp_s,
+                layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s,
+                layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s,
+                nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "mtp_ffn_shexp", il);
+
+        cur = ggml_add(ctx0, cur, ffn_shexp);
+        cur = ggml_scale(ctx0, cur, 0.5f);
+        cb(cur, "mtp_ffn_out", il);
+    }
+
+    cur = ggml_add(ctx0, cur, inpL);
+    cur = ggml_add(ctx0, cur, attn_out);
+    cb(cur, "mtp_post_ffn", il);
+
+    ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
+            ? layer.nextn.shared_head_norm
+            : model.output_norm;
+    GGML_ASSERT(head_norm_w && "COHERE2MOE MTP: missing both nextn.shared_head_norm and output_norm");
+    cur = build_norm(cur, head_norm_w, nullptr, cohere2moe_norm_type, -1);
+
+    cb(cur, "h_nextn", -1);
+    res->t_h_nextn = cur;
+
+    cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    cb(cur, "mtp_shared_head_norm", -1);
+
+    ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    GGML_ASSERT(head_w && "COHERE2MOE MTP: missing LM head (nextn.shared_head_head or model.output)");
+    cur = build_lora_mm(head_w, cur, layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : nullptr);
+
+    if (hparams.f_logit_scale) {
+        cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
+    }
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
@@ -937,6 +937,23 @@ struct llama_model_cohere2 : public llama_model_base {
 };


+struct llama_model_cohere2moe : public llama_model_base {
+    llama_model_cohere2moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    struct graph_mtp : public llm_graph_context {
+        graph_mtp(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
 struct llama_model_dbrx : public llama_model_base {
    llama_model_dbrx(const struct llama_model_params & params) : llama_model_base(params) {}
    void load_arch_hparams(llama_model_loader & ml) override;
@@ -435,6 +435,24 @@ static void test_expressions(testing & t) {
        "('c', 'b', 'a')"
    );

+    test_template(t, "string slice negative step",
+        "{{ 'abcdef'[::-2] }}",
+        json::object(),
+        "fdb"
+    );
+
+    test_template(t, "string slice negative start and step",
+        "{{ 'abcdef'[-1:1:-1] }}",
+        json::object(),
+        "fedc"
+    );
+
+    test_template(t, "string slice negative start, stop and step",
+        "{{ 'abcdef'[-1:-5:-1] }}",
+        json::object(),
+        "fedc"
+    );
+
    test_template(t, "arithmetic",
        "{{ (a + b) * c }}",
        {{"a", 2}, {"b", 3}, {"c", 4}},
@@ -1320,6 +1338,12 @@ static void test_string_methods(testing & t) {
        "hello jinja"
    );

+    test_template(t, "string.replace() empty",
+        "{{ s.replace('', '.') }}",
+        {{"s", "hello world"}},
+        ".h.e.l.l.o. .w.o.r.l.d."
+    );
+
    test_template(t, "string.replace() with count",
        "{{ s.replace('a', 'X', 2) }}",
        {{"s", "banana"}},
@@ -185,7 +185,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
        ms.add_kv(LLM_KV_ROPE_FREQ_BASE_SWA,              10000.0f);
        // SWA pattern: every 5th layer is full attention (matches E2B layer_types)
        ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(5));
-    } else if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
+    } else if (arch == LLM_ARCH_COHERE2MOE || arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
        std::vector<uint32_t> pattern;
        pattern.reserve(n_layer);
        for (uint32_t il = 0; il < n_layer; il++) {
@@ -322,6 +322,7 @@ static std::vector<float> get_logits(
 static bool moe_mandatory(const llm_arch arch) {
    switch (arch) {
        case LLM_ARCH_LLAMA4:
+        case LLM_ARCH_COHERE2MOE:
        case LLM_ARCH_GROK:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3MOE:
@@ -26,52 +26,6 @@ server_http_context::server_http_context()

 server_http_context::~server_http_context() = default;

-// transform path --> asset name ; rules:
-// delete "_app/" prefix
-// delete hash, for ex: bundle.HCjcCZFH.css --> bundle.css
-//                      workbox-12bd46aa.js --> workbox.js
-static std::string asset_name_from_path(const std::string & path) {
-    // Strip leading slash
-    std::string s = (!path.empty() && path[0] == '/') ? path.substr(1) : path;
-    // Strip _app/ prefix
-    if (s.size() > 5 && s.compare(0, 5, "_app/") == 0) {
-        s = s.substr(5);
-    }
-    // Strip hash segment from filename:
-    //   bundle.HCjcCZFH.css -> bundle.css  (name.HASH.ext)
-    //   workbox-12bd46aa.js -> workbox.js  (name-HEXHASH.ext)
-    size_t slash = s.rfind('/');
-    std::string dir  = (slash != std::string::npos) ? s.substr(0, slash + 1) : "";
-    std::string file = (slash != std::string::npos) ? s.substr(slash + 1)    : s;
-
-    auto is_alnum_hash = [](const std::string & h) {
-        if (h.size() < 6 || h.size() > 16) return false;
-        for (char c : h) { if (!isalnum((unsigned char)c)) return false; }
-        return true;
-    };
-    auto is_hex_hash = [](const std::string & h) {
-        if (h.size() < 6 || h.size() > 16) return false;
-        for (char c : h) { if (!isxdigit((unsigned char)c)) return false; }
-        return true;
-    };
-
-    size_t dot1 = file.find('.');
-    if (dot1 != std::string::npos) {
-        size_t dot2 = file.find('.', dot1 + 1);
-        if (dot2 != std::string::npos && is_alnum_hash(file.substr(dot1 + 1, dot2 - dot1 - 1))) {
-            file = file.substr(0, dot1) + file.substr(dot2);
-        } else {
-            size_t dot = file.rfind('.');
-            size_t dash = file.rfind('-', dot);
-            if (dash != std::string::npos && is_hex_hash(file.substr(dash + 1, dot - dash - 1))) {
-                file = file.substr(0, dash) + file.substr(dot);
-            }
-        }
-    }
-
-    return dir + file;
-}
-
 static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
    // skip logging requests that are regularly sent, to avoid log spam
    if (req.path == "/health"
@@ -240,9 +194,8 @@ bool server_http_context::init(const common_params & params) {
            return true;
        }

-        // If path is public or a UI asset (including hashed paths like /_app/bundle.XXX.js),
-        // skip validation
-        if (get_public_endpoints.count("/" + asset_name_from_path(req.path))) {
+        // If path is public or a UI asset, skip validation
+        if (get_public_endpoints.count(req.path)) {
            return true;
        }

@@ -366,8 +319,26 @@ bool server_http_context::init(const common_params & params) {
            }
        } else {
 #if defined(LLAMA_UI_HAS_ASSETS)
+            static auto handle_gzip_header = [](const httplib::Request & req, httplib::Response & res) {
+                if (!llama_ui_use_gzip()) {
+                    // no gzip build, skip
+                    return true;
+                }
+                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
+                    res.status = 415; // unsupported media type
+                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
+                    return false;
+                } else {
+                    res.set_header("Content-Encoding", "gzip");
+                }
+                return true;
+            };
+
            auto serve_asset_cached = [](const std::string & name, bool isolation) {
                return [name, isolation](const httplib::Request & req, httplib::Response & res) {
+                    if (!handle_gzip_header(req, res)) {
+                        return true; // returns error message
+                    }
                    const llama_ui_asset * a = llama_ui_find_asset(name);
                    if (!a) { res.status = 404; return false; }
                    res.set_header("ETag", a->etag);
@@ -387,7 +358,10 @@ bool server_http_context::init(const common_params & params) {
            };

            auto serve_asset_nocache = [](const std::string & name) {
-                return [name](const httplib::Request & /*req*/, httplib::Response & res) {
+                return [name](const httplib::Request & req, httplib::Response & res) {
+                    if (!handle_gzip_header(req, res)) {
+                        return true; // returns error message
+                    }
                    const llama_ui_asset * a = llama_ui_find_asset(name);
                    if (!a) {
                        res.status = 404;
@@ -399,17 +373,9 @@ bool server_http_context::init(const common_params & params) {
                };
            };

-            // Hashed routes: browser requests contain the build hash, assets are stored without.
-            auto serve_hashed = [serve_asset_cached](const std::string & name) {
-                return serve_asset_cached(name, false);
-            };
-            srv->Get(params.api_prefix + R"(/_app/immutable/bundle\.[^/]+\.js)",         serve_hashed("bundle.js"));
-            srv->Get(params.api_prefix + R"(/_app/immutable/assets/bundle\.[^/]+\.css)", serve_hashed("bundle.css"));
-            srv->Get(params.api_prefix + R"(/workbox-[^/]+\.js)",                        serve_hashed("workbox.js"));
-
-            // SPA entry — also aliased at "/_app/version.json" (referenced by the service worker)
-            srv->Get(params.api_prefix + "/",                  serve_asset_cached ("index.html",   true));
-            srv->Get(params.api_prefix + "/_app/version.json", serve_asset_nocache("version.json"));
+            // main index file
+            srv->Get(params.api_prefix + "/",           serve_asset_cached("index.html", true));
+            srv->Get(params.api_prefix + "/index.html", serve_asset_cached("index.html", true));

            // All remaining assets registered directly from the embedded asset table.
            // PWA revalidation files (sw.js, manifest, version.json) use no-cache;
@@ -417,15 +383,14 @@ bool server_http_context::init(const common_params & params) {
            static const std::unordered_set<std::string> no_cache_names = {
                "sw.js",
                "manifest.webmanifest",
-                "version.json",
+                "_app/version.json",
                "build.json"
            };
-            // index.html also accessible at /index.html (with the same isolation headers as /)
-            srv->Get(params.api_prefix + "/index.html", serve_asset_cached("index.html", true));

            for (const auto & a : llama_ui_get_assets()) {
                if (a.name == "index.html") continue;  // served at "/" and "/index.html" above
                if (no_cache_names.count(a.name)) {
+                    SRV_DBG("serve nocache for %s\n", a.name.c_str());
                    srv->Get(params.api_prefix + "/" + a.name, serve_asset_nocache(a.name));
                } else {
                    srv->Get(params.api_prefix + "/" + a.name, serve_asset_cached(a.name, false));
@@ -1,6 +1,7 @@
 set(TARGET llama-ui)

 set(LLAMA_UI_HF_BUCKET "ggml-org/llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets")
+set(LLAMA_UI_GZIP      ON                  CACHE BOOL   "Apply gzip compress to assets to save bandwidth")

 # Backward compat: forward old var to new one
 if(DEFINED LLAMA_BUILD_WEBUI)
@@ -83,6 +84,7 @@ add_custom_target(llama-ui-assets ALL
        "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}"
        "-DBUILD_UI=${LLAMA_BUILD_UI}"
        "-DLLAMA_UI_EMBED=${LLAMA_UI_EMBED_EXE}"
+        "-DLLAMA_UI_GZIP=${LLAMA_UI_GZIP}"
        -P "${PROJECT_SOURCE_DIR}/scripts/ui-assets.cmake"
    COMMENT "Provisioning UI assets"
    VERBATIM
@@ -3,7 +3,8 @@
 // Usage:
 //   llama-ui-embed <out_cpp> <out_h> [<asset_dir>]
 //
-// Embeds every regular file directly under <asset_dir> (non-recursive).
+// Recursively embeds every regular file under <asset_dir>.
+// Asset names are relative paths from <asset_dir> (e.g. "_app/immutable/bundle.HASH.js").
 // Without <asset_dir>, emits an empty asset table.

 #include <inttypes.h>
@@ -15,6 +16,7 @@
 #include <algorithm>
 #include <filesystem>
 #include <fstream>
+#include <functional>
 #include <string>
 #include <vector>

@@ -103,7 +105,24 @@ static bool write_if_different(const std::string & path, const std::string & con
    if (!content.empty()) {
        out.write(content.data(), static_cast<std::streamsize>(content.size()));
    }
-    return out.good();
+    bool ok = out.good();
+    if (ok) {
+        printf("embed: write output file %s\n", path.c_str());
+    }
+    return ok;
+}
+
+static std::string path_basename(const std::string & name) {
+    const size_t p = name.rfind('/');
+    return p == std::string::npos ? name : name.substr(p + 1);
+}
+static bool str_starts_with(const std::string & s, const char * prefix) {
+    const size_t n = strlen(prefix);
+    return s.size() >= n && s.compare(0, n, prefix) == 0;
+}
+static bool str_ends_with(const std::string & s, const char * suffix) {
+    const size_t n = strlen(suffix);
+    return s.size() >= n && s.compare(s.size() - n, n, suffix) == 0;
 }

 static std::string fmt(const char * pattern, ...) {
@@ -126,15 +145,19 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    const std::string out_cpp = argv[1];
-    const std::string out_h   = argv[2];
+    const std::string out_cpp   = argv[1];
+    const std::string out_h     = argv[2];
+    const std::string asset_dir = (argc >= 4) ? argv[3] : std::string();
+
+    const bool        use_gzip = !asset_dir.empty() && std::filesystem::exists(asset_dir + "/_gzip");
+    const std::string in_dir   = use_gzip ? (asset_dir + "/_gzip") : asset_dir;

    std::vector<asset_entry> assets;
-    if (argc == 4) {
-        const std::filesystem::path dir = argv[3];
+    if (!in_dir.empty()) {
+        const std::filesystem::path dir = in_dir;

        std::error_code ec;
-        std::filesystem::directory_iterator it(dir, ec);
+        std::filesystem::recursive_directory_iterator it(dir, ec);
        if (ec) {
            fprintf(stderr, "embed: cannot iterate %s: %s\n", argv[3], ec.message().c_str());
            return 1;
@@ -143,7 +166,9 @@ int main(int argc, char ** argv) {
            if (!entry.is_regular_file()) {
                continue;
            }
-            assets.push_back({ entry.path().filename().generic_string(), entry.path() });
+            // name is the relative path from dir, with forward slashes
+            const std::string name = entry.path().lexically_relative(dir).generic_string();
+            assets.push_back({ name, entry.path() });
        }

        // directory iteration order is unspecified; sort for reproducible output
@@ -154,18 +179,51 @@ int main(int argc, char ** argv) {
    const int n_assets = static_cast<int>(assets.size());

    if (n_assets > 0) {
-        bool has_index = false, has_bundle_js = false, has_bundle_css = false, has_version = false;
+        using match_fn = std::function<bool(const std::string &)>;
+        auto exact = [](const char * name) -> match_fn {
+            return [name](const std::string & base) { return base == name; };
+        };
+
+        struct required_check { const char * label; match_fn match; bool found; };
+        required_check checks[] = {
+            { "index.html",           exact("index.html"),           false },
+            { "loading.html",         exact("loading.html"),         false },
+            { "manifest.webmanifest", exact("manifest.webmanifest"), false },
+            { "sw.js",                exact("sw.js"),                false },
+            { "build.json",           exact("build.json"),           false },
+            { "version.json",         exact("version.json"),         false },
+            { "bundle[hash].js",      [](const std::string & b) {
+                return str_starts_with(b, "bundle")  && str_ends_with(b, ".js");
+            }, false },
+            { "bundle[hash].css",     [](const std::string & b) {
+                return str_starts_with(b, "bundle")  && str_ends_with(b, ".css");
+            }, false },
+            { "workbox[hash].js",     [](const std::string & b) {
+                return str_starts_with(b, "workbox") && str_ends_with(b, ".js");
+            }, false },
+        };
+
        for (const auto & a : assets) {
-            if (a.name == "index.html")   has_index      = true;
-            if (a.name == "bundle.js")    has_bundle_js  = true;
-            if (a.name == "bundle.css")   has_bundle_css = true;
-            if (a.name == "version.json") has_version    = true;
-        }
-        if (!has_index || !has_bundle_js || !has_bundle_css || !has_version) {
-            fprintf(stderr, "embed: missing required assets (need index.html, bundle.js, bundle.css, version.json); got:\n");
-            for (const auto & a : assets) {
-                fprintf(stderr, "  %s\n", a.name.c_str());
+            const std::string base = path_basename(a.name);
+            for (auto & c : checks) {
+                if (!c.found) { c.found = c.match(base); }
            }
+        }
+
+        std::vector<const char *> missing;
+        for (const auto & c : checks) {
+            if (!c.found) { missing.push_back(c.label); }
+        }
+        if (!missing.empty()) {
+            fprintf(stderr, "\ncurrent asset files:\n");
+            for (const auto & a : assets) {
+                fprintf(stderr, "    %s\n", a.name.c_str());
+            }
+            fprintf(stderr, "missing required asset(s):\n");
+            for (const char * m : missing) {
+                fprintf(stderr, "    %s\n", m);
+            }
+            fprintf(stderr, "hint: try cleaning your build directory: %s\n", in_dir.c_str());
            return 1;
        }
    }
@@ -183,7 +241,8 @@ int main(int argc, char ** argv) {
        "    std::string           etag;\n"
        "    std::string           type;\n"
        "};\n\n"
-        "const llama_ui_asset * llama_ui_find_asset(const std::string & name);\n";
+        "const llama_ui_asset * llama_ui_find_asset(const std::string & name);\n"
+        "bool llama_ui_use_gzip();\n";
    h += fmt("const std::array<llama_ui_asset, %d> & llama_ui_get_assets();\n", n_assets);

    std::string cpp;
@@ -195,6 +254,10 @@ int main(int argc, char ** argv) {
            if (!read_file(assets[i].path, bytes)) {
                return 1;
            }
+            if (bytes.empty()) {
+                fprintf(stderr, "embed: empty file: %s\n", assets[i].path.generic_string().c_str());
+                return 1;
+            }
            cpp += fmt("static const unsigned char asset_%d_data[] = {", i);
            append_bytes_hex(cpp, bytes);
            const auto hash = fnv_hash(bytes.data(), bytes.size());
@@ -235,6 +298,7 @@ int main(int argc, char ** argv) {
            "    return empty;\n"
            "}\n";
    }
+    cpp += fmt("bool llama_ui_use_gzip() { return %s; }\n", use_gzip ? "true" : "false");

    bool ok = true;
    ok = write_if_different(out_h,   h)   && ok;
@@ -4,7 +4,7 @@
 	"version": "1.0.0",
 	"type": "module",
 	"scripts": {
-		"build": "npm run build-pwa-assets && vite build && node scripts/post-build.js",
+		"build": "npm run build-pwa-assets && vite build",
 		"build-pwa-assets": "npx @vite-pwa/assets-generator --root . --config pwa-assets.config.ts && npx @vite-pwa/assets-generator --root . --config pwa-assets-dark.config.ts && node scripts/make-icons-circular.js",
 		"dev": "bash scripts/dev.sh",
 		"preview": "vite preview",
@@ -1,40 +0,0 @@
-#!/usr/bin/env node
-// Post-build: copy hashed/nested assets to predictable flat names.
-// No file content is modified — the C++ server handles routing hashed URLs
-// to the correct stored asset at runtime.
-//
-// Copies:
-//   _app/immutable/bundle.HASH.js         -> bundle.js
-//   _app/immutable/assets/bundle.HASH.css -> bundle.css
-//   workbox-HEXHASH.js                    -> workbox.js
-//   _app/version.json                     -> version.json
-
-import fs from 'fs';
-import path from 'path';
-
-const outDir = process.env.LLAMA_UI_OUT_DIR ?? './dist';
-
-function findOne(dir, pattern) {
-	const files = fs.readdirSync(dir).filter((f) => pattern.test(f));
-	if (files.length === 0) throw new Error(`post-build: no file matching ${pattern} in ${dir}`);
-	return path.join(dir, files[0]);
-}
-
-function copyFlat(src, destName) {
-	const dest = path.join(outDir, destName);
-	fs.copyFileSync(src, dest);
-	console.log(`post-build: ${path.relative(outDir, src)} -> ${destName}`);
-}
-
-const bundleJs = findOne(path.join(outDir, '_app/immutable'), /^bundle\.[^.]+\.js$/);
-const bundleCss = findOne(path.join(outDir, '_app/immutable/assets'), /^bundle\.[^.]+\.css$/);
-const workbox = findOne(outDir, /^workbox-[0-9a-f]+\.js$/);
-
-copyFlat(bundleJs, 'bundle.js');
-copyFlat(bundleCss, 'bundle.css');
-copyFlat(workbox, 'workbox.js');
-
-const versionSrc = path.join(outDir, '_app/version.json');
-if (fs.existsSync(versionSrc)) {
-	copyFlat(versionSrc, 'version.json');
-}
Author	SHA1	Message	Date
Aldehir Rojas	53bd47ea5b	ui : fix llama-ui-embed crash when no asset dir is given (#24597 )	2026-06-13 17:53:30 -05:00
Michael Wand	4988f6e866	Add arch support for cohere2-MoE (#24260 ) * Add arch support for cohere2-MoE * Removed redundant gating_func checks * Changed ffn lookup to prefer prefix_dense_intermediate_size * Renamed arch to cohere2moe * Removed redundant lmhead check and chat template changes * Removed lm_head.weight check from modify tensors, load output tensor not required, fallback to token_embd.weight * Changed to (routed+shared)0.5 for shared expert combined avg fixed sliding_window_pattern issue and pattern * Fixed transformers crash 'first_k_dense_replace' error * Remove comment * Removed cohere2-moe as a tokenizer type and kept as tiny_aya. Renamed North-Mini-Code-1.0. * Fixed MTP fail, changed to use iSWA * Fixed remaining todos: cohere2moe renamed, changed swa parsing to use get_key_or_arr, removed extra get_arr use * Force metadata usage Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Remove Cohere2 checkpoint comment Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Remove MTP comment Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Regenerate cohere2moe tokenizer hash * Add cohere2moe to Llama Model Saver supported list * Check for zerobios tensors and add support for Command to use LayerNorm * Map expert_selection_fn to sigmoid in base.py instead of command.py * use bools for foundnorm/foundnormrms Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-06-13 19:49:00 +02:00
Sigbjørn Skjæret	f05cf4676a	jinja : fix negative step slice with start/stop values (#24580 )	2026-06-13 18:28:40 +02:00
Xuan-Son Nguyen	e8067a8b36	ui: build-time gzip compression (#24571 ) * ui: keep original file name and path * fix nocache * ui: build-time gzip compression	2026-06-13 16:57:27 +02:00
Sigbjørn Skjæret	341babcf73	jinja : fix split and replace with empty first arg (#24574 ) * fix split and replace with empty first arg * fix reserve size	2026-06-13 16:56:59 +02:00
Jeff Bolz	1a7718b4c5	vulkan: support non-contig unary/glu ops (#24215 ) * vulkan: support non-contig unary/glu ops Change unary/glu ops to pass in all strides and use fastdiv for the index calculation. Put all unary ops in one file, similar to glu, to share the code. codex went ahead and added expm1 without me asking, but I had to make it do a real precision analysis rather than just making stuff up. unary.comp initially couldn't use generic_unary_head because there wasn't space for xielu's additional constants. Fixing this required packing the fastdiv 'L' values. * attempt to workaround compiler bug * resolve conflict from #23991 * use expm1	2026-06-13 08:44:15 -05:00
Xuan-Son Nguyen	597b6672e8	ui: keep original file name and path (#24568 ) * ui: keep original file name and path * fix nocache	2026-06-13 14:31:41 +02:00