memory : fix broken batch splits for recurrent cache

Batch splits that produce more than one ubatch per batch were broken for recurrent models by #14512. This fixes them by moving the completeness check to after the ubatch split loop.
2025-07-07 21:23:14 -04:00
90 changed files with 2064 additions and 32159 deletions
@@ -342,7 +342,7 @@ jobs:
          cd build
          export GGML_VK_VISIBLE_DEVICES=0
          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 4200
+          ctest -L main --verbose --timeout 3600

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
@@ -1,40 +0,0 @@
-name: Update Operations Documentation
-
-on:
-    push:
-        paths:
-            - 'docs/ops/**'
-            - 'scripts/create_ops_docs.py'
-    pull_request:
-        paths:
-            - 'docs/ops/**'
-            - 'scripts/create_ops_docs.py'
-
-jobs:
-    update-ops-docs:
-        runs-on: ubuntu-latest
-
-        steps:
-        - name: Checkout repository
-          uses: actions/checkout@v4
-
-        - name: Set up Python
-          uses: actions/setup-python@v5
-          with:
-              python-version: '3.x'
-
-        - name: Generate operations documentation to temporary file
-          run: |
-              mkdir -p /tmp/ops_check
-              ./scripts/create_ops_docs.py /tmp/ops_check/ops.md
-
-        - name: Check if docs/ops.md matches generated version
-          run: |
-              if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
-                  echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
-                  echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
-                  echo "Differences found:"
-                  diff docs/ops.md /tmp/ops_check/ops.md || true
-                  exit 1
-              fi
-              echo "Operations documentation is up to date."
@@ -55,17 +55,6 @@
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
        }
    },
-    {
-        "name": "x64-linux-gcc", "hidden": true,
-        "cacheVariables": {
-            "CMAKE_C_COMPILER": "gcc",
-            "CMAKE_CXX_COMPILER": "g++"
-        }
-    },
-    { "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
-    { "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
-    { "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
-    { "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },

    { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
@@ -6,9 +6,9 @@
 [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
 [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

-[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)

-LLM inference in C/C++
+Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

 ## Recent API changes

@@ -17,9 +17,10 @@ LLM inference in C/C++

 ## Hot topics

- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
+- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
+- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
@@ -133,7 +134,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)

 #### Multimodal

@@ -86,7 +86,8 @@ if (LLAMA_CURL)
    endif()
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()

 if (LLAMA_LLGUIDANCE)
@@ -111,13 +112,13 @@ if (LLAMA_LLGUIDANCE)

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v1.0.1:
-        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
        CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release --package llguidance
+        BUILD_COMMAND cargo build --release
        INSTALL_COMMAND ""
        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
        UPDATE_COMMAND ""
@@ -2734,13 +2734,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.public_path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
-    add_opt(common_arg(
-        {"--api-prefix"}, "PREFIX",
-        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
-        [](common_params & params, const std::string & value) {
-            params.api_prefix = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
    add_opt(common_arg(
        {"--no-webui"},
        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -370,7 +370,6 @@ struct common_params {

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
-    std::string api_prefix    = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
@@ -300,7 +300,6 @@ class ModelBase:
                            gguf.MODEL_TENSOR.POS_EMBD,
                            gguf.MODEL_TENSOR.TOKEN_TYPES,
                            gguf.MODEL_TENSOR.SSM_CONV1D,
-                            gguf.MODEL_TENSOR.SHORTCONV_CONV,
                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                            gguf.MODEL_TENSOR.TIME_MIX_W1,
                            gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -816,30 +815,6 @@ class TextModel(ModelBase):
        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
            res = "minerva-7b"
-        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
-            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
-            res = "hunyuan"
-        if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
-            # ref: https://huggingface.co/skt/A.X-4.0
-            res = "a.x-4.0"
-        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
-            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
-            res = "falcon-h1"
-        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
-            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
-            res = "falcon-h1"
-        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
-            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
-            res = "falcon-h1"
-        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
-            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
-            res = "falcon-h1"
-        if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
-            # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
-            res = "midm-2.0"
-        if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
-            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
-            res = "lfm2"

        if res is None:
            logger.warning("\n")
@@ -1082,14 +1057,7 @@ class TextModel(ModelBase):
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        if special_vocab.chat_template is None:
-            template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
-            if template_path.is_file():
-                with open(template_path, "r", encoding="utf-8") as f:
-                    template = f.read()
-            else:
-                template = "rwkv-world"
-            special_vocab.chat_template = template
+        special_vocab.chat_template = "rwkv-world"
        # hack: Add '\n\n' as the EOT token to make it chat normally
        special_vocab._set_special_token("eot", 261)
        # hack: Override these as they have already been set (incorrectly)
@@ -4904,9 +4872,6 @@ class Mamba2Model(TextModel):
            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                hparams = json.load(f)
        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
-        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
-        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
-        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1

    def set_vocab(self):
        vocab_size = self.hparams["vocab_size"]
@@ -4929,29 +4894,30 @@ class Mamba2Model(TextModel):
            self._set_vocab_builtin("gpt-neox", vocab_size)

    def set_gguf_parameters(self):
-        d_conv  = self.find_hparam(["conv_kernel", "d_conv"],     optional=True) or 4
-        d_state = self.find_hparam(["state_size",  "d_state"],    optional=True) or 128
-        head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
+        d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+        d_conv  = self.find_hparam(["conv_kernel",       "d_conv"],  optional=True) or 4
+        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+        d_state = self.find_hparam(["state_size",        "d_state"], optional=True) or 128
+        head_dim = self.find_hparam(["head_dim"],                    optional=True) or 64
+        n_group = self.find_hparam(["n_groups"],                     optional=True) or 1

        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

        # Fail early for models which don't have a block expansion factor of 2
        # TODO: does this really matter?
-        # skip the assertion for FalconH1 Model
-        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
-            assert self.d_inner == 2 * self.d_model
-            assert self.d_inner % head_dim == 0
+        assert d_inner == 2 * d_model
+        assert d_inner % head_dim == 0

        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
-        self.gguf_writer.add_embedding_length(self.d_model)
+        self.gguf_writer.add_embedding_length(d_model)
        self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
        self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_ssm_conv_kernel(d_conv)
-        self.gguf_writer.add_ssm_inner_size(self.d_inner)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
        self.gguf_writer.add_ssm_state_size(d_state)
-        self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
-        self.gguf_writer.add_ssm_group_count(self.n_group)
+        self.gguf_writer.add_ssm_time_step_rank(d_inner // head_dim)
+        self.gguf_writer.add_ssm_group_count(n_group)
        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
        self.gguf_writer.add_file_type(self.ftype)

@@ -4976,7 +4942,10 @@ class Mamba2Model(TextModel):
            # (D is also unsqueezed, but for more straightforward broadcast internally)
            data_torch = data_torch.reshape((*data_torch.shape, 1))
        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
-            data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
+            d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+            d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+            n_group = self.hparams.get("n_groups", 1)
+            data_torch = data_torch.reshape((n_group, d_inner // n_group))

        if name.endswith(".A_log"):
            logger.debug("A_log --> A ==> " + new_name)
@@ -4985,123 +4954,6 @@ class Mamba2Model(TextModel):
        yield (new_name, data_torch)


-@ModelBase.register("JambaForCausalLM")
-class JambaModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.JAMBA
-
-    def get_vocab_base_pre(self, tokenizer) -> str:
-        del tokenizer  # unused
-
-        return "gpt-2"
-
-    def set_vocab(self):
-        if (self.dir_model / "tokenizer.model").is_file():
-            # Using Jamba's tokenizer.json causes errors on model load
-            # (something about "byte not found in vocab"),
-            # but there's a working tokenizer.model
-            self._set_vocab_sentencepiece()
-        else:
-            # Some Jamba models only have a tokenizer.json, which works.
-            self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
-        d_conv  = self.find_hparam(["mamba_d_conv"],  optional=True) or 4
-        d_inner = self.hparams["mamba_expand"] * d_model
-        d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
-        # ceiling division
-        # ref: https://stackoverflow.com/a/17511341/22827863
-        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
-        dt_rank      = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
-        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
-        n_kv_head = self.hparams["num_key_value_heads"]
-        attn_offset = self.hparams["attn_layer_offset"]
-        attn_period = self.hparams["attn_layer_period"]
-        n_kv_vec = [0 for _ in range(attn_offset)] + [
-            n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
-        ]
-
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
-        self.gguf_writer.add_embedding_length(d_model)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(n_kv_vec)
-        self.gguf_writer.add_ssm_conv_kernel(d_conv)
-        self.gguf_writer.add_ssm_inner_size(d_inner)
-        self.gguf_writer.add_ssm_state_size(d_state)
-        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
-        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
-        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
-        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-
-        # Mini-Jamba
-        name = name.replace(".moe.", ".feed_forward.")
-        if bid is not None:
-            moe_offset = self.hparams["expert_layer_offset"]
-            moe_period = self.hparams["expert_layer_period"]
-
-            if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
-                name = name.replace(".experts.0.", ".")
-
-        # process the experts separately
-        if ".feed_forward.experts." in name:
-            n_experts = self.hparams["num_experts"]
-
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-
-                # merge the experts into a single 3d tensor
-                for wid in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    # using the same merged name as qwen2moe
-                    merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    yield new_name, data_torch
-            return
-
-        new_name = self.map_tensor_name(name)
-
-        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
-            data_torch = data_torch.squeeze()
-
-        if name.endswith(".A_log"):
-            logger.debug("A_log --> A ==> " + new_name)
-            data_torch = -torch.exp(data_torch)
-
-        yield (new_name, data_torch)
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
@ModelBase.register("CohereForCausalLM")
 class CommandR2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.COMMAND_R
@@ -6463,148 +6315,18 @@ class GraniteMoeModel(GraniteModel):
                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
            ]

-        has_experts = bool(self.hparams.get('num_local_experts'))
-
        if name.endswith("shared_mlp.input_linear.weight"):
            ffn_dim = self.hparams["shared_intermediate_size"]
            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
            gate, up = data_torch.split(ffn_dim, dim=-2)
-            if has_experts:
-                return [
-                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
-                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
-                ]
            return [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
-            ]
-
-        if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
-            return [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
            ]

        return super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
-class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
-    """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
-    layers and optionally uses MoE w/ a shared expert"""
-    model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
-    undo_permute = True
-
-    def __init__(self, *args, **kwargs):
-
-        # Hybrid mamba models use a prefix for the mamba-specific params.
-        # TODO: Extend this if the prefix(es) need to be configurable
-        self.hparam_prefixes = ["mamba"]
-
-        super().__init__(*args, **kwargs)
-
-        # Lists of which layers use ssm vs attention
-        self._attn_layers = self.get_attn_layers()
-        self._ssm_layers = [
-            i for i in range(self.block_count)
-            if i not in self._attn_layers
-        ]
-
-        # n_group and d_inner are used during reshape_tensors for mamba2
-        self.d_model = self.find_hparam(["hidden_size", "d_model"])
-        self.n_group = self.find_hparam(["n_groups"])
-        self.d_inner = self.find_hparam(["expand"]) * self.d_model
-
-    def get_attn_layers(self):
-        # Explicit list of layer type names
-        if layer_types := self.hparams.get("layer_types"):
-            return [
-                i for i, typ in enumerate(layer_types)
-                if typ == "attention"
-            ]
-
-        # Layer types indicated by index or period
-        attn_layers = self.hparams.get("attn_layer_indices", [])
-        if not attn_layers:
-            attn_period = self.hparams.get("attn_layer_period")
-            assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
-            attn_offset = self.hparams.get("attn_layer_offset")
-            assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
-            attn_layers = [
-                i for i in range(self.block_count)
-                if i % attn_period == attn_offset
-            ]
-        return attn_layers
-
-    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
-        prefixed = []
-        for pfx in self.hparam_prefixes:
-            prefixed.extend(
-                "_".join([pfx, k])
-                for k in keys
-            )
-        keys = list(keys) + prefixed
-        return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
-
-    def modify_tensors(
-        self, data_torch: Tensor, name: str, bid: int | None
-    ) -> Iterable[tuple[str, Tensor]]:
-        if (
-            name.endswith("block_sparse_moe.input_linear.weight")
-            or "shared_mlp" in name
-        ):
-            return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
-
-        # Determine whether this is a mamba layer or an attention layer
-        if bid in self._ssm_layers:
-            return Mamba2Model.modify_tensors(self, data_torch, name, bid)
-        elif bid in self._attn_layers:
-            return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def set_gguf_parameters(self):
-        """This method merges params from both parents and some that are
-        specific to this model. The result is some duplication of how the params
-        get set. The following warnings are expected during conversion:
-
-        WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
-        WARNING:Duplicated key name 'granitehybrid.context_length'
-        """
-        GraniteMoeModel.set_gguf_parameters(self)
-
-        ## Mamba mixer params ##
-        self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
-        self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
-        self.gguf_writer.add_ssm_group_count(self.n_group)
-        self.gguf_writer.add_ssm_inner_size(self.d_inner)
-        # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
-        #   in llama.cpp
-        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
-
-        ## Attention params ##
-        head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
-        head_count_kv_vec = [
-            head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
-        ]
-        if rope_dim := self.hparams.get("attn_rotary_emb"):
-            self.gguf_writer.add_rope_dimension_count(rope_dim)
-        self.gguf_writer.add_head_count_kv(head_count_kv_vec)
-
-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
-        self.gguf_writer.add_rope_scaling_finetuned(use_rope)
-        if not use_rope:
-            self.gguf_writer.add_context_length(2**20)
-
-        ## Validation ##
-        d_head = self.find_hparam(["d_head"], optional=True) or 64
-        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
-        assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
-
-    def set_vocab(self):
-        self.hparams["pad_vocab_size_multiple"] = 8
-        Mamba2Model.set_vocab(self)
-
-
@ModelBase.register("BailingMoeForCausalLM")
 class BailingMoeModel(TextModel):
    model_arch = gguf.MODEL_ARCH.BAILINGMOE
@@ -6813,321 +6535,6 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
        super().set_gguf_parameters()
        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])

-
-@ModelBase.register("FalconH1ForCausalLM")
-class FalconH1Model(Mamba2Model):
-    model_arch = gguf.MODEL_ARCH.FALCON_H1
-
-    def __init__(self, *args, **kwargs):
-        # Set the hparam prefixes for Falcon Mamba2
-        self.hparam_prefixes = ["mamba"]
-
-        # Initialize the base Mamba2Model
-        super().__init__(*args, **kwargs)
-
-        # Use Llama conversion for attention
-        self._transformer_model_class = LlamaModel
-
-        # n_group and d_inner are used during reshape_tensors for mamba2
-        self.n_group = self.find_hparam(["n_groups"])
-        self.d_inner = self.find_hparam(["mamba_d_ssm"])
-        self.d_head = self.find_hparam(["d_head"])
-
-        # Initialize any Falcon Mamba2 specific attributes
-        self.has_attention = True  # Falcon Mamba2 has attention components
-
-        # Load Falcon-H1 multipliers from hyperparameters
-        self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
-        self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
-        self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
-        self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
-        self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
-        self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
-        self.intermediate_size = self.find_hparam(["intermediate_size"])
-        self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
-
-    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
-        prefixed = []
-        for pfx in self.hparam_prefixes:
-            prefixed.extend(
-                "_".join([pfx, k])
-                for k in keys
-            )
-        keys = list(keys) + prefixed
-        return super().find_hparam(keys, *args, **kwargs)
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        tensors = list(super().modify_tensors(data_torch, name, bid))
-        tensor = tensors[0][1]
-
-        if "down_proj" in name:
-            tensor = tensor  * self.mlp_multipliers[1]
-        elif "gate_proj" in name:
-            tensor = tensor * self.mlp_multipliers[0]
-        elif "k_proj" in name:
-            tensor = tensor * self.key_multiplier * self.attention_in_multiplier
-        elif "q_proj" in name:
-            tensor = tensor * self.attention_in_multiplier
-        elif "v_proj" in name:
-            tensor = tensor * self.attention_in_multiplier
-        elif "o_proj" in name:
-            tensor = tensor * self.attention_out_multiplier
-        elif "out_proj" in name:
-            tensor = tensor * self.ssm_out_multiplier
-        elif "in_proj" in name:
-            tensor = tensor * self.ssm_in_multiplier
-            zxbcdt_multipliers = self.hparams["ssm_multipliers"]
-            intermediate_size = self.hparams["mamba_d_ssm"]
-            groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
-            tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
-            tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
-            tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
-            tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
-            tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
-        elif "lm_head" in name:
-            tensor = tensor * self.hparams["lm_head_multiplier"]
-        elif "embed_tokens" in name:
-            tensor = tensor * self.hparams["embedding_multiplier"]
-        elif "mamba.norm" in name:
-            tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
-
-        tensors = [(tensors[0][0], tensor)]
-        return tensors
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        ## General Params ##
-        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-        # Override some Mamba2 defaults
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-
-        ## Attention params ##
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_key_length(self.hparams["head_dim"])
-        self.gguf_writer.add_value_length(self.hparams["head_dim"])
-
-        ## Validation ##
-        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
-        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
-
-        # Add any other Falcon Mamba2 specific configuration
-        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
-
-
-@ModelBase.register("HunYuanMoEV1ForCausalLM")
-class HunYuanMoEModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # For handling tied embeddings
-        self._tok_embd = None
-
-    def set_vocab(self):
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-
-        # 1. Get the pre-tokenizer identifier hash
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        # 2. Reverse-engineer the merges list from mergeable_ranks
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[QwenModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            if len(merged) == 2: # todo this is an assert in Qwen, why?
-                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
-
-        # 3. Generate the tokens and toktypes lists
-        vocab_size = self.hparams["vocab_size"]
-        assert tokenizer.vocab_size == vocab_size
-        special_tokens = tokenizer.special_tokens
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
-        tokens: list[str] = []
-        toktypes: list[int] = []
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token = reverse_vocab[i]
-                tokens.append(token)
-                if i in special_tokens.values():
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-
-        # 4. Write all vocab-related fields to the GGUF writer
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_token_merges(merges)
-
-        # 5. Add special tokens and chat templates
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        special_vocab.add_to_gguf(self.gguf_writer)
-        # FIX for BOS token: Overwrite incorrect id read from config.json
-        self.gguf_writer.add_bos_token_id(127959) # <|bos|>
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-
-        self.gguf_writer.add_expert_count(hparams["num_experts"])
-        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
-
-        moe_intermediate_size = hparams["moe_intermediate_size"]
-        assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
-        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
-
-        moe_topk = hparams["moe_topk"]
-        assert all(topk == moe_topk[0] for topk in moe_topk)
-        self.gguf_writer.add_expert_used_count(moe_topk[0])
-
-        moe_shared_expert = hparams["num_shared_expert"]
-        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
-        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
-
-        # Rope
-        rope_scaling = hparams.get("rope_scaling", {})
-        if rope_scaling.get("type") == "dynamic":
-            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
-            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-            alpha = rope_scaling.get("alpha", 1000)
-            base = hparams.get("rope_theta", 10000.0)
-            dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
-            scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
-            self.gguf_writer.add_rope_freq_base(scaled_base)
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-            self.gguf_writer.add_rope_scaling_factor(1)
-            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
-            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
-
-            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
-            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
-                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name == "model.embed_tokens.weight":
-            self._tok_embd = data_torch.clone()
-
-        if name == "lm_head.weight":
-            if self.hparams.get("tie_word_embeddings", False):
-                logger.info("Skipping tied output layer 'lm_head.weight'")
-                return []
-
-        if name.find("mlp.experts") != -1:
-            n_experts = self.hparams["num_experts"]
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                # merge the experts into a single 3d tensor
-                tensors: list[tuple[str, Tensor]] = []
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-                    new_name = self.map_tensor_name(merged_name)
-                    tensors.append((new_name, data_torch))
-
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-        if self._experts is not None:
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
-@ModelBase.register("SmolLM3ForCausalLM")
-class SmolLM3Model(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.SMOLLM3
-
-    def set_vocab(self):
-        super().set_vocab()
-        # remove unsupported array slicing in chat template
-        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        if tokenizer.chat_template is not None:
-            chat_template = tokenizer.chat_template.replace("[:]", "")
-            self.gguf_writer.add_chat_template(chat_template)
-
-
-@ModelBase.register("Lfm2ForCausalLM")
-@ModelBase.register("LFM2ForCausalLM")
-class LFM2Model(TextModel):
-    model_arch = gguf.MODEL_ARCH.LFM2
-
-    def _add_feed_forward_length(self):
-        ff_dim = self.hparams["block_ff_dim"]
-
-        auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
-        ff_dim = self.hparams["block_ff_dim"]
-        ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
-        multiple_of = self.hparams["block_multiple_of"]
-
-        if auto_adjust_ff_dim:
-            ff_dim = int(2 * ff_dim / 3)
-            # custom dim factor multiplier
-            if ffn_dim_multiplier is not None:
-                ff_dim = int(ffn_dim_multiplier * ff_dim)
-            ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
-
-        self.gguf_writer.add_feed_forward_length(ff_dim)
-
-    def set_gguf_parameters(self):
-        # set num_key_value_heads only for attention layers
-        self.hparams["num_key_value_heads"] = [
-            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
-            for layer_type in self.hparams["layer_types"]
-        ]
-
-        super().set_gguf_parameters()
-        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
-        self._add_feed_forward_length()
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # conv op requires 2d tensor
-        if 'conv.conv' in name:
-            data_torch = data_torch.squeeze(1)
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-
 ###### CONVERSION LOGIC ######


@@ -128,9 +128,6 @@ models = [
    {"name": "llama4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
    {"name": "pixtral",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
    {"name": "seed-coder",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
-    {"name": "a.x-4.0",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
-    {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
-    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -140,12 +137,6 @@ pre_computed_hashes = [
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
-    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
-    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
-    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
-    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
-    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
-    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
 ]


@@ -83,22 +83,20 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv

 ### 2. Define the model architecture in `llama.cpp`

-The model params and tensors layout must be defined in `llama.cpp` source files:
-1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
-2. In `src/llama-arch.cpp`:
-    - Add the architecture name to the `LLM_ARCH_NAMES` map.
-    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
-3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
-4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
+The model parameters and tensor layout must be defined in `llama.cpp`:
+1. Define a new `llm_arch`
+2. Define the tensor layout in `LLM_TENSOR_NAMES`
+3. Add any non-standard metadata in `llm_load_hparams`
+4. Create the tensors for inference in `llm_load_tensors`
+5. If the model has a RoPE operation, add the rope type in `llama_rope_type`

 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.

 ### 3. Build the GGML graph implementation

-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
-Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
-Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
-Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
+This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
+
+Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.

 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.

@@ -1,95 +0,0 @@
-# GGML Operations
-
-List of GGML operations and backend support status.
-
-Legend:
- ✅ Fully supported by this backend
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
-
-| Operation | BLAS | CPU | CUDA | Metal |
-|-----------|------|------|------|------|
-|                              ABS | ❌ | ✅ | 🟡 | ❌ |
-|                              ACC | ❌ | ✅ | ✅ | ✅ |
-|                              ADD | ❌ | ✅ | ✅ | 🟡 |
-|                             ADD1 | ❌ | ✅ | ✅ | ❌ |
-|                           ARANGE | ❌ | ✅ | ✅ | ✅ |
-|                           ARGMAX | ❌ | ✅ | ✅ | ✅ |
-|                          ARGSORT | ❌ | ✅ | ✅ | ✅ |
-|                            CLAMP | ❌ | ✅ | ✅ | 🟡 |
-|                           CONCAT | ❌ | ✅ | 🟡 | ✅ |
-|                             CONT | ❌ | ✅ | 🟡 | ✅ |
-|                       CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
-|                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
-|                CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | 🟡 |
-|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
-|                              CPY | ❌ | 🟡 | 🟡 | 🟡 |
-|               CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
-|          CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
-|                    DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
-|                              DIV | ❌ | ✅ | ✅ | 🟡 |
-|                              DUP | ❌ | ✅ | 🟡 | 🟡 |
-|                              ELU | ❌ | ✅ | ❌ | 🟡 |
-|                              EXP | ❌ | ✅ | 🟡 | ❌ |
-|                   FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
-|                GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
-|                            GEGLU | ❌ | ✅ | ✅ | 🟡 |
-|                        GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
-|                      GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
-|                             GELU | ❌ | ✅ | 🟡 | 🟡 |
-|                         GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
-|                       GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
-|                         GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
-|                    GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
-|                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
-|                      HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
-|                        HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
-|                           IM2COL | ❌ | ✅ | ✅ | 🟡 |
-|                          L2_NORM | ❌ | ✅ | ✅ | ✅ |
-|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
-|                              LOG | ❌ | ✅ | ✅ | ❌ |
-|                             MEAN | ❌ | ✅ | ✅ | ✅ |
-|                              MUL | ❌ | ✅ | ✅ | 🟡 |
-|                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
-|                       MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
-|                              NEG | ❌ | ✅ | 🟡 | 🟡 |
-|                             NORM | ❌ | ✅ | ✅ | 🟡 |
-|                   OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
-|                         OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
-|                              PAD | ❌ | ✅ | ✅ | ✅ |
-|                   PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
-|                          POOL_2D | ❌ | ✅ | ✅ | ✅ |
-|                            REGLU | ❌ | ✅ | ✅ | 🟡 |
-|                             RELU | ❌ | ✅ | 🟡 | 🟡 |
-|                           REPEAT | ❌ | ✅ | 🟡 | ✅ |
-|                      REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
-|                         RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
-|                    RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
-|                     RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
-|                             ROPE | ❌ | ✅ | ✅ | ✅ |
-|                        ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
-|                        RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
-|                        RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
-|                            SCALE | ❌ | ✅ | ✅ | ✅ |
-|                              SET | ❌ | ✅ | ❌ | ✅ |
-|                         SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
-|                              SGN | ❌ | ✅ | 🟡 | ❌ |
-|                          SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
-|                             SILU | ❌ | ✅ | 🟡 | 🟡 |
-|                        SILU_BACK | ❌ | ✅ | ✅ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | 🟡 |
-|                         SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
-|                    SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | 🟡 |
-|                             SQRT | ❌ | ✅ | ✅ | 🟡 |
-|                         SSM_CONV | ❌ | ✅ | ✅ | ✅ |
-|                         SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
-|                             STEP | ❌ | ✅ | 🟡 | ❌ |
-|                              SUB | ❌ | ✅ | ✅ | 🟡 |
-|                              SUM | ❌ | ✅ | ✅ | ❌ |
-|                         SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
-|                           SWIGLU | ❌ | ✅ | ✅ | 🟡 |
-|                             TANH | ❌ | ✅ | 🟡 | 🟡 |
-|               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
-|                          UPSCALE | ❌ | ✅ | ✅ | 🟡 |
@@ -495,7 +495,7 @@ extern "C" {
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE,
+        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_PAD_REFLECT_1D,
        GGML_OP_ROLL,
@@ -1297,19 +1297,6 @@ extern "C" {
            struct ggml_tensor  * a,
            float                 s);

-    // x = s * a + b
-    GGML_API struct ggml_tensor * ggml_scale_bias(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float                 s,
-        float                 b);
-
-    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float                 s,
-        float                 b);
-
    // b -> view(a,offset,nb1,nb2,3), return modified a
    GGML_API struct ggml_tensor * ggml_set(
            struct ggml_context * ctx,
@@ -2090,7 +2090,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            {
                // TODO: add support
                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
-#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
                return false;
            } break;
        case GGML_OP_CPY: {
@@ -2189,6 +2188,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_CLAMP:
@@ -2210,10 +2210,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_COUNT_EQUAL:
            return true;
-        case GGML_OP_SCALE:
-            float bias;
-            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
-            return bias == 0.0f; // TODO: support bias != 0.0f
        case GGML_OP_SOFT_MAX:
            // TODO: support broadcast
            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
@@ -4643,11 +4643,9 @@ static void ggml_compute_forward_scale_f32(
    GGML_ASSERT(ggml_is_contiguous(dst));
    GGML_ASSERT(ggml_are_same_shape(src0, dst));

-    float s; // scale factor
-    float b; // bias
-
-    memcpy(&s, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&b, (float *) dst->op_params + 1, sizeof(float));
+    // scale factor
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));

    const int ith = params->ith;
    const int nth = params->nth;
@@ -4666,22 +4664,12 @@ static void ggml_compute_forward_scale_f32(

    const size_t nb1 = dst->nb[1];

-    if (b == 0.0f) {
-        for (int i1 = ir0; i1 < ir1; i1++) {
-            if (dst->data != src0->data) {
-                // src0 is same shape as dst => same indices
-                // TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy
-                memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
-            }
-            ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s);
-        }
-    } else {
-        for (int i1 = ir0; i1 < ir1; i1++) {
-            ggml_vec_mad1_f32(nc,
-                (float *) ((char *) dst->data  + i1*nb1),
-                (float *) ((char *) src0->data + i1*nb1),
-                s, b);
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        if (dst->data != src0->data) {
+            // src0 is same shape as dst => same indices
+            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
        }
+        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
    }
 }

@@ -351,45 +351,6 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 #endif
 }

-inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
-#if defined(GGML_USE_ACCELERATE)
-    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
-#elif defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        // scalar ; TODO: Write SVE code
-        for (int i = 0; i < n; ++i) {
-            y[i] = x[i]*s + b;
-        }
-    #else
-        const int np = (n & ~(GGML_F32_STEP - 1));
-
-        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
-        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
-
-        GGML_F32_VEC ay[GGML_F32_ARR];
-
-        for (int i = 0; i < np; i += GGML_F32_STEP) {
-            for (int j = 0; j < GGML_F32_ARR; j++) {
-                ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-                ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
-
-                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-            }
-        }
-
-        // leftovers
-        for (int i = np; i < n; ++i) {
-            y[i] = x[i]*s + b;
-        }
-    #endif
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = x[i]*s + b;
-    }
-#endif
-}
-
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
 #if defined(GGML_USE_ACCELERATE)
@@ -176,20 +176,17 @@ static const char * cu_get_error_str(CUresult err) {
 #endif

 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
-#    define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes)                                                       \
-        do {                                                                                                   \
-            static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false };                         \
-            const int   id                                                = ggml_cuda_get_device();            \
-            if (!shared_memory_limit_raised[id]) {                                                             \
-                CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
-                shared_memory_limit_raised[id] = true;                                                         \
-            }                                                                                                  \
-        } while (0)
+#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
+    do { \
+        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; \
+        const int id = ggml_cuda_get_device(); \
+        if (!shared_memory_limit_raised[id]) { \
+            CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
+            shared_memory_limit_raised[id] = true; \
+        } \
+    } while (0)
 #else
-#    define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
-        do {                                             \
-            GGML_UNUSED(nbytes);                         \
-        } while (0)
+#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) do {} while (0)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)

 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
@@ -299,14 +299,14 @@ static __global__ void flash_attn_tile_ext_f32(
    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
-    GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32);
-    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
-    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
-    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
    NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
@@ -337,15 +337,13 @@ static __global__ void flash_attn_vec_ext_f32(
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
-    GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32);
-    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
-    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
-    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
    NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
@@ -43,7 +43,6 @@
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/wkv.cuh"
 #include "ggml-cuda/gla.cuh"
-#include "ggml-cuda/set-rows.cuh"
 #include "ggml.h"

 #include <algorithm>
@@ -2231,9 +2230,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_GET_ROWS_BACK:
            ggml_cuda_op_get_rows_back(ctx, dst);
            break;
-        case GGML_OP_SET_ROWS:
-            ggml_cuda_op_set_rows(ctx, dst);
-            break;
        case GGML_OP_DUP:
            ggml_cuda_dup(ctx, dst);
            break;
@@ -2303,9 +2299,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                case GGML_UNARY_OP_EXP:
                    ggml_cuda_op_exp(ctx, dst);
                    break;
-                case GGML_UNARY_OP_ELU:
-                    ggml_cuda_op_elu(ctx, dst);
-                    break;
                default:
                    return false;
            }
@@ -3119,7 +3112,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_EXP:
-                case GGML_UNARY_OP_ELU:
                    return ggml_is_contiguous(op->src[0]);
                default:
                    return false;
@@ -3224,13 +3216,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            {
                return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
            } break;
-        case GGML_OP_SET_ROWS:
-            {
-#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
-                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
-                       op->src[0]->type == GGML_TYPE_F32 &&
-                       op->src[1]->type == GGML_TYPE_I64;
-            } break;
        case GGML_OP_CPY:
            {
                ggml_type src0_type = op->src[0]->type;
@@ -3350,8 +3335,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_SSM_SCAN: {
            if (op->src[3]->ne[0] == 1) {
                // Mamba2
-                // (kernel only supports (d_state == 128 || d_state == 256) && d_head % 16 == 0)
-                return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % 16 == 0;
+                // (kernel only supports d_state == 128 && d_head % 16 == 0)
+                return op->src[0]->ne[0] == 128 && op->src[0]->ne[1] % 16 == 0;
            } else {
                // Mamba
                // (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1)
@@ -3390,6 +3375,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_GROUP_NORM:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_PAD:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
@@ -50,19 +50,21 @@ static __global__ void rope_norm(

    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;

+    if (i0 >= n_dims) {
+        const int i = row_dst*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
    const int row_x     = row_dst % ne1;
    const int channel_x = row_dst / ne1;

    const int idst = row_dst*ne0 + i0;
    const int ix   = channel_x*s2 + row_x*s1 + i0;

-    if (i0 >= n_dims) {
-        dst[idst + 0] = x[ix + 0];
-        dst[idst + 1] = x[ix + 1];
-
-        return;
-    }
-
    const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);

    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -92,19 +94,21 @@ static __global__ void rope_neox(

    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;

+    if (i0 >= n_dims) {
+        const int i = row_dst*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
    const int row_x     = row_dst % ne1;
    const int channel_x = row_dst / ne1;

    const int idst = row_dst*ne0 + i0/2;
    const int ix   = channel_x*s2 + row_x*s1 + i0/2;

-    if (i0 >= n_dims) {
-        dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
-        dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
-
-        return;
-    }
-
    const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);

    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -134,19 +138,21 @@ static __global__ void rope_multi(

    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;

+    if (i0 >= n_dims) {
+        const int i = row_dst*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
    const int row_x     = row_dst % ne1;
    const int channel_x = row_dst / ne1;

    const int idst = row_dst*ne0 + i0/2;
    const int ix   = channel_x*s2 + row_x*s1 + i0/2;

-    if (i0 >= n_dims) {
-        dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
-        dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
-
-        return;
-    }
-
    const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
    const int sec_w = sections.v[1] + sections.v[0];
    const int sector = (i0 / 2) % sect_dims;
@@ -1,18 +1,18 @@
 #include "scale.cuh"

-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

-    dst[i] = scale * x[i] + bias;
+    dst[i] = scale * x[i];
 }

-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }

 void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -25,9 +25,7 @@ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    float scale;
-    float bias;
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&bias,  (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&scale, dst->op_params, sizeof(float));

-    scale_f32_cuda(src0_d, dst_d, scale, bias, ggml_nelements(src0), stream);
+    scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
 }
@@ -1,151 +0,0 @@
-#include "set-rows.cuh"
-
-typedef void (*set_rows_kernel_t)(const char * src, char * dst);
-
-template<typename src_t, typename dst_t>
-__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
-    GGML_UNUSED(src_f);
-    GGML_UNUSED(dst_f);
-}
-
-template<>
-__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
-    *dst_h = __float2half(*src_f);
-}
-
-template<>
-__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
-    *dst_b = *src_f;
-}
-
-template<>
-__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
-    *dst_f = *src_f;
-}
-
-template<typename src_t, typename dst_t>
-static __global__ void k_set_rows(
-        const src_t * __restrict__ src0, const int64_t * __restrict__ src1, dst_t * __restrict__ dst,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
-        const int64_t s01, const int64_t s02, const int64_t s03,
-        const int64_t s10, const int64_t s11, const int64_t s12,
-        const int64_t s1, const int64_t s2, const int64_t s3) {
-
-    const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
-    const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
-
-    if (i >= ne_total) {
-        return;
-    }
-
-    const int64_t i03 = i / (ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
-    const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
-    const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
-
-    const int64_t i12 = i03 % ne12;
-    const int64_t i11 = i02 % ne11;
-    const int64_t i10 = i01;
-
-    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
-
-    const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
-    dst_t * dst_row_ptr    = dst + dst_row*s1 + i02*s2 + i03*s3;
-
-    const src_t* src_elem = src0_row + i00;
-    dst_t* dst_elem = dst_row_ptr + i00;
-    set_rows_1(src_elem, dst_elem);
-
-    GGML_UNUSED(ne10);
-    GGML_UNUSED(ne13);
-}
-
-template<typename src_t, typename dst_t>
-static void set_rows_cuda(
-        const src_t * src0_d, const int64_t * src1_d, dst_t * dst_d,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
-        const size_t nb01, const size_t nb02, const size_t nb03,
-        const size_t nb10, const size_t nb11, const size_t nb12,
-        const size_t nb1, const size_t nb2, const size_t nb3,
-        cudaStream_t stream) {
-
-    const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
-    const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE;
-    const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE);
-    const dim3 grid_size(num_blocks);
-
-
-    const int64_t s01 = nb01/sizeof(src_t);
-    const int64_t s02 = nb02/sizeof(src_t);
-    const int64_t s03 = nb03/sizeof(src_t);
-    const int64_t s10 = nb10/sizeof(int64_t);
-    const int64_t s11 = nb11/sizeof(int64_t);
-    const int64_t s12 = nb12/sizeof(int64_t);
-    const int64_t s1  = nb1/sizeof(dst_t);
-    const int64_t s2  = nb2/sizeof(dst_t);
-    const int64_t s3  = nb3/sizeof(dst_t);
-
-    if (ne_total > 0) {
-        k_set_rows<<<grid_size, block_size, 0, stream>>>(
-            src0_d, src1_d, dst_d,
-            ne00, ne01, ne02, ne03,
-            ne10, ne11, ne12, ne13,
-            s01, s02, s03,
-            s10, s11, s12,
-            s1, s2, s3);
-    }
-}
-
-
-void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_I64);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const float * src0_d   = (const float *)src0->data;
-    const int64_t * src1_d = (const int64_t *)src1->data;
-
-    cudaStream_t stream = ctx.stream();
-
-
-
-    if (dst->type == GGML_TYPE_F32) {
-        set_rows_cuda(
-            src0_d, src1_d, (float*)dst->data,
-            ne00, ne01, ne02, ne03,
-            ne10, ne11, ne12, ne13,
-            nb01, nb02, nb03,
-            nb10, nb11, nb12,
-            nb1, nb2, nb3,
-            stream
-        );
-    } else if (dst->type == GGML_TYPE_F16) {
-        set_rows_cuda(
-            src0_d, src1_d, (half*)dst->data,
-            ne00, ne01, ne02, ne03,
-            ne10, ne11, ne12, ne13,
-            nb01, nb02, nb03,
-            nb10, nb11, nb12,
-            nb1, nb2, nb3,
-            stream
-        );
-    } else if (dst->type == GGML_TYPE_BF16) {
-        set_rows_cuda(
-            src0_d, src1_d, (nv_bfloat16*)dst->data,
-            ne00, ne01, ne02, ne03,
-            ne10, ne11, ne12, ne13,
-            nb01, nb02, nb03,
-            nb10, nb11, nb12,
-            nb1, nb2, nb3,
-            stream
-        );
-    } else {
-        GGML_ABORT("unsupported type");
-    }
-}
@@ -1,7 +0,0 @@
-#pragma once
-
-#include "common.cuh"
-
-#define CUDA_SET_ROWS_BLOCK_SIZE 256
-
-void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -107,11 +107,8 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
        if (nc == 4) {
            ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
                                                                     dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else if (nc == 3) {
-            ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
-                                                                     dst, dst_nb0, dst_nb1, dst_nb2, n_t);
        } else {
-            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
+            GGML_ABORT("Only support kernel size = 4  now.");
        }
    } else {
        if (nc == 4) {
@@ -119,13 +116,8 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
            dim3          blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
            ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
-        } else if (nc == 3) {
-            const int64_t split_n_t = 32;
-            dim3          blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
-            ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
-                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
        } else {
-            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
+            GGML_ABORT("Only support kernel size = 4 right now.");
        }
    }
 }
@@ -201,11 +201,11 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
                              const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
                              const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
                              cudaStream_t stream) {
+    const int threads = 128;
    // NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
    if (src3_nb1 == sizeof(float)) {
        // Mamba-2
        if (d_state == 128) {
-            const int threads = 128;
            GGML_ASSERT(d_state % threads == 0);
            // NOTE: can be any power of two between 4 and 64
            const int splitH = 16;
@@ -215,21 +215,10 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
                    src0, src1, src2, src3, src4, src5, src6, dst,
                    src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                    src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
-        } else if (d_state == 256) { // Falcon-H1
-            const int threads = 256;
-            // NOTE: can be any power of two between 8 and 64
-            const int splitH = 16;
-            GGML_ASSERT(head_dim % splitH == 0);
-            const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
-            ssm_scan_f32_group<16, 256><<<blocks, threads, 0, stream>>>(
-                    src0, src1, src2, src3, src4, src5, src6, dst,
-                    src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
-                    src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
        } else {
-            GGML_ABORT("doesn't support d_state!=(128 or 256).");
+            GGML_ABORT("doesn't support d_state!=128.");
        }
    } else {
-        const int threads = 128;
        // Mamba-1
        GGML_ASSERT(n_head % threads == 0);
        GGML_ASSERT(head_dim == 1);
@@ -83,10 +83,6 @@ static __device__ __forceinline__ float op_log(float x) {
    return logf(x);
 }

-static __device__ __forceinline__ float op_elu(float x) {
-    return (x > 0.f) ? x : expm1f(x);
-}
-
 template <float (*op)(float), typename T>
 static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -200,9 +196,6 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_log>(ctx, dst);
 }

-void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_unary<op_elu>(ctx, dst);
-}
 /* gated ops */

 template <float (*op)(float), typename T>
@@ -59,8 +59,6 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

-void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
 void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -22,88 +22,17 @@ static __global__ void upscale_f32(const float * x, float * dst,
    dst[index] = *( (const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00) );
 }

-static __global__ void upscale_f32_bilinear(const float * x, float * dst,
-        const int nb00, const int nb01, const int nb02, const int nb03,
-        const int ne00_src, const int ne01_src,
-        const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
-        const float sf0, const float sf1, const float sf2, const float sf3,
-        const float pixel_offset) {
-    const int64_t index              = threadIdx.x + blockIdx.x * blockDim.x;
-    const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
-
-    if (index >= dst_total_elements) {
-        return;
-    }
-
-    const int i10_dst = index % ne10_dst;
-    const int i11_dst = (index / ne10_dst) % ne11_dst;
-    const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
-    const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
-
-    const int i02_src = (int)(i12_dst / sf2);
-    const int i03_src = (int)(i13_dst / sf3);
-
-    const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
-    int y0_src    = (int)floorf(y_src_f);
-    int y1_src    = y0_src + 1;
-
-    y0_src = max(0, min(y0_src, ne01_src - 1));
-    y1_src = max(0, min(y1_src, ne01_src - 1));
-
-    float dy = y_src_f - (float)y0_src;
-    dy       = max(0.0f, min(dy, 1.0f));
-
-    float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
-    int x0_src    = (int)floorf(x_src_f);
-    int x1_src    = x0_src + 1;
-
-    x0_src = max(0, min(x0_src, ne00_src - 1));
-    x1_src = max(0, min(x1_src, ne00_src - 1));
-
-    float dx = x_src_f - (float)x0_src;
-    dx = max(0.0f, min(dx, 1.0f));
-
-    const float * p_a = (const float *)((const char *)x + (int64_t)x0_src * nb00 + (int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03);
-    const float * p_b = (const float *)((const char *)x + (int64_t)x1_src * nb00 + (int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03);
-    const float * p_c = (const float *)((const char *)x + (int64_t)x0_src * nb00 + (int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03);
-    const float * p_d = (const float *)((const char *)x + (int64_t)x1_src * nb00 + (int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03);
-
-    const float val_a = *p_a;
-    const float val_b = *p_b;
-    const float val_c = *p_c;
-    const float val_d = *p_d;
-
-    float result = val_a * (1.0f - dx) * (1.0f - dy) +
-                   val_b * dx * (1.0f - dy) +
-                   val_c * (1.0f - dx) * dy +
-                   val_d * dx * dy;
-
-    dst[index] = result;
-}
-
 static void upscale_f32_cuda(const float * x, float * dst,
        const int nb00, const int nb01, const int nb02, const int nb03,
        const int ne10, const int ne11, const int ne12, const int ne13,
        const float sf0, const float sf1, const float sf2, const float sf3,
        cudaStream_t stream) {
-    const int64_t dst_size   = ne10 * ne11 * ne12 * ne13;
-    const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;

    upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
 }

-static void upscale_f32_bilinear_cuda(const float * x, float * dst,
-        const int nb00, const int nb01, const int nb02, const int nb03,
-        const int ne00_src, const int ne01_src,
-        const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
-        const float sf0, const float sf1, const float sf2, const float sf3,
-        const float pixel_offset, cudaStream_t stream) {
-    const int64_t dst_size   = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
-    const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-
-    upscale_f32_bilinear<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
-}
-
 void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
@@ -113,25 +42,10 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    const int mode_flags = dst->op_params[0];
-    const ggml_scale_mode mode = (ggml_scale_mode)(mode_flags & 0xFF);
-
-    float sf0 = (float)dst->ne[0]/src0->ne[0];
-    float sf1 = (float)dst->ne[1]/src0->ne[1];
-    float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
    const float sf3 = (float)dst->ne[3]/src0->ne[3];

-    if (mode == GGML_SCALE_MODE_NEAREST) {
-        upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
-    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
-        float pixel_offset = 0.5f;
-        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
-            sf0          = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
-            sf1          = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
-            pixel_offset = 0.0f;
-        }
-        upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
-                                 src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-                                 sf0, sf1, sf2, sf3, pixel_offset, stream);
-    }
+    upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
 }
@@ -10,6 +10,9 @@
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__

+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
@@ -27,6 +30,7 @@
 #define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
 #define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasDestroy hipblasDestroy
 #define cublasGemmEx hipblasGemmEx
@@ -38,6 +42,7 @@
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
 #define cublasOperation_t hipblasOperation_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -139,20 +144,6 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED

-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000
-#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
-#define cublasComputeType_t hipblasComputeType_t
-#define cudaDataType_t hipDataType
-#else
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define cublasComputeType_t hipblasDatatype_t
-#define cudaDataType_t hipblasDatatype_t
-#endif
-
 #define __CUDA_ARCH__ 1300

 #if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
@@ -173,12 +173,6 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_SILU,
    GGML_METAL_KERNEL_TYPE_SILU_4,
    GGML_METAL_KERNEL_TYPE_ELU,
-    GGML_METAL_KERNEL_TYPE_ABS,
-    GGML_METAL_KERNEL_TYPE_SGN,
-    GGML_METAL_KERNEL_TYPE_STEP,
-    GGML_METAL_KERNEL_TYPE_HARDSWISH,
-    GGML_METAL_KERNEL_TYPE_HARDSIGMOID,
-    GGML_METAL_KERNEL_TYPE_EXP,
    GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
    GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
    GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
@@ -1161,12 +1155,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                            silu,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4,                          silu_4,                          true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU,                             elu,                             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ABS,                             abs,                             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SGN,                             sgn,                             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_STEP,                            step,                            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSWISH,                       hardswish,                       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSIGMOID,                     hardsigmoid,                     true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_EXP,                             exp,                             true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,                    soft_max_f16,                    has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,                  soft_max_f16_4,                  has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,                    soft_max_f32,                    has_simdgroup_reduction);
@@ -1700,12 +1688,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_NEG:
-                case GGML_UNARY_OP_ABS:
-                case GGML_UNARY_OP_SGN:
-                case GGML_UNARY_OP_STEP:
-                case GGML_UNARY_OP_HARDSWISH:
-                case GGML_UNARY_OP_HARDSIGMOID:
-                case GGML_UNARY_OP_EXP:
                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                default:
                    return false;
@@ -2274,9 +2256,7 @@ static bool ggml_metal_encode_node(
                GGML_ASSERT(ggml_is_contiguous(src0));

                float scale;
-                float bias;
-                memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(float));
-                memcpy(&bias,  ((const int32_t *) dst->op_params) + 1, sizeof(float));
+                memcpy(&scale, dst->op_params, sizeof(scale));

                int64_t n = ggml_nelements(dst);

@@ -2293,7 +2273,6 @@ static bool ggml_metal_encode_node(
                [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
                [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
-                [encoder setBytes:&bias  length:sizeof(bias)  atIndex:3];

                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
@@ -2457,78 +2436,6 @@ static bool ggml_metal_encode_node(

                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                } break;
-                case GGML_UNARY_OP_ABS:
-                {
-                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ABS].pipeline;
-
-                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                    const int64_t n = ggml_nelements(dst);
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-                case GGML_UNARY_OP_SGN:
-                {
-                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SGN].pipeline;
-
-                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                    const int64_t n = ggml_nelements(dst);
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-                case GGML_UNARY_OP_STEP:
-                {
-                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_STEP].pipeline;
-
-                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                    const int64_t n = ggml_nelements(dst);
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-                case GGML_UNARY_OP_HARDSWISH:
-                {
-                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSWISH].pipeline;
-
-                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                    const int64_t n = ggml_nelements(dst);
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-                case GGML_UNARY_OP_HARDSIGMOID:
-                {
-                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSIGMOID].pipeline;
-
-                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                    const int64_t n = ggml_nelements(dst);
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-                case GGML_UNARY_OP_EXP:
-                {
-                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_EXP].pipeline;
-
-                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                    const int64_t n = ggml_nelements(dst);
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
                default:
                {
                    GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
@@ -1014,18 +1014,16 @@ kernel void kernel_scale(
        device const float * src0,
        device       float * dst,
        constant     float & scale,
-        constant     float & bias,
        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale + bias;
+    dst[tpig] = src0[tpig] * scale;
 }

 kernel void kernel_scale_4(
        device const float4 * src0,
        device       float4 * dst,
        constant     float  & scale,
-        constant     float  & bias,
        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale + bias;
+    dst[tpig] = src0[tpig] * scale;
 }

 kernel void kernel_clamp(
@@ -1199,51 +1197,6 @@ kernel void kernel_neg(
    dst[tpig] = -src0[tpig];
 }

-kernel void kernel_abs(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = fabs(src0[tpig]);
-}
-
-kernel void kernel_sgn(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    device const float & x = src0[tpig];
-    dst[tpig] = (x > 0.0f) ? 1.0f : ((x < 0.0f) ? -1.0f : 0.0f);
-}
-
-kernel void kernel_step(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] > 0.0f ? 1.0f : 0.0f;
-}
-
-kernel void kernel_hardswish(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    device const float & x = src0[tpig];
-    dst[tpig] = x * fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f));
-}
-
-kernel void kernel_hardsigmoid(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    device const float & x = src0[tpig];
-    dst[tpig] = fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f));
-}
-
-kernel void kernel_exp(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = exp(src0[tpig]);
-}
-
 kernel void kernel_reglu(
        device const char * src0,
        device const char * src1,
@@ -88,7 +88,6 @@ set(GGML_OPENCL_KERNELS
    rms_norm
    rope
    scale
-    set_rows
    sigmoid
    silu
    softmax_4_f32
@@ -104,7 +103,6 @@ set(GGML_OPENCL_KERNELS
    tanh
    pad
    repeat
-    mul_mat_f16_f32
 )

 foreach (K ${GGML_OPENCL_KERNELS})
@@ -351,7 +351,6 @@ struct ggml_backend_opencl_context {
    cl_program program_gemv_noshuffle_general;
    cl_program program_gemv_noshuffle;
    cl_program program_get_rows;
-    cl_program program_set_rows;
    cl_program program_glu;
    cl_program program_im2col_f16;
    cl_program program_im2col_f32;
@@ -368,7 +367,6 @@ struct ggml_backend_opencl_context {
    cl_program program_mul_mv_f16_f32;
    cl_program program_mul_mv_f32_f32;
    cl_program program_mul;
-    cl_program program_mul_mat_f16_f32_tiled;
    cl_program program_div;
    cl_program program_sub;
    cl_program program_norm;
@@ -414,7 +412,6 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_soft_max, kernel_soft_max_4;
    cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
    cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
-    cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
    cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
@@ -423,7 +420,6 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mat_f16_f32_1row;
    cl_kernel kernel_mul_mat_f16_f32;
    cl_kernel kernel_mul_mat_f16_f32_l4;
-    cl_kernel kernel_mul_mat_f16_f32_tiled;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -533,16 +529,6 @@ struct ggml_backend_opencl_context {
        fclose(ftrace);
    }

-    size_t get_kernel_workgroup_size(cl_kernel kernel) const {
-        size_t workgroup_size = 0;
-        size_t ret_size = 0;
-        CL_CHECK(
-            clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
-                sizeof(size_t), &workgroup_size, &ret_size));
-        GGML_ASSERT(sizeof(size_t) == ret_size);
-        return workgroup_size;
-    }
-
    void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
 #ifdef GGML_OPENCL_PROFILING
        cl_event evt;
@@ -1017,22 +1003,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

-    // mul_mat_f16_f32_tiled
-    {
-#ifdef GGML_OPENCL_EMBED_KERNELS
-        const std::string kernel_src {
-            #include "mul_mat_f16_f32.cl.h"
-        };
-#else
-        const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
-#endif
-        backend_ctx->program_mul_mat_f16_f32_tiled =
-            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-
-        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
-        GGML_LOG_CONT(".");
-    }
-
    // mul
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1461,23 +1431,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        }
    }

-    // set_rows
-    {
-#ifdef GGML_OPENCL_EMBED_KERNELS
-        const std::string kernel_src {
-            #include "set_rows.cl.h"
-        };
-#else
-        const std::string kernel_src = read_file("set_rows.cl");
-#endif
-        backend_ctx->program_set_rows =
-            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-
-        CL_CHECK((backend_ctx->kernel_set_rows_f32  = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
-        CL_CHECK((backend_ctx->kernel_set_rows_f16  = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
-        GGML_LOG_CONT(".");
-    }
-
    // mul_mv_id_q4_0_f32_8x_flat
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2280,18 +2233,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
            {
                // TODO: add support
                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
-#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
-                if (op->src[0]->type != GGML_TYPE_F32) {
-                    return false;
-                }
-                switch (op->type) {
-                    case GGML_TYPE_F16:
-                    case GGML_TYPE_F32:
-                        return true;
-                    default:
-                        return false;
-                }
-            }
+                return false;
+            } break;
        case GGML_OP_CPY:
        case GGML_OP_DUP:
        case GGML_OP_CONT:
@@ -3431,111 +3374,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }

-static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0);
-    GGML_ASSERT(src0->extra);
-    GGML_ASSERT(src1);
-    GGML_ASSERT(src1->extra);
-    GGML_ASSERT(dst);
-    GGML_ASSERT(dst->extra);
-
-    // ne0 = ne00
-    // ne2 = ne02
-    // ne3 = ne03
-
-    const int      ne01 = src0->ne[1];
-    const int      ne02 = src0->ne[2];
-    const int      ne03 = src0->ne[3];
-
-    const cl_ulong nb01 = src0->nb[1];
-    const cl_ulong nb02 = src0->nb[2];
-    const cl_ulong nb03 = src0->nb[3];
-
-    const int      ne11 = src1->ne[1];
-    const int      ne12 = src1->ne[2];
-
-    const cl_ulong nb10 = src1->nb[0];
-    const cl_ulong nb11 = src1->nb[1];
-    const cl_ulong nb12 = src1->nb[2];
-
-    const int      ne0  = dst->ne[0];
-
-    const cl_ulong nb1  = dst->nb[1];
-    const cl_ulong nb2  = dst->nb[2];
-    const cl_ulong nb3  = dst->nb[3];
-
-    const int nblk0 = ne0/ggml_blck_size(dst->type);
-
-    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-
-    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
-    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
-    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
-
-    cl_ulong offset0 = extra0->offset + src0->view_offs;
-    cl_ulong offset1 = extra1->offset + src1->view_offs;
-    cl_ulong offsetd = extrad->offset + dst->view_offs;
-
-    cl_kernel kernel;
-
-    switch (dst->type) {
-        case GGML_TYPE_F32:
-            kernel = backend_ctx->kernel_set_rows_f32;
-            break;
-        case GGML_TYPE_F16:
-            kernel = backend_ctx->kernel_set_rows_f16;
-            break;
-        default:
-            GGML_ABORT("not implemented");
-    }
-
-    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
-    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
-    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
-    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
-    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
-    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
-    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne01));
-    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
-    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
-    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne11));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
-    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
-    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
-    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
-    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &nblk0));
-    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
-    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
-    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));
-
-    int nth0 = 64;
-    if (backend_ctx->gpu_family == INTEL) {
-        nth0 = 32;
-    } else if (backend_ctx->gpu_family == ADRENO) {
-        nth0 = 64;
-    }
-
-    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
-    while (nth0 < nblk0 && nth0 < max_workgroup_size) {
-        nth0 *= 2;
-    }
-
-    int rows_per_workgroup = 1;
-    if (nth0 > nblk0) {
-        rows_per_workgroup = nth0 / nblk0;
-        nth0 = nblk0;
-    }
-
-    size_t global_work_size[] = {
-        (size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
-        (size_t)ne02*rows_per_workgroup,
-        (size_t)ne03};
-    size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};
-
-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
-}
-
 static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
@@ -4946,58 +4784,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
 }

-static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-
-    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
-    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
-    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
-
-    cl_ulong offset0 = extra0->offset + src0->view_offs;
-    cl_ulong offset1 = extra1->offset + src1->view_offs;
-    cl_ulong offsetd = extrad->offset + dst->view_offs;
-
-    const int M = src0->ne[1];
-    const int N = src1->ne[1];
-    const int K = src0->ne[0];
-
-    cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;
-
-    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int),      &M));
-    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int),      &N));
-    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),      &K));
-    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &extra0->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
-    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem),   &extra1->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
-    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem),   &extrad->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
-
-    // Tiling parameters. These need to be tuned for optimal performance.
-    // They must match the #defines in the kernel mul_mat_f16_f32.cl.
-    //
-    // OPWM / OPWN: Output tile size per Work-Group. A work-group computes a tile of size OPWM x OPWN.
-    // TPWM / TPWN: Threads per Work-group. This is the work-group size.
-    // OPTM / OPTN: Output elements per Thread. Each thread computes OPTM x OPTN elements.
-    //
-    // The following relationships must hold:
-    //   OPWM = TPWM * OPTM
-    //   OPWN = TPWN * OPTN
-    //
-    const int OPWM = 64;
-    const int OPWN = 64;
-    const int TPWM = 16;
-    const int TPWN = 8;
-
-    size_t local_work_size[2] = { TPWM, TPWN };
-    size_t global_work_size[2] = {
-        (size_t) ((M + OPWM - 1) / OPWM) * TPWM,
-        (size_t) ((N + OPWN - 1) / OPWN) * TPWN,
-    };
-
-    backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
-}
-
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
@@ -5011,18 +4797,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

-     if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
-        src0->ne[1] > 32 &&   // M > 32
-        src1->ne[1] > 32 &&   // N > 32
-        src0->ne[0] > 32 &&   // K > 32
-        src0->ne[2] == 1 && src0->ne[3] == 1 &&
-        src1->ne[2] == 1 && src1->ne[3] == 1 &&
-        ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
-        backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
-        ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
-        return;
-    }
-
    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5813,9 +5587,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    float scale;
-    float bias;
-    memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
-    memcpy(&bias,  ((int32_t *) dst->op_params) + 1, sizeof(float));
+    memcpy(&scale, dst->op_params, sizeof(scale));

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5830,7 +5602,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float),    &scale));
-    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float),    &bias));

    int n = ggml_nelements(dst)/4;

@@ -6614,12 +6385,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            }
            func = ggml_cl_get_rows;
            break;
-        case GGML_OP_SET_ROWS:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cl_set_rows;
-            break;
        case GGML_OP_CPY:
            if (!any_on_device) {
                return false;
@@ -1,130 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#if defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-#define REQD_SUBGROUP_SIZE_128
-#endif
-
-#define OPWM 64
-#define OPWN 64
-#define CPWK 8
-#define OPTM 4
-#define OPTN 8
-
-#define WG_M (OPWM / OPTM)
-#define WG_N (OPWN / OPTN)
-#define VEC_K (CPWK / 4)
-
-REQD_SUBGROUP_SIZE_128
-__kernel void mul_mat_f16_f32(
-    const int M, const int N, const int K,
-    __global const void* A_void, ulong A_offset,
-    __global const void* B_void, ulong B_offset,
-    __global       void* C_void, ulong C_offset) {
-
-    __global const half*  A = (__global const half* )((__global const char*)A_void + A_offset);
-    __global const float* B = (__global const float*)((__global const char*)B_void + B_offset);
-    __global       float* C = (__global       float*)((__global       char*)C_void + C_offset);
-
-    const int lidm = get_local_id(0);
-    const int lidn = get_local_id(1);
-    const int lid = lidn * WG_M + lidm;
-
-    const int offsetM = get_group_id(0) * OPWM;
-    const int offsetN = get_group_id(1) * OPWN;
-
-    __local half4  Alocal[OPWM][VEC_K];
-    __local float4 Blocal[OPWN][VEC_K];
-
-    float sum[OPTM][OPTN];
-
-    for (int wm = 0; wm < OPTM; wm++) {
-        for (int wn = 0; wn < OPTN; wn++) {
-            sum[wm][wn] = 0.0f;
-        }
-    }
-
-    const int numTiles = (K + CPWK - 1) / CPWK;
-
-    const int load_row_a = lid % OPWM;
-    const int load_vec_k_a = lid / OPWM;
-    const int global_row_a = offsetM + load_row_a;
-
-    const int load_row_b = lid % OPWN;
-    const int load_vec_k_b = lid / OPWN;
-    const int global_row_b = offsetN + load_row_b;
-
-    for (int t = 0; t < numTiles; t++) {
-        const int k_start = t * CPWK;
-        const int k_vec_start_a = k_start + load_vec_k_a * 4;
-        const int k_vec_start_b = k_start + load_vec_k_b * 4;
-
-        if (global_row_a < M && k_vec_start_a < K) {
-            if (k_vec_start_a + 3 < K) {
-                Alocal[load_row_a][load_vec_k_a] = vload4(0, A + global_row_a * K + k_vec_start_a);
-            } else {
-                half4 tempA = (half4)(0.0h);
-                if (k_vec_start_a < K) tempA.s0 = A[global_row_a * K + k_vec_start_a];
-                if (k_vec_start_a + 1 < K) tempA.s1 = A[global_row_a * K + k_vec_start_a + 1];
-                if (k_vec_start_a + 2 < K) tempA.s2 = A[global_row_a * K + k_vec_start_a + 2];
-                Alocal[load_row_a][load_vec_k_a] = tempA;
-            }
-        } else {
-            Alocal[load_row_a][load_vec_k_a] = (half4)(0.0h);
-        }
-
-        if (global_row_b < N && k_vec_start_b < K) {
-            if (k_vec_start_b + 3 < K) {
-                Blocal[load_row_b][load_vec_k_b] = vload4(0, B + global_row_b * K + k_vec_start_b);
-            } else {
-                float4 tempB = (float4)(0.0f);
-                if (k_vec_start_b < K) tempB.s0 = B[global_row_b * K + k_vec_start_b];
-                if (k_vec_start_b + 1 < K) tempB.s1 = B[global_row_b * K + k_vec_start_b + 1];
-                if (k_vec_start_b + 2 < K) tempB.s2 = B[global_row_b * K + k_vec_start_b + 2];
-                Blocal[load_row_b][load_vec_k_b] = tempB;
-            }
-        } else {
-            Blocal[load_row_b][load_vec_k_b] = (float4)(0.0f);
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        #pragma unroll
-        for (int k_vec = 0; k_vec < VEC_K; k_vec++) {
-            float4 a_fvecs[OPTM];
-            int current_row_a = lidm;
-            for (int wm = 0; wm < OPTM; wm++) {
-                a_fvecs[wm] = convert_float4(Alocal[current_row_a][k_vec]);
-                current_row_a += WG_M;
-            }
-
-            float4 b_fvecs[OPTN];
-            int current_row_b = lidn;
-            for (int wn = 0; wn < OPTN; wn++) {
-                b_fvecs[wn] = Blocal[current_row_b][k_vec];
-                current_row_b += WG_N;
-            }
-
-            for (int wm = 0; wm < OPTM; wm++) {
-                for (int wn = 0; wn < OPTN; wn++) {
-                    sum[wm][wn] += dot(a_fvecs[wm], b_fvecs[wn]);
-                }
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    for (int wm = 0; wm < OPTM; wm++) {
-        int globalRow = offsetM + lidm + wm * WG_M;
-        if (globalRow < M) {
-            for (int wn = 0; wn < OPTN; wn++) {
-                int globalCol = offsetN + lidn + wn * WG_N;
-                if (globalCol < N) {
-                    C[globalCol * M + globalRow] = sum[wm][wn];
-                }
-            }
-        }
-    }
-}
@@ -8,10 +8,9 @@ kernel void kernel_scale(
        ulong offset0,
        global float4 * dst,
        ulong offsetd,
-        float scale,
-        float bias
+        float scale
 ) {
    src0 = (global float4*)((global char*)src0 + offset0);
    dst = (global float4*)((global char*)dst + offsetd);
-    dst[get_global_id(0)] = src0[get_global_id(0)] * scale + bias;
+    dst[get_global_id(0)] = src0[get_global_id(0)] * scale;
 }
@@ -1,95 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-kernel void kernel_set_rows_f32(
-        global char * src0,
-        ulong         offset0,
-        global char * src1,
-        ulong         offset1,
-        global char * dst,
-        ulong         offsetd,
-        int           ne01,
-        ulong         nb01,
-        ulong         nb02,
-        ulong         nb03,
-        int           ne11,
-        int           ne12,
-        ulong         nb10,
-        ulong         nb11,
-        ulong         nb12,
-        int           nblk0,
-        ulong         nb1,
-        ulong         nb2,
-        ulong         nb3
-) {
-    src0 = src0 + offset0;
-    src1 = src1 + offset1;
-    dst  = dst  + offsetd;
-
-    int i03 = get_group_id(2);
-    int i02 = get_group_id(1);
-    int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
-
-    if (i01 >= ne01) {
-        return;
-    }
-
-    int i12 = i03%ne12;
-    int i11 = i02%ne11;
-
-    int i10 = i01;
-    long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
-
-    global float * dst_row = (global float *) (dst  +  i1*nb1  + i02*nb2  + i03*nb3);
-    global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
-
-    for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
-        dst_row[ind] = (float)src_row[ind];
-    }
-}
-
-kernel void kernel_set_rows_f16(
-        global char * src0,
-        ulong         offset0,
-        global char * src1,
-        ulong         offset1,
-        global char * dst,
-        ulong         offsetd,
-        int           ne01,
-        ulong         nb01,
-        ulong         nb02,
-        ulong         nb03,
-        int           ne11,
-        int           ne12,
-        ulong         nb10,
-        ulong         nb11,
-        ulong         nb12,
-        int           nblk0,
-        ulong         nb1,
-        ulong         nb2,
-        ulong         nb3
-) {
-    src0 = src0 + offset0;
-    src1 = src1 + offset1;
-    dst  = dst  + offsetd;
-
-    int i03 = get_group_id(2);
-    int i02 = get_group_id(1);
-    int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
-
-    if (i01 >= ne01) {
-        return;
-    }
-
-    int i12 = i03%ne12;
-    int i11 = i02%ne11;
-
-    int i10 = i01;
-    long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
-
-    global half  * dst_row = (global half  *) (dst  +  i1*nb1  + i02*nb2  + i03*nb3);
-    global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
-
-    for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
-        dst_row[ind] = src_row[ind];
-    }
-}
@@ -30,7 +30,6 @@
 #include "outprod.hpp"
 #include "quants.hpp"
 #include "rope.hpp"
-#include "set_rows.hpp"
 #include "softmax.hpp"
 #include "tsembd.hpp"
 #include "wkv.hpp"
@@ -32,28 +32,39 @@ public:
        else static_assert(0);
    }

+    // matrix A has m rows, k columns
+    // matrix B has k rows, n columns
+    // nra - number of elements to skip when moving into next row in A
+    // nrb - number of elements to skip when moving into next row in B
+    // nca - number of elements to skip when moving into next column in A
+    // ncb - number of elements to skip when moving into next column in B
+    // stride_a - number of elements to skip when moving to next A matrix
+    // stride_b - number of elements to skip when moving to next B matrix
+    // batches_a - number of A matrices
+    // batches_b - number of B matrices
    static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
-        const void * a, dt at, dnnl_dim_t stra0, dnnl_dim_t stra1, dnnl_dim_t stra2,
-        const void * b, dt bt, dnnl_dim_t strb0, dnnl_dim_t strb1, dnnl_dim_t strb2,
+        const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a,
+        const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b,
        void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) {

        auto stream = ctx.stream_dnnl(q);
        auto eng = ctx.engine_dnnl(q);

-        dnnl::memory::dims a_dims = {batches_a, m, k };
-        dnnl::memory::dims a_strides = {stra2, stra1, stra0};
+        // { # matrices (batches), # rows, # columns }
+        dnnl::memory::dims a_dims = { batches_a, m, k };
+        dnnl::memory::dims b_dims = { batches_b, k, n };
+        dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n };
+
+        // { # elements to skip to next matrix, # elements to skip to next row, # elements to skip to next column }
+        dnnl::memory::dims a_strides = { stride_a, nra, nca };
+        dnnl::memory::dims b_strides = { stride_b, nrb, ncb };
+
        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides);
-
-        dnnl::memory::dims b_dims = {batches_b, k, n };
-        dnnl::memory::dims b_strides = {strb2, strb0, strb1};
        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
+        const auto c_md    = dnnl::memory::desc(c_dims, ct, tag::abc);

-        dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n};
-        dnnl::memory::dims c_strides = {m*n, 1,  m };
-        const auto c_md    = dnnl::memory::desc(c_dims, ct, c_strides);
        dnnl::primitive_attr primitive_attr;
        primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
-
 #ifdef GGML_SYCL_F16
        primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
 #endif
@@ -65,23 +76,24 @@ public:

        auto scratchpad_md = matmul_pd.scratchpad_desc();
        auto scratchpad_mem = ctx.get_scratchpad_mem(scratchpad_md, eng, q);
-
        auto matmul_prim = dnnl::matmul(matmul_pd);

        std::unordered_map<int, dnnl::memory> matmul_args;
        matmul_args.insert({ DNNL_ARG_SRC, a_mem });
        matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
-
        matmul_args.insert({ DNNL_ARG_DST, c_mem });
        matmul_args.insert({ DNNL_ARG_SCRATCHPAD, scratchpad_mem });

        matmul_prim.execute(stream, matmul_args);
    }

+    // matrices A and B are column major, both having k rows
+    // matrix A has m columns, matrix B has n columns
+    // output: column major matrix C = A transposed * B
    static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
        const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {

-        gemm(ctx, m, n, k, a, at, 1, k, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
+        gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
    }
 };

@@ -41,7 +41,6 @@
 #include "ggml-sycl/element_wise.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
-#include "ggml-sycl/set_rows.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
 #include "ggml-sycl/getrows.hpp"
 #include "ggml.h"
@@ -1546,7 +1545,7 @@ static void mul_mat_p021_f16_f32(

 static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x,const int channel_stride_y, const int channel_x_divisor,
+    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
    const sycl::nd_item<3> &item_ct1) {

    const sycl::half *x = (const sycl::half *)vx;
@@ -1557,6 +1556,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
                        item_ct1.get_local_id(0);
    const int channel_x = channel / channel_x_divisor;

+    const int nrows_y   = ncols_x;
    const int nrows_dst = nrows_x;
    const int row_dst   = row_x;

@@ -1575,7 +1575,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
        const int row_y = col_x;

        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const int iy = channel * channel_stride_y + row_y;
+        const int iy = channel*nrows_y + row_y;

        const float xi =
            sycl::vec<sycl::half, 1>(x[ix])
@@ -1695,7 +1695,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }

-static void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k,
+static void scale_f32(const float * x, float * dst, const float scale, const int k,
                      const sycl::nd_item<3> &item_ct1) {
    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                  item_ct1.get_local_id(2);
@@ -1704,7 +1704,7 @@ static void scale_f32(const float * x, float * dst, const float scale, const flo
        return;
    }

-    dst[i] = scale * x[i] + bias;
+    dst[i] = scale * x[i];
 }


@@ -1822,7 +1822,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
 static void ggml_mul_mat_vec_nc_f16_f32_sycl(
    const void *vx, const float *y, float *dst, const int ncols_x,
    const int nrows_x, const int row_stride_x, const int nchannels_x,
-    const int nchannels_y, const int channel_stride_x, const int channel_stride_y, queue_ptr stream) {
+    const int nchannels_y, const int channel_stride_x, queue_ptr stream) {

    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -1834,7 +1834,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
-                                       row_stride_x, channel_stride_x, channel_stride_y,
+                                       row_stride_x, channel_stride_x,
                                       nchannels_y / nchannels_x, item_ct1);
            });
    }
@@ -1842,7 +1842,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(



-static void scale_f32_sycl(const float *x, float *dst, const float scale, const float bias,
+static void scale_f32_sycl(const float *x, float *dst, const float scale,
                           const int k, queue_ptr stream) {
    const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
    stream->parallel_for(
@@ -1850,7 +1850,7 @@ static void scale_f32_sycl(const float *x, float *dst, const float scale, const
                              sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
-            scale_f32(x, dst, scale, bias, k, item_ct1);
+            scale_f32(x, dst, scale, k, item_ct1);
        });
 }

@@ -2123,8 +2123,8 @@ inline void ggml_sycl_op_mul_mat_sycl(

 #if GGML_SYCL_DNNL
        if (!g_ggml_sycl_disable_dnn) {
-                DnnlGemmWrapper::row_gemm(ctx,row_diff, src1_ncols , ne10, src0_ptr,
-                                     DnnlGemmWrapper::to_dt<sycl::half>(), src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
+            DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
+                                      DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
                                      dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
        }
        else
@@ -2170,8 +2170,8 @@ inline void ggml_sycl_op_mul_mat_sycl(

 #if GGML_SYCL_DNNL
        if (!g_ggml_sycl_disable_dnn) {
-            DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
-                                      DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
+            DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
+                                      DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
                                      dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
        }
        else
@@ -2319,11 +2319,9 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * ds
    float *       dst_dd  = static_cast<float *>(dst->data);

    float scale;
-    float bias;
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&bias,  (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&scale, dst->op_params, sizeof(float));

-    scale_f32_sycl(src0_dd, dst_dd, scale, bias, ggml_nelements(dst->src[0]), main_stream);
+    scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream);
    /*
    DPCT1010:87: SYCL uses exceptions to report errors and does not use the
    error codes. The call was replaced with 0. You need to rewrite this code.
@@ -2775,7 +2773,6 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
    const int64_t nb02 = src0->nb[2];

    const int64_t ne12 = src1->ne[2];
-    const int64_t nb11 = src1->nb[1];

    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    queue_ptr main_stream = ctx.stream();
@@ -2786,9 +2783,8 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml

    const int64_t row_stride_x = nb01 / sizeof(sycl::half);
    const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
-    const int64_t channel_stride_y = nb11 / sizeof(float);

-    ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,channel_stride_y, main_stream);
+    ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2842,8 +2838,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    float *            dst_ddf  = static_cast<float *>(dst->data);

    const sycl::half * src1_f16       = static_cast<const sycl::half *>(src1->data);
-    const size_t       type_size_src0 = ggml_type_size(src0->type);
    const size_t       type_size_src1 = ggml_type_size(src1->type);
+    GGML_ASSERT(nb10 == type_size_src1);

    // SRC1 strides
    int64_t                          s11 = nb11 / type_size_src1;
@@ -2855,40 +2851,11 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    if (src1->type != GGML_TYPE_F16) {
        scope_op_debug_print    scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
                                                " : converting src1 to fp16");
-
-        // iterate tensor dims and find the slowest moving dim and stride
-        int64_t last_dim=0;
-        int64_t last_str=0;
-        int64_t largest_str=0;
-        for(int i = 0; i< 4; i++){
-            // last stride is always the largest
-            if(src1->nb[i] == largest_str){
-                if(src1->ne[last_dim] == 1){
-                    last_str = i;
-                    last_dim = i;
-                }
-            }
-            if(src1->nb[i] > largest_str){
-                largest_str = src1->nb[i];
-                last_str = i;
-                last_dim = i;
-            }
-
-        }
-#if GGML_SYCL_DNNL
-        // oneDNN handles strided data and does not need overhead of get_to_fp16_nc_sycl
-        const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1;
-        src1_f16_alloc.alloc(ne_src1);
-        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
-        GGML_ASSERT(to_fp16_sycl != nullptr);
-        to_fp16_sycl(src1_f16, src1_f16_alloc.get(), ne_src1, queue);
-# else
-        const int64_t ne_src1 = ggml_nelements(src1);
-        src1_f16_alloc.alloc(ne_src1);
        const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
        GGML_ASSERT(to_fp16_nc_sycl != nullptr);
+        const int64_t ne_src1 = ggml_nelements(src1);
+        src1_f16_alloc.alloc(ne_src1);
        to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
-#endif

        src1_f16 = src1_f16_alloc.get();
        s11      = ne10;
@@ -2922,89 +2889,38 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons

 #if GGML_SYCL_DNNL
    if (!g_ggml_sycl_disable_dnn) {
-            int64_t str_a0 = nb00 / type_size_src0;
-            int64_t str_a1 = nb01 / type_size_src0;
-            int64_t str_a2 = nb02 / type_size_src0;
+        auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
+            (const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {

-            int64_t str_b0 = nb10 / type_size_src1;
-            int64_t str_b1 = nb11 / type_size_src1;
-            int64_t str_b2 = nb12 / type_size_src1;
+            DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
+                            src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
+                            src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
+                            dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
+        };

-            auto launch_gemm_for_batches = [&ctx, queue](const sycl::half *src0,
-                                                const sycl::half *src1, float *dst,
-                                                int64_t a0, int64_t a1, int64_t batcha,
-                                                int64_t b0, int64_t b1, int64_t batchb,
-                                                int64_t sa0, int64_t sa1, int64_t sa2,
-                                                int64_t sb0, int64_t sb1, int64_t sb2,
-                                                int64_t sd2) {
-                bool supported_broadcast = batchb == batcha ? true
-                        : batchb == 1 || batcha == 1        ? true
-                                                            : false;
-                if (supported_broadcast) {
-                    DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0,
-                            DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2, src1,
-                            DnnlGemmWrapper::to_dt<sycl::half>(), sb0, sb1, sb2, dst,
-                            DnnlGemmWrapper::to_dt<float>(), queue, batcha, batchb);
-                } else {
-                    // iterate over batches from smaller set of matrices (matrix 0)
-                    int64_t batches0 = batcha;
-                    int64_t batches1 = batchb;
-
-                    if (batches0 > batches1) {
-                        int64_t num_mul_mats = batches1;
-                        int64_t sub_batch = batches0 / num_mul_mats;
-                        // src0 is batched and bigger, shift and multiply with src1
-                        for (int64_t i0 = 0; i0 < num_mul_mats; i0++) {
-                            const sycl::half *src0_shifted = src0 + (sa2 * i0 * sub_batch);
-                            const sycl::half *src1_shifted = src1 + (sb2 * i0);
-                            float *dst_shifted = dst + (sd2 * i0 * sub_batch);
-                            DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
-                                    DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
-                                    src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
-                                    sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
-                                    queue, sub_batch, 1);
-                        }
-                    } else {
-                        int64_t num_mul_mats = batches0;
-                        int64_t sub_batch = batches1 / num_mul_mats;
-                        // src1 is batched and bigger, shift and multiply with src0
-                        for (int64_t i1 = 0; i1 < num_mul_mats; i1++) {
-                            const sycl::half *src0_shifted = src0 + (sa2 * i1);
-                            const sycl::half *src1_shifted = src1 + (sb2 * i1 * sub_batch);
-                            float *dst_shifted = dst + (sd2 * i1 * sub_batch);
-                            DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
-                                    DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
-                                    src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
-                                    sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
-                                    queue, 1, sub_batch);
-                        }
-                    }
-                }
-            };
-
-            bool cont_batches_a = nb02 * ne02 == nb03;
-            bool cont_batches_b = nb12 * ne12 == nb13;
-            if (cont_batches_a && cont_batches_b) {
-                int64_t batches0 = ne02 * ne03;
-                int64_t batches1 = ne12 * ne13;
-                launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
-                        ne10, ne11, batches1, str_a0, str_a1, str_a2, str_b0, str_b1,
-                        str_b2, nb2 / sizeof(float));
-            } else {
-                for (int64_t b_a = 0; b_a < ne03; b_a++) {
-                    const sycl::half *src0_f16_shifted
-                            = src0_f16 + (nb03 * b_a / type_size_src0);
-                    const sycl::half *src1_f16_shifted
-                            = src1_f16 + (nb13 * b_a / type_size_src1);
-                    float *dst_shifted = dst_ddf + (nb3 * b_a / sizeof(float));
-                    int64_t batches0 = ne02;
-                    int64_t batches1 = ne12;
-                    launch_gemm_for_batches(src0_f16_shifted, src1_f16_shifted, dst_shifted,
-                            ne00, ne01, batches0, ne10, ne11, batches1, str_a0, str_a1,
-                            str_a2, str_b0, str_b1, str_b2, nb2 / sizeof(float));
+        if (r2 == 1 && r3 == 1) {
+            if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+                dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
+            }
+            else {
+                for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
+                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
+                    const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
+                    float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
+                    dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
                }
            }
-
+        } else {
+            // iterate over batches from smaller set of matrices (matrix 0)
+            for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
+                for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
+                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
+                    const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
+                    float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
+                    dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
+                }
+            }
+        }
    }
    else
 #endif
@@ -3344,10 +3260,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
            // The kernel from the if path is faster for that specific case, but does not support all mul mats.
            ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
        }
-    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2] * src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch
        ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
@@ -3687,9 +3603,6 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_GET_ROWS:
            ggml_sycl_get_rows(ctx, dst);
            break;
-        case GGML_OP_SET_ROWS:
-            ggml_sycl_op_set_rows(ctx, dst);
-            break;
        case GGML_OP_DUP:
            ggml_sycl_dup(ctx, dst);
            break;
@@ -4384,8 +4297,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
            {
                // TODO: add support
                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
-#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
-                return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
+                return false;
            } break;
        case GGML_OP_CPY:
            {
@@ -47,17 +47,18 @@ static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const

    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);

+    if (i0 >= n_dims) {
+        const int i = row * ne0 + i0;
+        *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
+        return;
+    }
+
    const int row0     = row % ne1;
    const int channel0 = row / ne1;

    const int i  = row * ne0 + i0;
    const int i2 = channel0 * s2 + row0 * s1 + i0;

-    if (i0 >= n_dims) {
-        *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i2);
-        return;
-    }
-
    const float theta_base = pos[channel0] * sycl::pow(theta_scale, i0 / 2.0f);

    const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
@@ -87,17 +88,18 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const

    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);

+    if (i0 >= n_dims) {
+        const int i = row * ne0 + i0;
+        *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
+        return;
+    }
+
    const int row0     = row % ne1;
    const int channel0 = row / ne1;

    const int i  = row * ne0 + i0 / 2;
    const int i2 = channel0 * s2 + row0 * s1 + i0 / 2;

-    if (i0 >= n_dims) {
-        *reinterpret_cast<sycl::vec<T, 2> *>(dst + i + i0 / 2) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i2 + i0 / 2);
-        return;
-    }
-
    const float theta_base = pos[channel0] * sycl::pow(theta_scale, i0 / 2.0f);

    const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
@@ -127,16 +129,17 @@ static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const
    }
    const int    row_dst   = (item_ct1.get_group(2) * item_ct1.get_local_range(2)) + item_ct1.get_local_id(2);

+    if (i0 >= n_dims) {
+        const int i = row_dst*ne0 + i0;
+        *reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
+        return;
+    }
+
    const int    row_x     = row_dst % ne1;
    const int    channel_x = row_dst / ne1;
    const int    idst      = (row_dst * ne0) + (i0 / 2);
    const size_t ix        = ((size_t) channel_x * s2) + ((size_t) row_x * s1) + (i0 / 2);

-    if (i0 >= n_dims) {
-        *reinterpret_cast<sycl::vec<T, 2> *>(dst + idst + i0 / 2) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i0 / 2 + ix);
-        return;
-    }
-
    const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
    const int sec_w = sections.v[1] + sections.v[0];
    const int sector = (i0 / 2) % sect_dims;
@@ -1,131 +0,0 @@
-#include "set_rows.hpp"
-
-namespace utils {
-template<typename T>
-static constexpr bool is_arithmetic_v() {
-    return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
-}
-}
-
-template<typename TIn, typename TOut>
-static inline std::enable_if_t<utils::is_arithmetic_v<TIn>() && utils::is_arithmetic_v<TOut>(), void>
-convert (const char* src, char* dst) {
-    auto src_val = *reinterpret_cast<const TIn*>(src);
-    auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut, sycl::rounding_mode::automatic>()[0];
-   *reinterpret_cast<TOut*>(dst) = dst_val;
-}
-
-template<typename TIn, typename TOut>
-static void k_set_rows(
-        const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02,
-        const int64_t ne11, const int64_t ne12,
-        const size_t nb01, const size_t nb02, const size_t nb03,
-        const size_t nb10, const size_t nb11, const size_t nb12,
-        const size_t nb1, const size_t nb2, const size_t nb3,
-        const size_t src_type_size, const size_t dst_type_size,
-        const int64_t total_elements,
-        const sycl::nd_item<1> & item_ct1) {
-
-    const int64_t i = item_ct1.get_global_linear_id();
-    if (i >= total_elements) {
-        return;
-    }
-
-    const int64_t i03 = i / (ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
-    const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
-    const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
-
-    const int64_t i12 = i03 % ne12;
-    const int64_t i11 = i02 % ne11;
-    const int64_t i10 = i01;
-
-    const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
-
-    const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
-    const char * src_elem = src0_row + i00 * src_type_size;
-    char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
-    char * dst_elem = dst_row_ptr + i00 * dst_type_size;
-
-    convert<TIn, TOut>(src_elem, dst_elem);
-}
-
-template<typename TIn, typename TOut>
-static void set_rows_sycl(
-        const char * src0_d, const int64_t * src1_d, char * dst_d,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-        const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03,
-        const size_t nb10, const size_t nb11, const size_t nb12,
-        const size_t nb1, const size_t nb2, const size_t nb3,
-        const size_t src_type_size, const size_t dst_type_size,
-        queue_ptr stream) {
-
-    const int64_t total_elements = ne00 * ne01 * ne02 * ne03;
-
-    constexpr int block_size = 64;
-    const int64_t grid_size = ceil_div(total_elements, block_size);
-
-    sycl_parallel_for(
-        stream,
-        sycl::nd_range<1>(grid_size * block_size, block_size),
-        [=](sycl::nd_item<1> item_ct1) {
-            k_set_rows<TIn, TOut>(
-                src0_d, src1_d, dst_d,
-                ne00, ne01, ne02,
-                ne11, ne12,
-                nb01, nb02, nb03,
-                nb10, nb11, nb12,
-                nb1, nb2, nb3,
-                src_type_size, dst_type_size,
-                total_elements,
-                item_ct1
-            );
-        }
-    );
-}
-
-void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I64);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t * src1_dd = static_cast<const int64_t *>(src1->data);
-
-    dpct::queue_ptr stream = ctx.stream();
-    switch (dst->type) {
-        case GGML_TYPE_F32:
-            set_rows_sycl<float, float>(
-                (const char *)src0->data, src1_dd, (char *)dst->data,
-                ne00, ne01, ne02, ne03,
-                ne11, ne12,
-                nb01, nb02, nb03,
-                nb10, nb11, nb12,
-                nb1, nb2, nb3,
-                sizeof(float), sizeof(float),
-                stream
-            );
-            break;
-        case GGML_TYPE_F16:
-            dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
-            set_rows_sycl<float, sycl::half>(
-                (const char *)src0->data, src1_dd, (char *)dst->data,
-                ne00, ne01, ne02, ne03,
-                ne11, ne12,
-                nb01, nb02, nb03,
-                nb10, nb11, nb12,
-                nb1, nb2, nb3,
-                sizeof(float), sizeof(sycl::half),
-                stream
-            );
-            break;
-        default:
-            GGML_ABORT("Unsupported tensor type!");
-            break;
-    }
-}
@@ -1,8 +0,0 @@
-#ifndef GGML_SYCL_SET_ROWS_HPP
-#define GGML_SYCL_SET_ROWS_HPP
-
-#include "common.hpp"
-
-void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
-
-#endif // GGML_SYCL_SET_ROWS_HPP
@@ -425,20 +425,18 @@ struct vk_device_struct {
    vk_pipeline pipeline_div_norepeat[2][2][2];

    vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
-    vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32;
+    vk_pipeline pipeline_upscale_f32;
    vk_pipeline pipeline_scale_f32;
    vk_pipeline pipeline_sqr_f32;
    vk_pipeline pipeline_sin_f32;
    vk_pipeline pipeline_cos_f32;
    vk_pipeline pipeline_clamp_f32;
    vk_pipeline pipeline_pad_f32;
-    vk_pipeline pipeline_roll_f32;
    vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16;
    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
    vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
    vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_set_rows[GGML_TYPE_COUNT];
    vk_pipeline pipeline_norm_f32;
    vk_pipeline pipeline_group_norm_f32;
    vk_pipeline pipeline_rms_norm_f32;
@@ -695,37 +693,6 @@ struct vk_op_unary_push_constants {
 };
 static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");

-static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) {
-    GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst)));
-    ne = ne != 0 ? ne : ggml_nelements(dst);
-    GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
-
-    vk_op_unary_push_constants p{};
-    p.ne = (uint32_t)ne;
-
-    size_t src0_tsize = ggml_type_size(src0->type);
-    p.ne00 = (uint32_t)src0->ne[0];
-    p.ne01 = (uint32_t)src0->ne[1];
-    p.ne02 = (uint32_t)src0->ne[2];
-    p.ne03 = (uint32_t)src0->ne[3];
-    p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
-    p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
-    p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
-    p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
-
-    size_t dst_tsize = ggml_type_size(dst->type);
-    p.ne10 = (uint32_t)dst->ne[0];
-    p.ne11 = (uint32_t)dst->ne[1];
-    p.ne12 = (uint32_t)dst->ne[2];
-    p.ne13 = (uint32_t)dst->ne[3];
-    p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
-    p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
-    p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
-    p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
-
-    return p; // fastdiv values and offsets are initialized later in ggml_vk_op
-}
-
 // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
 // Precompute mp (m' in the paper) and L such that division
 // can be computed using a multiply (high 32b of 64b result)
@@ -895,7 +862,6 @@ struct vk_op_conv2d_dw_push_constants {

 struct vk_op_upscale_push_constants {
    uint32_t ne; uint32_t a_offset; uint32_t d_offset;
-    uint32_t ne00; uint32_t ne01;
    uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
    float sf0; float sf1; float sf2; float sf3;
@@ -1769,14 +1735,7 @@ static FaHeadSizes fa_get_head_sizes(uint32_t hsk, uint32_t hsv) {
 // number of rows/cols for flash attention shader
 static constexpr uint32_t flash_attention_num_small_rows = 32;
 static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
-
-static uint32_t get_fa_scalar_num_large_rows(uint32_t hsv) {
-    if (hsv >= 512) {
-        return 2;
-    } else {
-        return 8;
-    }
-}
+static constexpr uint32_t scalar_flash_attention_num_large_rows = 8;

 // The FA coopmat1 shader assumes 16x16x16 matrix multiply support.
 // 128 threads split into four subgroups, each subgroup does 1/4
@@ -1801,7 +1760,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3
        if (small_rows) {
            return {scalar_flash_attention_num_small_rows, 64};
        } else {
-            return {get_fa_scalar_num_large_rows(hsv), 32};
+            return {scalar_flash_attention_num_large_rows, 32};
        }
    }

@@ -1820,11 +1779,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3

    // small cols to reduce register count
    if (ggml_is_quantized(type) || hsk >= 256) {
-        if (hsk >= 512) {
-            return {32, 32};
-        } else {
-            return {64, 32};
-        }
+        return {64, 32};
    }
    return {64, 64};
 }
@@ -1866,7 +1821,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
    const uint32_t warps = warptile[0] / warptile[10];

    const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
-    const uint32_t mmid_row_ids = mul_mat_id ? (4096 * sizeof(uint32_t) + 4/*_ne1*/) : 0;
+    const uint32_t mmid_row_ids = mul_mat_id ? 4096 * sizeof(uint32_t) : 0;
    const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0;

    const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
@@ -1991,10 +1946,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
        s_mmq_wg_denoms_k = { 32,  32, 1 };

        // spec constants and tile sizes for quant matmul_id
-        l_warptile_mmqid = { 256, 128, 128, 16, 0 };
+        l_warptile_mmqid = { 256, 128, 64, 16, 0 };
        m_warptile_mmqid = { 256, 128, 64, 16, 0 };
        s_warptile_mmqid = { 256, 128, 64, 16, 0 };
-        l_mmqid_wg_denoms = { 128, 128, 1 };
+        l_mmqid_wg_denoms = { 128, 64, 1 };
        m_mmqid_wg_denoms = { 128, 64, 1 };
        s_mmqid_wg_denoms = { 128, 64, 1 };

@@ -2751,7 +2706,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl_f32",  get_rows_iq4_nl_f32_len,  get_rows_iq4_nl_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 4 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true);
    ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);

    for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
@@ -2783,41 +2738,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

    if (device->float_controls_rte_fp16) {
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
    } else {
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
-    }
-
-    if (device->float_controls_rte_fp16) {
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F32],  "set_rows_f32",  set_rows_f32_rte_len,  set_rows_f32_rte_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F16],  "set_rows_f16",  set_rows_f16_rte_len,  set_rows_f16_rte_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_BF16], "set_rows_bf16", set_rows_bf16_rte_len, set_rows_bf16_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_0], "set_rows_q4_0", set_rows_q4_0_rte_len, set_rows_q4_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_1], "set_rows_q4_1", set_rows_q4_1_rte_len, set_rows_q4_1_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_0], "set_rows_q5_0", set_rows_q5_0_rte_len, set_rows_q5_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_1], "set_rows_q5_1", set_rows_q5_1_rte_len, set_rows_q5_1_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q8_0], "set_rows_q8_0", set_rows_q8_0_rte_len, set_rows_q8_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_IQ4_NL], "set_rows_iq4_nl", set_rows_iq4_nl_rte_len, set_rows_iq4_nl_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-    } else {
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F32],  "set_rows_f32",  set_rows_f32_len,  set_rows_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F16],  "set_rows_f16",  set_rows_f16_len,  set_rows_f16_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_BF16], "set_rows_bf16", set_rows_bf16_len, set_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_0], "set_rows_q4_0", set_rows_q4_0_len, set_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_1], "set_rows_q4_1", set_rows_q4_1_len, set_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_0], "set_rows_q5_0", set_rows_q5_0_len, set_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_1], "set_rows_q5_1", set_rows_q5_1_len, set_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q8_0], "set_rows_q8_0", set_rows_q8_0_len, set_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
-        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_IQ4_NL], "set_rows_iq4_nl", set_rows_iq4_nl_len, set_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
    }

    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
@@ -2857,9 +2790,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);

-    ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

@@ -2871,8 +2802,6 @@ static void ggml_vk_load_shaders(vk_device& device) {

    ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

-    ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
-
    ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

@@ -6119,7 +6048,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
    // Needs to be kept up to date on shader changes
    GGML_UNUSED(hsv);
    const uint32_t wg_size = scalar_flash_attention_workgroup_size;
-    const uint32_t Br = get_fa_scalar_num_large_rows(hsv);
+    const uint32_t Br = scalar_flash_attention_num_large_rows;
    const uint32_t Bc = scalar_flash_attention_Bc;

    const uint32_t tmpsh = wg_size * sizeof(float);
@@ -6244,7 +6173,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    case FA_SCALAR:
    case FA_COOPMAT1:
        // We may switch from coopmat1 to scalar, so use the scalar limit for both
-        max_gqa = get_fa_scalar_num_large_rows(HSV);
+        max_gqa = scalar_flash_attention_num_large_rows;
        break;
    case FA_COOPMAT2:
        max_gqa = get_fa_num_small_rows(FA_COOPMAT2);
@@ -6323,13 +6252,13 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count : 16;

    // Try to use split_k when KV is large enough to be worth the overhead
-    if (workgroups_x == 1 && shader_core_count > 0) {
+    if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) {
        // Try to run two workgroups per SM.
        split_k = shader_core_count * 2 / (workgroups_y * workgroups_z);
        if (split_k > 1) {
            // Try to evenly split KV into split_k chunks, but it needs to be a multiple
            // of "align", so recompute split_k based on that.
-            split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), pipelines[1]->align);
+            split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align);
            split_k = CEIL_DIV(KV, split_kv);
            workgroups_x = split_k;
        }
@@ -6463,7 +6392,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
                                    },
-                                    pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
+                                    pc2, { (uint32_t)ne1, 1, (uint32_t)ne3 });
    } else {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
                                    {
@@ -6539,16 +6468,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
        }
        return nullptr;
    case GGML_OP_UPSCALE:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            int mode = ggml_get_op_params_i32(dst, 0);
-            switch (mode) {
-                case GGML_SCALE_MODE_NEAREST:
-                    return ctx->device->pipeline_upscale_nearest_f32;
-                case GGML_SCALE_MODE_BILINEAR:
-                    return ctx->device->pipeline_upscale_bilinear_f32;
-                case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS:
-                    return ctx->device->pipeline_upscale_bilinear_ac_f32;
-            }
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) {
+            return ctx->device->pipeline_upscale_f32;
        }
        return nullptr;
    case GGML_OP_SCALE:
@@ -6581,11 +6502,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_pad_f32;
        }
        return nullptr;
-    case GGML_OP_ROLL:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_roll_f32;
-        }
-        return nullptr;
    case GGML_OP_REPEAT:
        if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
            return ctx->device->pipeline_repeat_f32;
@@ -6600,8 +6516,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
    case GGML_OP_CONT:
    case GGML_OP_DUP:
        return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
-    case GGML_OP_SET_ROWS:
-        return ctx->device->pipeline_set_rows[dst->type];
    case GGML_OP_SILU_BACK:
        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_silu_back_f32;
@@ -6840,7 +6754,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
    case GGML_OP_RMS_NORM:
    case GGML_OP_CONV_2D_DW:
    case GGML_OP_IM2COL:
-    case GGML_OP_SET_ROWS:
        return true;
    default:
        return false;
@@ -7135,7 +7048,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
-    case GGML_OP_ROLL:
    case GGML_OP_REPEAT:
    case GGML_OP_REPEAT_BACK:
    case GGML_OP_CPY:
@@ -7155,12 +7067,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
                    ne *= ggml_type_size(src0->type) / 2;
                }
            }
-            // copy_to_quant has block size of 32, and each thread does QUANT_K elements.
-            // Splitting into 512x512xZ wouldn't work well since each workgroup does 1024 elements.
-            // So divide by block size here before splitting into 512x512 groups.
-            if (op == GGML_OP_CPY && !ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
-                ne = CEIL_DIV(ne, ggml_blck_size(dst->type));
-            }
            if (ne > 262144) {
                elements = { 512, 512, CEIL_DIV(ne, 262144) };
            } else if (ne > 512) {
@@ -7169,25 +7075,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
                elements = { ne, 1, 1 };
            }
        } break;
-    case GGML_OP_SET_ROWS:
-        {
-            uint32_t ne = ggml_nelements(src0);
-            if (ggml_is_quantized(dst->type)) {
-                // quants run 32 threads each doing QUANT_K elements
-                ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type));
-            } else {
-                // scalar types do one element per thread, running 512 threads
-                ne = CEIL_DIV(ne, 512);
-            }
-            if (ne > 262144) {
-                elements = { 512, 512, CEIL_DIV(ne, 262144) };
-            } else if (ne > 512) {
-                elements = { 512, CEIL_DIV(ne, 512), 1 };
-            } else {
-                elements = { ne, 1, 1 };
-            }
-        }
-        break;
    default:
        elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
        break;
@@ -7597,21 +7484,14 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co

 static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);

-    float sf0 = (float)dst->ne[0] / src0->ne[0];
-    float sf1 = (float)dst->ne[1] / src0->ne[1];
-    float sf2 = (float)dst->ne[2] / src0->ne[2];
-    float sf3 = (float)dst->ne[3] / src0->ne[3];
-
-    if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) {
-        sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
-        sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
-    }
+    const float sf0 = (float)dst->ne[0] / src0->ne[0];
+    const float sf1 = (float)dst->ne[1] / src0->ne[1];
+    const float sf2 = (float)dst->ne[2] / src0->ne[2];
+    const float sf3 = (float)dst->ne[3] / src0->ne[3];

    ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
        (uint32_t)ggml_nelements(dst), 0, 0,
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1],
        (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
        (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
        sf0, sf1, sf2, sf3,
@@ -7619,64 +7499,123 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
 }

 static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
-    p.param1 = ggml_get_op_params_f32(dst, 0);
-    p.param2 = ggml_get_op_params_f32(dst, 1);
+    float * op_params = (float *)dst->op_params;
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);

-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        op_params[0], 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
-    p.param1 = ggml_get_op_params_f32(dst, 0);
-    p.param2 = ggml_get_op_params_f32(dst, 1);
+    float * op_params = (float *)dst->op_params;
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);

-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        op_params[0], op_params[1],
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
-}
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);

-static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
-    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
-    const int32_t s2 = ggml_get_op_params_i32(dst, 2);
-    const int32_t s3 = ggml_get_op_params_i32(dst, 3);
-    const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000);
-    const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000);
-
-    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
-    memcpy(&p.param1, &s01_packed, sizeof(float));
-    memcpy(&p.param2, &s23_packed, sizeof(float));
-
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }

 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
    uint32_t ne = (uint32_t)ggml_nelements(src0);
    if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
        // Convert from number of logical elements to 2- or 4-byte units.
@@ -7688,22 +7627,13 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
        }
    }

-    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
-}
-
-static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t src1_type_size = ggml_type_size(src1->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
-    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, {
-        (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
+        ne,
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
        0,
-        0.0f, 0.0f, 0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    }, dryrun);
 }

@@ -9026,9 +8956,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
-    case GGML_OP_ROLL:
    case GGML_OP_CPY:
-    case GGML_OP_SET_ROWS:
    case GGML_OP_CONT:
    case GGML_OP_DUP:
    case GGML_OP_SILU_BACK:
@@ -9095,7 +9023,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
        case GGML_OP_CLAMP:
        case GGML_OP_PAD:
        case GGML_OP_CPY:
-        case GGML_OP_SET_ROWS:
        case GGML_OP_CONT:
        case GGML_OP_DUP:
        case GGML_OP_SILU_BACK:
@@ -9198,20 +9125,12 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
    case GGML_OP_PAD:
        ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);

-        break;
-    case GGML_OP_ROLL:
-        ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun);
-
        break;
    case GGML_OP_CPY:
    case GGML_OP_CONT:
    case GGML_OP_DUP:
        ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun);

-        break;
-    case GGML_OP_SET_ROWS:
-        ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node, dryrun);
-
        break;
    case GGML_OP_SILU_BACK:
        ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -9426,9 +9345,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
-    case GGML_OP_ROLL:
    case GGML_OP_CPY:
-    case GGML_OP_SET_ROWS:
    case GGML_OP_CONT:
    case GGML_OP_DUP:
    case GGML_OP_SILU_BACK:
@@ -10494,20 +10411,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            } break;
        case GGML_OP_SET_ROWS:
            {
-                switch (op->type) {
-                    case GGML_TYPE_F32:
-                    case GGML_TYPE_F16:
-                    case GGML_TYPE_BF16:
-                    case GGML_TYPE_Q4_0:
-                    case GGML_TYPE_Q4_1:
-                    case GGML_TYPE_Q5_0:
-                    case GGML_TYPE_Q5_1:
-                    case GGML_TYPE_Q8_0:
-                    case GGML_TYPE_IQ4_NL:
-                        return true;
-                    default:
-                        return false;
-                }
+                // TODO: add support
+                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
+                return false;
            } break;
        case GGML_OP_CONT:
        case GGML_OP_CPY:
@@ -10593,12 +10499,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_CLAMP:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_UPSCALE:
+            return op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_ACC:
        case GGML_OP_CONCAT:
        case GGML_OP_SCALE:
        case GGML_OP_PAD:
-        case GGML_OP_ROLL:
        case GGML_OP_DIAG_MASK_INF:
+            return true;
        case GGML_OP_SOFT_MAX:
        case GGML_OP_SOFT_MAX_BACK:
        case GGML_OP_ARGSORT:
@@ -11121,8 +11028,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
        } else {
            tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
        }
-    } else if (tensor->op == GGML_OP_SET_ROWS) {
-        tensor_clone = ggml_set_rows(ggml_ctx, src_clone[0], src_clone[1]);
    } else if (tensor->op == GGML_OP_CONT) {
        tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    } else if (tensor->op == GGML_OP_RESHAPE) {
@@ -6,25 +6,17 @@ spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bi
 #endif // RTE16

 #include "types.comp"
+#include "generic_unary_head.comp"

-#if defined(SET_ROWS) && QUANT_K == 1
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-const uint BLOCK_SIZE = 512;
+#if defined(DATA_A_IQ4_NL)
+// 16 invocations needed for init_iq4nl_shmem
+layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
 #else
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-const uint BLOCK_SIZE = 32;
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 #endif

 layout (binding = 0) readonly buffer S {float data_s[];};
-
-#if defined(SET_ROWS)
-#include "generic_binary_head.comp"
-layout (binding = 1) readonly buffer C {uvec2 data_i[];};
-layout (binding = 2) writeonly buffer Q {A_TYPE data_q[];};
-#else
-#include "generic_unary_head.comp"
 layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];};
-#endif

 #if defined(DATA_A_Q4_0)
 void quantize(uint dst_idx, uint src_idx)
@@ -229,56 +221,15 @@ void quantize(uint dst_idx, uint src_idx)
 }
 #endif

-#if defined(DATA_A_F32) || defined(DATA_A_F16)
-void quantize(uint dst_idx, uint src_idx)
-{
-    data_q[dst_idx] = A_TYPE(data_s[src_idx]);
-}
-#endif
-
-#if defined(DATA_A_BF16)
-void quantize(uint dst_idx, uint src_idx)
-{
-    data_q[dst_idx] = A_TYPE(fp32_to_bf16(data_s[src_idx]));
-}
-#endif
-
-#if defined(SET_ROWS)
-
 void main() {
 #ifdef NEEDS_INIT_IQ_SHMEM
    init_iq_shmem(gl_WorkGroupSize);
-#endif
-
-    const uint idx = ((gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x) * BLOCK_SIZE + gl_LocalInvocationID.x) * QUANT_K;
-
-    if (idx >= p.ne) {
+    if (gl_LocalInvocationIndex.x != 0) {
        return;
    }
-
-    uint i00, i01, i02, i03;
-    get_indices(idx, i00, i01, i02, i03);
-
-    uint i12 = fastmod(i03, p.ne12);
-    uint i11 = fastmod(i02, p.ne11);
-    uint i10 = i01;
-
-    uint i1 = data_i[src1_idx(i10, i11, i12, 0) + get_boffset()].x;
-
-    uint src0_idx = src0_idx(i00, i01, i02, i03) + get_aoffset();
-    uint dst_idx = dst_idx(i00 / QUANT_K, i1, i02, i03) + get_doffset();
-
-    quantize(dst_idx, src0_idx);
-}
-
-#else
-
-void main() {
-#ifdef NEEDS_INIT_IQ_SHMEM
-    init_iq_shmem(gl_WorkGroupSize);
 #endif

-    const uint idx = (gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x) * QUANT_K;
+    const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K;

    if (idx >= p.ne) {
        return;
@@ -289,5 +240,3 @@ void main() {

    quantize(dst_idx, src_idx);
 }
-
-#endif
@@ -2,9 +2,9 @@

 #extension GL_EXT_control_flow_attributes : enable

-layout(constant_id = 0) const uint BLOCK_SIZE = 32;
+#define BLOCK_SIZE 32

-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

 layout (binding = 0) readonly buffer A {float data_a[];};
 layout (binding = 1) writeonly buffer D {float data_d[];};
@@ -16,8 +16,6 @@ layout (push_constant) uniform parameter {
    uint k_num;
 } p;

-shared float tmpsh[BLOCK_SIZE];
-
 void main() {
    // Each workgroup handles a row
    const uint n = gl_WorkGroupID.x;
@@ -34,51 +32,23 @@ void main() {

    // Compute the max m value for the row
    float m_max = -1.0/0.0;
-    for (uint k = 0; k + tid < k_num; k += BLOCK_SIZE) {
-        float m = data_a[m_offset + (k + tid) * lm_stride];
+    [[unroll]] for (uint k = 0; k < k_num; ++k) {
+        float m = data_a[m_offset + k * lm_stride];
        m_max = max(m_max, m);
    }

-    // reduce across the workgroup
-    tmpsh[tid] = m_max;
-    barrier();
-    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
-        if (tid < s) {
-            m_max = max(m_max, tmpsh[tid + s]);
-            tmpsh[tid] = m_max;
-        }
-        barrier();
-    }
-    m_max = tmpsh[0];
-
-    barrier();
-
    // Compute L based on m_max
    float L = 0;
-    for (uint k = 0; k + tid < k_num; k += BLOCK_SIZE) {
-        float l = data_a[l_offset + (k + tid) * lm_stride];
-        float m = data_a[m_offset + (k + tid) * lm_stride];
+    [[unroll]] for (uint k = 0; k < k_num; ++k) {
+        float l = data_a[l_offset + k * lm_stride];
+        float m = data_a[m_offset + k * lm_stride];
        L += exp(m - m_max) * l;
    }

-    // reduce across the workgroup
-    tmpsh[tid] = L;
-    barrier();
-    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
-        if (tid < s) {
-            L += tmpsh[tid + s];
-            tmpsh[tid] = L;
-        }
-        barrier();
-    }
-    L = tmpsh[0];
-
    L = 1.0 / L;

-    // D dimension is split across workgroups in the y dimension
-    uint d = tid + gl_WorkGroupID.y * BLOCK_SIZE;
    // Scale and sum the O contributions based on m_max and store the result to memory
-    if (d < D) {
+    for (uint d = tid; d < D; d += BLOCK_SIZE) {
        float O = 0.0;
        [[unroll]] for (uint k = 0; k < k_num; ++k) {
            uint o_offset = D * N * (k + iq3 * k_num) + D * n + d;
@@ -18,7 +18,6 @@
 #extension GL_KHR_cooperative_matrix : enable
 #extension GL_KHR_memory_scope_semantics : enable
 #extension GL_KHR_shader_subgroup_basic : enable
-#extension GL_KHR_shader_subgroup_ballot : enable
 #endif

 #ifdef MUL_MAT_ID
@@ -105,10 +104,6 @@ shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];

 #ifdef MUL_MAT_ID
 shared u16vec2 row_ids[4096];
-uint _ne1;
-#ifdef COOPMAT
-shared uint _ne1_sh;
-#endif
 #endif // MUL_MAT_ID

 #define NUM_WARPS (BLOCK_SIZE / WARP)
@@ -177,47 +172,7 @@ void main() {
    const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK;

 #ifdef MUL_MAT_ID
-#ifdef COOPMAT
-    // Spread the search across all elements in the first subgroup
-    if (gl_SubgroupID == 0) {
-        _ne1 = 0;
-        uint num_elements = p.nei1 * p.nei0;
-
-        uint ids[16];
-        uint iter = 0;
-
-        for (uint j = 0; j < num_elements; j += gl_SubgroupSize) {
-            // prefetch up to 16 elements
-            if (iter == 0) {
-                [[unroll]] for (uint k = 0; k < 16; ++k) {
-                    uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize;
-                    bool in_range = i < num_elements;
-                    uint ii1 = i / p.nei0;
-                    uint ii0 = i % p.nei0;
-                    ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
-                }
-            }
-            uint i = j + gl_SubgroupInvocationID;
-            bool in_range = i < num_elements;
-            uint ii1 = i / p.nei0;
-            uint ii0 = i % p.nei0;
-            uint id = ids[iter++];
-            uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
-            uint idx = subgroupBallotExclusiveBitCount(ballot);
-            if (in_range && id == expert_idx) {
-                row_ids[_ne1 + idx] = u16vec2(ii0, ii1);
-            }
-            _ne1 += subgroupBallotBitCount(ballot);
-            iter &= 15;
-        }
-        _ne1_sh = _ne1;
-    }
-
-    barrier();
-
-    _ne1 = _ne1_sh;
-#else
-    _ne1 = 0;
+    uint _ne1 = 0;
    for (uint ii1 = 0; ii1 < p.nei1; ii1++) {
        for (uint ii0 = 0; ii0 < p.nei0; ii0++) {
            if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) {
@@ -228,7 +183,6 @@ void main() {
    }

    barrier();
-#endif

    // Workgroup has no work
    if (ic * BN >= _ne1) return;
@@ -162,32 +162,17 @@ void main() {
        _ne1 = 0;
        uint num_elements = p.nei1 * p.nei0;

-        uint ids[16];
-        uint iter = 0;
-
-        for (uint j = 0; j < num_elements; j += gl_SubgroupSize) {
-            // prefetch up to 16 elements
-            if (iter == 0) {
-                [[unroll]] for (uint k = 0; k < 16; ++k) {
-                    uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize;
-                    bool in_range = i < num_elements;
-                    uint ii1 = i / p.nei0;
-                    uint ii0 = i % p.nei0;
-                    ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
-                }
-            }
-            uint i = j + gl_SubgroupInvocationID;
+        for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) {
            bool in_range = i < num_elements;
-            uint ii1 = i / p.nei0;
            uint ii0 = i % p.nei0;
-            uint id = ids[iter++];
+            uint ii1 = i / p.nei0;
+            uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
            uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
            uint idx = subgroupBallotExclusiveBitCount(ballot);
            if (in_range && id == expert_idx) {
                row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0);
            }
            _ne1 += subgroupBallotBitCount(ballot);
-            iter &= 15;
        }
        _ne1_sh = _ne1;
    }
@@ -429,31 +414,17 @@ void main() {
                fetch_scales(ir * BM, pos_a, stride_a, block_k + BK, tid, false);
            }

-            if ((ir + 1) * BM <= p.M && block_k + BK <= end_k) {
-                coopmat<MAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
-                coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
+            coopmat<MAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
+            coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;

-                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
+            coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
 #ifdef MUL_MAT_ID
-                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
+            coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
 #else
-                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
+            coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
 #endif

-                sum = coopMatMulAdd(mat_a, mat_b, sum);
-            } else {
-                coopmat<MAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
-                coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
-
-                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
-#ifdef MUL_MAT_ID
-                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
-#else
-                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
-#endif
-
-                sum = coopMatMulAdd(mat_a, mat_b, sum);
-            }
+            sum = coopMatMulAdd(mat_a, mat_b, sum);
        }

        // Convert from ACC_TYPE to D_TYPE
@@ -1,46 +0,0 @@
-#version 450
-
-#include "types.comp"
-#include "generic_unary_head.comp"
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-uint wrap_idx(int i, uint ne) {
-    if (i < 0) {
-        return i + ne;
-    } else if (i >= ne) {
-        return i - ne;
-    }
-    return i;
-}
-
-void main() {
-    const uint idx = get_idx();
-    if (idx >= p.ne) {
-        return;
-    }
-
-    const uint i3 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
-    const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
-    const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, p.ne1_01L);
-    const uint i2_offset = i2*p.ne11*p.ne10;
-    const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, p.ne1_0L);
-    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
-
-    const uint p1 = floatBitsToUint(p.param1);
-    const uint p2 = floatBitsToUint(p.param2);
-    const int s0 = int(p1 >> 16)    - 0x8000;
-    const int s1 = int(p1 & 0xFFFF) - 0x8000;
-    const int s2 = int(p2 >> 16)    - 0x8000;
-    const int s3 = int(p2 & 0xFFFF) - 0x8000;
-
-    const uint i00 = wrap_idx(int(i0) - s0, p.ne10);
-    const uint i01 = wrap_idx(int(i1) - s1, p.ne11);
-    const uint i02 = wrap_idx(int(i2) - s2, p.ne12);
-    const uint i03 = wrap_idx(int(i3) - s3, p.ne13);
-
-    const uint a_idx = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
-    const uint d_idx = i3 *p.nb13 + i2 *p.nb12 + i1 *p.nb11 + i0 *p.nb10;
-
-    data_d[get_doffset() + d_idx] = D_TYPE(data_a[get_aoffset() + a_idx]);
-}
@@ -14,19 +14,21 @@ void main() {

    const uint row_dst = gl_GlobalInvocationID.x;

+    if (i0 >= p.n_dims) {
+        const uint i = row_dst*ne0 + i0;
+
+        data_d[i + 0] = data_a[i + 0];
+        data_d[i + 1] = data_a[i + 1];
+
+        return;
+    }
+
    const uint row_x     = row_dst % ne1;
    const uint channel_x = row_dst / ne1;

    const uint idst = row_dst*ne0 + i0/2;
    const uint ix   = channel_x*p.s2 + row_x*p.s1 + i0/2;

-    if (i0 >= p.n_dims) {
-        data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
-        data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
-
-        return;
-    }
-
    const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3];
    const int sec_w = p.sections[1] + p.sections[0];
    const uint sector = (i0 / 2) % sect_dims;
@@ -13,19 +13,21 @@ void main() {

    const uint row_dst = gl_GlobalInvocationID.x;

+    if (i0 >= p.n_dims) {
+        const uint i = row_dst*ne0 + i0;
+
+        data_d[i + 0] = data_a[i + 0];
+        data_d[i + 1] = data_a[i + 1];
+
+        return;
+    }
+
    const uint row_x     = row_dst % ne1;
    const uint channel_x = row_dst / ne1;

    const uint idst = row_dst*ne0 + i0/2;
    const uint ix   = channel_x*p.s2 + row_x*p.s1 + i0/2;

-    if (i0 >= p.n_dims) {
-        data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
-        data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
-
-        return;
-    }
-
    const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);

    const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
@@ -13,19 +13,21 @@ void main() {

    const uint row_dst = gl_GlobalInvocationID.x;

+    if (i0 >= p.n_dims) {
+        const uint i = row_dst*ne0 + i0;
+
+        data_d[i + 0] = data_a[i + 0];
+        data_d[i + 1] = data_a[i + 1];
+
+        return;
+    }
+
    const uint row_x     = row_dst % ne1;
    const uint channel_x = row_dst / ne1;

    const uint idst = row_dst*ne0 + i0;
    const uint ix   = channel_x*p.s2 + row_x*p.s1 + i0;

-    if (i0 >= p.n_dims) {
-        data_d[idst + 0] = data_a[ix + 0];
-        data_d[idst + 1] = data_a[ix + 1];
-
-        return;
-    }
-
    const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f);

    const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
@@ -18,7 +18,7 @@ void main() {
            continue;
        }

-        data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1) + FLOAT_TYPE(p.param2));
+        data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
        idx += num_threads;
    }
 }
@@ -3,7 +3,6 @@
 layout (push_constant) uniform parameter
 {
    uint ne; uint a_offset; uint d_offset;
-    uint ne00; uint ne01;
    uint nb00; uint nb01; uint nb02; uint nb03;
    uint ne10; uint ne11; uint ne12; uint ne13;
    float sf0; float sf1; float sf2; float sf3;
@@ -16,61 +15,6 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

-// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag
-#define NEAREST  0
-#define BILINEAR 1
-#define ALIGN_CORNERS (1 << 8)
-
-layout (constant_id = 0) const uint scale_mode = 0;
-
-float fetch_nearest(uint i10, uint i11, uint i12, uint i13) {
-    const uint i00 = uint(i10 / p.sf0);
-    const uint i01 = uint(i11 / p.sf1);
-    const uint i02 = uint(i12 / p.sf2);
-    const uint i03 = uint(i13 / p.sf3);
-
-    return data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00];
-}
-
-float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) {
-    const uint i02 = uint(i12 / p.sf2);
-    const uint i03 = uint(i13 / p.sf3);
-    const uint base = p.a_offset + i03 * p.nb03 + i02 * p.nb02;
-
-    const float v00 = data_a[base + c0.y * p.nb01 + c0.x * p.nb00];
-    const float v01 = data_a[base + c0.y * p.nb01 + c1.x * p.nb00];
-    const float v10 = data_a[base + c1.y * p.nb01 + c0.x * p.nb00];
-    const float v11 = data_a[base + c1.y * p.nb01 + c1.x * p.nb00];
-
-    return
-        v00 * (1.0-d.x) * (1.0-d.y) +
-        v01 * d.x       * (1.0-d.y) +
-        v10 * (1.0-d.x) * d.y +
-        v11 * d.x       * d.y;
-}
-
-float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
-    const ivec2 ne0 = ivec2(p.ne00, p.ne01);
-
-    const vec2 c = (vec2(i10, i11) + 0.5) / vec2(p.sf0, p.sf1) - 0.5;
-    const vec2 c0f = floor(c);
-    const vec2 d = c - c0f;
-    const ivec2 c0 = max(ivec2(c0f), 0);
-    const ivec2 c1 = min(ivec2(c0f + 1), ne0 - 1);
-
-    return fetch_bilinear(c0, c1, d, i12, i13);
-}
-
-float interpolate_bilinear_align_corners(uint i10, uint i11, uint i12, uint i13) {
-    const vec2 c = vec2(i10, i11) / vec2(p.sf0, p.sf1);
-    const vec2 c0f = floor(c);
-    const vec2 d = c - c0f;
-    const ivec2 c0 = ivec2(c0f);
-    const ivec2 c1 = c0 + 1;
-
-    return fetch_bilinear(c0, c1, d, i12, i13);
-}
-
 void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

@@ -83,18 +27,10 @@ void main() {
    const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
    const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;

-    float result;
-    switch (scale_mode) {
-        case NEAREST:
-            result = fetch_nearest(i10, i11, i12, i13);
-            break;
-        case BILINEAR:
-            result = interpolate_bilinear(i10, i11, i12, i13);
-            break;
-        case BILINEAR | ALIGN_CORNERS:
-            result = interpolate_bilinear_align_corners(i10, i11, i12, i13);
-            break;
-    }
+    const uint i00 = uint(i10 / p.sf0);
+    const uint i01 = uint(i11 / p.sf1);
+    const uint i02 = uint(i12 / p.sf2);
+    const uint i03 = uint(i13 / p.sf3);

-    data_d[p.d_offset + idx] = D_TYPE(result);
+    data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
 }
@@ -518,11 +518,6 @@ void process_shaders() {
        string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }

-    for (std::string t : {"f32", "f16", "bf16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
-        string_to_spv("set_rows_" + t, "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-        string_to_spv("set_rows_" + t + "_rte", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
-    }
-
    auto get_type_str = [](bool f16) {
        return f16 ? "float16_t" : "float";
    };
@@ -653,8 +648,6 @@ void process_shaders() {
    string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
    string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));

-    string_to_spv("roll_f32", "roll.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
-
    for (auto &c : compiles) {
        c.wait();
    }
@@ -3069,14 +3069,12 @@ static struct ggml_tensor * ggml_scale_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 s,
-        float                 b,
        bool                  inplace) {
    GGML_ASSERT(ggml_is_padded_1d(a));

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-    float params[2] = { s, b };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, &s, sizeof(s));

    result->op     = GGML_OP_SCALE;
    result->src[0] = a;
@@ -3088,30 +3086,14 @@ struct ggml_tensor * ggml_scale(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 s) {
-    return ggml_scale_impl(ctx, a, s, 0.0, false);
+    return ggml_scale_impl(ctx, a, s, false);
 }

 struct ggml_tensor * ggml_scale_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 s) {
-    return ggml_scale_impl(ctx, a, s, 0.0, true);
-}
-
-struct ggml_tensor * ggml_scale_bias(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float                 s,
-        float                 b) {
-    return ggml_scale_impl(ctx, a, s, b, false);
-}
-
-struct ggml_tensor * ggml_scale_bias_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float                 s,
-        float                 b) {
-    return ggml_scale_impl(ctx, a, s, b, true);
+    return ggml_scale_impl(ctx, a, s, true);
 }

 // ggml_set
@@ -5795,7 +5777,7 @@ static void ggml_compute_backward(
        } break;
        case GGML_OP_MEAN: {
            if (src0_needs_grads) {
-                ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], 0.0, false));
+                ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
            }
        } break;
        case GGML_OP_REPEAT: {
@@ -5872,7 +5854,7 @@ static void ggml_compute_backward(
            if (src0_needs_grads) {
                float s;
                memcpy(&s, tensor->op_params, sizeof(float));
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, 0.0, false));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, false));
            }
        } break;
        case GGML_OP_SET: {
@@ -631,14 +631,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
                gguf_free(ctx);
                return nullptr;
            }
-            size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
-            if (SIZE_MAX - ctx->size < padded_size) {
-                GGML_LOG_ERROR("%s: tensor '%s' size overflow, cannot accumulate size %zu + %zu\n",
-                    __func__, ti.t.name, ctx->size, padded_size);
-                gguf_free(ctx);
-                return nullptr;
-            }
-            ctx->size += padded_size;
+            ctx->size += GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
        }
    }

@@ -187,9 +187,6 @@ class Keys:
    class Classifier:
        OUTPUT_LABELS = "{arch}.classifier.output_labels"

-    class ShortConv:
-        L_CACHE = "{arch}.shortconv.l_cache"
-
    class Tokenizer:
        MODEL                = "tokenizer.ggml.model"
        PRE                  = "tokenizer.ggml.pre"
@@ -291,7 +288,6 @@ class MODEL_ARCH(IntEnum):
    LLAMA4           = auto()
    DECI             = auto()
    FALCON           = auto()
-    FALCON_H1        = auto()
    BAICHUAN         = auto()
    GROK             = auto()
    GPT2             = auto()
@@ -333,7 +329,6 @@ class MODEL_ARCH(IntEnum):
    ARWKV7           = auto()
    MAMBA            = auto()
    MAMBA2           = auto()
-    JAMBA            = auto()
    XVERSE           = auto()
    COMMAND_R        = auto()
    COHERE2          = auto()
@@ -355,7 +350,6 @@ class MODEL_ARCH(IntEnum):
    EXAONE           = auto()
    GRANITE          = auto()
    GRANITE_MOE      = auto()
-    GRANITE_HYBRID   = auto()
    CHAMELEON        = auto()
    WAVTOKENIZER_DEC = auto()
    PLM              = auto()
@@ -363,9 +357,6 @@ class MODEL_ARCH(IntEnum):
    DOTS1            = auto()
    ARCEE            = auto()
    ERNIE4_5         = auto()
-    HUNYUAN_MOE      = auto()
-    SMOLLM3          = auto()
-    LFM2             = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -438,10 +429,7 @@ class MODEL_TENSOR(IntEnum):
    SSM_CONV1D           = auto()
    SSM_X                = auto()
    SSM_DT               = auto()
-    SSM_DT_NORM          = auto()
    SSM_A                = auto()
-    SSM_B_NORM           = auto()
-    SSM_C_NORM           = auto()
    SSM_D                = auto()
    SSM_NORM             = auto()
    SSM_OUT              = auto()
@@ -537,9 +525,6 @@ class MODEL_TENSOR(IntEnum):
    POSNET_ATTN_K        = auto()
    POSNET_ATTN_V        = auto()
    POSNET_ATTN_OUT      = auto()
-    SHORTCONV_CONV       = auto()
-    SHORTCONV_INPROJ     = auto()
-    SHORTCONV_OUTPROJ    = auto()
    # vision
    V_MMPROJ             = auto()
    V_MMPROJ_FC          = auto()
@@ -647,7 +632,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.ARWKV7:           "arwkv7",
    MODEL_ARCH.MAMBA:            "mamba",
    MODEL_ARCH.MAMBA2:           "mamba2",
-    MODEL_ARCH.JAMBA:            "jamba",
    MODEL_ARCH.XVERSE:           "xverse",
    MODEL_ARCH.COMMAND_R:        "command-r",
    MODEL_ARCH.COHERE2:          "cohere2",
@@ -669,7 +653,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.EXAONE:           "exaone",
    MODEL_ARCH.GRANITE:          "granite",
    MODEL_ARCH.GRANITE_MOE:      "granitemoe",
-    MODEL_ARCH.GRANITE_HYBRID:   "granitehybrid",
    MODEL_ARCH.CHAMELEON:        "chameleon",
    MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
    MODEL_ARCH.PLM:              "plm",
@@ -677,10 +660,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.DOTS1:            "dots1",
    MODEL_ARCH.ARCEE:            "arcee",
    MODEL_ARCH.ERNIE4_5:         "ernie4_5",
-    MODEL_ARCH.FALCON_H1:        "falcon-h1",
-    MODEL_ARCH.HUNYUAN_MOE:      "hunyuan-moe",
-    MODEL_ARCH.SMOLLM3:          "smollm3",
-    MODEL_ARCH.LFM2:             "lfm2",
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -753,10 +732,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.SSM_CONV1D:                "blk.{bid}.ssm_conv1d",
    MODEL_TENSOR.SSM_X:                     "blk.{bid}.ssm_x",
    MODEL_TENSOR.SSM_DT:                    "blk.{bid}.ssm_dt",
-    MODEL_TENSOR.SSM_DT_NORM:               "blk.{bid}.ssm_dt_norm",
    MODEL_TENSOR.SSM_A:                     "blk.{bid}.ssm_a",
-    MODEL_TENSOR.SSM_B_NORM:                "blk.{bid}.ssm_b_norm",
-    MODEL_TENSOR.SSM_C_NORM:                "blk.{bid}.ssm_c_norm",
    MODEL_TENSOR.SSM_D:                     "blk.{bid}.ssm_d",
    MODEL_TENSOR.SSM_NORM:                  "blk.{bid}.ssm_norm",
    MODEL_TENSOR.SSM_OUT:                   "blk.{bid}.ssm_out",
@@ -852,9 +828,6 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.POSNET_ATTN_K:             "posnet.{bid}.attn_k",
    MODEL_TENSOR.POSNET_ATTN_V:             "posnet.{bid}.attn_v",
    MODEL_TENSOR.POSNET_ATTN_OUT:           "posnet.{bid}.attn_output",
-    MODEL_TENSOR.SHORTCONV_CONV:            "blk.{bid}.shortconv.conv",
-    MODEL_TENSOR.SHORTCONV_INPROJ:          "blk.{bid}.shortconv.in_proj",
-    MODEL_TENSOR.SHORTCONV_OUTPROJ:         "blk.{bid}.shortconv.out_proj",
    # vision
    MODEL_TENSOR.V_MMPROJ:                  "mm.{bid}",
    MODEL_TENSOR.V_MMPROJ_FC:               "mm.model.fc",
@@ -1759,34 +1732,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.SSM_NORM,
        MODEL_TENSOR.SSM_OUT,
    ],
-    MODEL_ARCH.JAMBA: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.SSM_IN,
-        MODEL_TENSOR.SSM_CONV1D,
-        MODEL_TENSOR.SSM_X,
-        MODEL_TENSOR.SSM_DT,
-        MODEL_TENSOR.SSM_DT_NORM,
-        MODEL_TENSOR.SSM_A,
-        MODEL_TENSOR.SSM_B_NORM,
-        MODEL_TENSOR.SSM_C_NORM,
-        MODEL_TENSOR.SSM_D,
-        MODEL_TENSOR.SSM_OUT,
-        MODEL_TENSOR.FFN_GATE_INP,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.FFN_GATE_EXP,
-        MODEL_TENSOR.FFN_DOWN_EXP,
-        MODEL_TENSOR.FFN_UP_EXP,
-    ],
    MODEL_ARCH.XVERSE: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -2156,36 +2101,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_UP_SHEXP,
        MODEL_TENSOR.FFN_DOWN_SHEXP,
    ],
-    MODEL_ARCH.GRANITE_HYBRID: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.SSM_IN,
-        MODEL_TENSOR.SSM_CONV1D,
-        MODEL_TENSOR.SSM_DT,
-        MODEL_TENSOR.SSM_A,
-        MODEL_TENSOR.SSM_D,
-        MODEL_TENSOR.SSM_NORM,
-        MODEL_TENSOR.SSM_OUT,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        # MoE
-        MODEL_TENSOR.FFN_GATE_INP,
-        MODEL_TENSOR.FFN_GATE_EXP,
-        MODEL_TENSOR.FFN_DOWN_EXP,
-        MODEL_TENSOR.FFN_UP_EXP,
-        MODEL_TENSOR.FFN_GATE_SHEXP,
-        MODEL_TENSOR.FFN_UP_SHEXP,
-        MODEL_TENSOR.FFN_DOWN_SHEXP,
-        # Dense
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
    MODEL_ARCH.CHAMELEON: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -2296,95 +2211,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
-    MODEL_ARCH.FALCON_H1: [
-        # Token embedding
-        MODEL_TENSOR.TOKEN_EMBD,
-
-        # Input layernorm
-        MODEL_TENSOR.ATTN_NORM,
-
-        # Attention components
-        MODEL_TENSOR.ATTN_Q,         # Query projection
-        MODEL_TENSOR.ATTN_K,         # Key projection
-        MODEL_TENSOR.ATTN_V,         # Value projection
-        MODEL_TENSOR.ATTN_OUT,       # Output projection
-
-        # SSM components (Mamba2 specific)
-        MODEL_TENSOR.SSM_IN,         # Input projection for SSM
-        MODEL_TENSOR.SSM_CONV1D,     # Convolution layer
-        MODEL_TENSOR.SSM_DT,         # Delta time projection
-        MODEL_TENSOR.SSM_A,          # A parameter (log form)
-        MODEL_TENSOR.SSM_D,          # D parameter
-        MODEL_TENSOR.SSM_NORM,       # Normalization in SSM
-        MODEL_TENSOR.SSM_OUT,        # Output projection
-
-        # Pre-feedforward layernorm
-        MODEL_TENSOR.FFN_PRE_NORM,
-
-        # Feed-forward network components
-        MODEL_TENSOR.FFN_GATE,       # Gate projection (SwiGLU)
-        MODEL_TENSOR.FFN_DOWN,       # Down projection
-        MODEL_TENSOR.FFN_UP,         # Up projection
-
-        # Post-feedforward layernorm
-        MODEL_TENSOR.OUTPUT_NORM,    # Final layer norm
-        MODEL_TENSOR.OUTPUT,         # Output projection (lm_head)
-    ],
-    MODEL_ARCH.HUNYUAN_MOE: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ROPE_FREQS,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_GATE_INP,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE_EXP,
-        MODEL_TENSOR.FFN_DOWN_EXP,
-        MODEL_TENSOR.FFN_UP_EXP,
-        MODEL_TENSOR.FFN_GATE_SHEXP,
-        MODEL_TENSOR.FFN_DOWN_SHEXP,
-        MODEL_TENSOR.FFN_UP_SHEXP,
-    ],
-    MODEL_ARCH.SMOLLM3: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ROPE_FREQS,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.ATTN_ROT_EMBD,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
-    MODEL_ARCH.LFM2: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.TOKEN_EMBD_NORM,
-        MODEL_TENSOR.SHORTCONV_CONV,
-        MODEL_TENSOR.SHORTCONV_INPROJ,
-        MODEL_TENSOR.SHORTCONV_OUTPROJ,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.ATTN_NORM, # operator_norm
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-    ],
    # TODO
 }

@@ -648,9 +648,6 @@ class GGUFWriter:
    def add_convnext_block_count(self, length: int) -> None:
        self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)

-    def add_shortconv_l_cache(self, length: int) -> None:
-        self.add_uint32(Keys.ShortConv.L_CACHE.format(arch=self.arch), length)
-
    def add_block_count(self, length: int) -> None:
        self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)

@@ -13,7 +13,7 @@ class TensorNameMap:
            "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
            "transformer.word_embeddings",               # falcon
            "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 granite-hybrid
+            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414
            "tok_embeddings",                            # llama-pth
            "embeddings.word_embeddings",                # bert nomic-bert
            "language_model.embedding.word_embeddings",  # persimmon
@@ -50,7 +50,6 @@ class TensorNameMap:
            "model.pre_ln",               # rwkv7
            "model.layers.0.pre_norm",    # rwkv7
            "backbone.norm",              # wavtokenizer
-            "model.embedding_norm",       # lfm2
        ),

        # Position embeddings
@@ -119,7 +118,7 @@ class TensorNameMap:
            "transformer.h.{bid}.input_layernorm",                  # falcon7b
            "h.{bid}.input_layernorm",                              # bloom
            "transformer.h.{bid}.ln_mlp",                           # falcon40b
-            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron olmoe phimoe granite-hybrid
+            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron olmoe phimoe
            "layers.{bid}.attention_norm",                          # llama-pth
            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
            "model.layers.{bid}.ln1",                               # yi
@@ -137,7 +136,6 @@ class TensorNameMap:
            "model.layers.{bid}.ln1",                               # rwkv7
            "model.layers.{bid}.input_layernorm",                   # llama4
            "transformer_encoder.{bid}.attention_norm",             # neobert
-            "model.layers.{bid}.operator_norm",                     # lfm2
        ),

        # Attention norm 2
@@ -222,7 +220,6 @@ class TensorNameMap:
            "transformer.h.{bid}.self_attention.dense",                     # falcon
            "h.{bid}.self_attention.dense",                                 # bloom
            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe olmo2 phimoe
-            "model.layers.{bid}.self_attn.out_proj",                        # lfm2
            "model.layers.{bid}.self_attn.linear_attn",                     # deci
            "layers.{bid}.attention.wo",                                    # llama-pth
            "encoder.layer.{bid}.attention.output.dense",                   # bert
@@ -282,8 +279,6 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.rms_norm_2",                    # Grok
            "encoder.layers.{bid}.post_attention_layernorm",                 # chatglm
            "transformer.layers.{bid}.ffn_norm",                             # openelm
-            "model.layers.{bid}.pre_ff_layernorm",                           # jamba granite-hybrid
-            "model.layers.{bid}.pre_moe_layernorm",                          # mini-jamba
            "model.layers.{bid}.post_attention_layernorm",                   # llama4
            "transformer_encoder.{bid}.ffn_norm",                            # neobert
        ),
@@ -291,14 +286,12 @@ class TensorNameMap:
        # Post feed-forward norm
        MODEL_TENSOR.FFN_PRE_NORM: (
            "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
-            "model.layers.{bid}.pre_ff_layernorm.weight",
        ),

        # Post feed-forward norm
        MODEL_TENSOR.FFN_POST_NORM: (
            "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
            "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
-            "model.layers.{bid}.feed_forward.up_proj",
        ),

        MODEL_TENSOR.FFN_GATE_INP: (
@@ -308,9 +301,8 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.router",           # Grok
            "transformer.blocks.{bid}.ffn.router.layer",        # dbrx
            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
-            "model.layers.{bid}.feed_forward.router",           # llama4 jamba
+            "model.layers.{bid}.feed_forward.router",           # llama4
            "encoder.layers.{bid}.mlp.router.layer",            # nomic-bert-moe
-            "model.layers.{bid}.mlp.gate.wg",                   # hunyuan
        ),

        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -352,7 +344,7 @@ class TensorNameMap:
            "model.layers.{bid}.residual_mlp.w3",                     # arctic
            "encoder.layers.{bid}.mlp.dense_h_to_4h",                 # chatglm
            "transformer.h.{bid}.mlp.c_fc_1",                         # exaone
-            "model.layers.{bid}.feed_forward.up_proj",                # llama4 jamba granite-hybrid
+            "model.layers.{bid}.feed_forward.up_proj",                # llama4
            "transformer_encoder.{bid}.ffn.w12",                      # neobert
        ),

@@ -370,8 +362,6 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.shared_expert.up_proj",          # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.up_proj",         # deepseek deepseek2
            "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
-            "model.layers.{bid}.feed_forward.down_proj",
-            "model.layers.{bid}.mlp.shared_mlp.up_proj",             # hunyuan
        ),

        # AWQ-activation gate
@@ -392,7 +382,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.linear_1",           # refact
            "model.layers.{bid}.residual_mlp.w1",         # arctic
            "transformer.h.{bid}.mlp.c_fc_0",             # exaone
-            "model.layers.{bid}.feed_forward.gate_proj",  # llama4 jamba granite-hybrid
+            "model.layers.{bid}.feed_forward.gate_proj",  # llama4
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (
@@ -408,7 +398,6 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.shared_expert.gate_proj",          # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.gate_proj",         # deepseek deepseek2
            "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
-            "model.layers.{bid}.mlp.shared_mlp.gate_proj",             # hunyuan
        ),

        # Feed-forward down
@@ -438,7 +427,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.mlp.down_layer",                     # jina-bert-v2
            "encoder.layers.{bid}.mlp.dense_4h_to_h",                 # chatglm
            "model.layers.h.{bid}.mlp.c_proj",                        # exaone
-            "model.layers.{bid}.feed_forward.down_proj",              # llama4 jamba granite-hybrid
+            "model.layers.{bid}.feed_forward.down_proj",              # llama4
            "transformer_encoder.{bid}.ffn.w3",                       # neobert
        ),

@@ -458,13 +447,11 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.shared_experts.down_proj",         # deepseek deepseek2
            "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
            "model.layers.{bid}.shared_mlp.output_linear",             # granitemoe
-            "model.layers.{bid}.mlp.shared_mlp.down_proj",             # hunyuan
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
            "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.query_layernorm",                   # hunyuan
            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo2
            "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
            "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
@@ -474,7 +461,6 @@ class TensorNameMap:
        MODEL_TENSOR.ATTN_K_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
            "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.key_layernorm",                     # hunyuan
            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo2
            "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
            "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
@@ -559,64 +545,42 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.SSM_IN: (
-            "model.layers.{bid}.in_proj",           # mamba-hf
-            "backbone.layers.{bid}.mixer.in_proj",  # mamba
-            "model.layers.{bid}.mamba.in_proj",     # jamba falcon-h1 granite-hybrid
+            "model.layers.{bid}.in_proj",
+            "backbone.layers.{bid}.mixer.in_proj",
        ),

        MODEL_TENSOR.SSM_CONV1D: (
-            "model.layers.{bid}.conv1d",           # mamba-hf
-            "backbone.layers.{bid}.mixer.conv1d",  # mamba
-            "model.layers.{bid}.mamba.conv1d",     # jamba falcon-h1 granite-hybrid
+            "model.layers.{bid}.conv1d",
+            "backbone.layers.{bid}.mixer.conv1d",
        ),

        MODEL_TENSOR.SSM_X: (
-            "model.layers.{bid}.x_proj",           # mamba-hf
-            "backbone.layers.{bid}.mixer.x_proj",  # mamba
-            "model.layers.{bid}.mamba.x_proj",     # jamba
+            "model.layers.{bid}.x_proj",
+            "backbone.layers.{bid}.mixer.x_proj",
        ),

        MODEL_TENSOR.SSM_DT: (
-            "model.layers.{bid}.dt_proj",           # mamba-hf
-            "backbone.layers.{bid}.mixer.dt_proj",  # mamba
-            "model.layers.{bid}.mamba.dt_proj",     # jamba falcon-h1 granite-hybrid
-        ),
-
-        MODEL_TENSOR.SSM_DT_NORM: (
-            "model.layers.{bid}.mamba.dt_layernorm",  # jamba
+            "model.layers.{bid}.dt_proj",
+            "backbone.layers.{bid}.mixer.dt_proj",
        ),

        MODEL_TENSOR.SSM_A: (
-            "model.layers.{bid}.A_log",           # mamba-hf
-            "backbone.layers.{bid}.mixer.A_log",  # mamba
-            "model.layers.{bid}.mamba.A_log",     # jamba falcon-h1 granite-hybrid
-        ),
-
-        MODEL_TENSOR.SSM_B_NORM: (
-            "model.layers.{bid}.mamba.b_layernorm",  # jamba
-            "model.layers.{bid}.mamba.B_layernorm",  # mini-jamba
-        ),
-
-        MODEL_TENSOR.SSM_C_NORM: (
-            "model.layers.{bid}.mamba.c_layernorm",  # jamba
-            "model.layers.{bid}.mamba.C_layernorm",  # mini-jamba
+            "model.layers.{bid}.A_log",
+            "backbone.layers.{bid}.mixer.A_log",
        ),

        MODEL_TENSOR.SSM_D: (
-            "model.layers.{bid}.D",           # mamba-hf
-            "backbone.layers.{bid}.mixer.D",  # mamba
-            "model.layers.{bid}.mamba.D",     # jamba falcon-h1 granite-hybrid
+            "model.layers.{bid}.D",
+            "backbone.layers.{bid}.mixer.D",
        ),

        MODEL_TENSOR.SSM_NORM: (
-            "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
            "backbone.layers.{bid}.mixer.norm",  # mamba2
        ),

        MODEL_TENSOR.SSM_OUT: (
-            "model.layers.{bid}.out_proj",           # mamba-hf
-            "backbone.layers.{bid}.mixer.out_proj",  # mamba
-            "model.layers.{bid}.mamba.out_proj",     # jamba falcon-h1 granite-hybrid
+            "model.layers.{bid}.out_proj",
+            "backbone.layers.{bid}.mixer.out_proj",
        ),

        MODEL_TENSOR.TIME_MIX_W0: (
@@ -1018,18 +982,6 @@ class TensorNameMap:
            "backbone.posnet.{bid}.proj_out", # wavtokenizer
        ),

-        MODEL_TENSOR.SHORTCONV_CONV: (
-            "model.layers.{bid}.conv.conv",
-        ),
-
-        MODEL_TENSOR.SHORTCONV_INPROJ: (
-            "model.layers.{bid}.conv.in_proj",
-        ),
-
-        MODEL_TENSOR.SHORTCONV_OUTPROJ: (
-            "model.layers.{bid}.conv.out_proj",
-        ),
-
        #############################################################################
        ## Vision encoder

@@ -79,6 +79,46 @@ extern "C" {
        LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
    };

+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
+        LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
+        LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
+        LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
+        LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
+        LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
+        LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
+        LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
+        LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
+        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
+        LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
+        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
+        LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
+        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
+        LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
+        LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
+        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
+        LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
+        LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
+    };
+
    enum llama_rope_type {
        LLAMA_ROPE_TYPE_NONE   = -1,
        LLAMA_ROPE_TYPE_NORM   = 0,
@@ -1,34 +0,0 @@
-{%- if not add_generation_prompt is defined -%}
-    {%- set add_generation_prompt = true -%}
-{%- endif -%}
-{%- set ns = namespace(system_prompt='') -%}
-{%- for message in messages -%}
-    {%- if message['role'] == 'system' -%}
-        {%- set ns.system_prompt = message['content'] -%}
-    {%- endif -%}
-{%- endfor -%}
-{{bos_token}}
-{%- if ns.system_prompt != '' -%}
-{{- 'System: ' + ns.system_prompt + '\n\n' -}}
-{%- endif -%}
-{%- for message in messages -%}
-    {%- if message['role'] == 'user' -%}
-        {{- 'User: ' + message['content']|trim + '\n\n' -}}
-    {%- endif -%}
-    {%- if message['role'] == 'assistant' and message['content'] is  not none -%}
-        {%- set content = message['content'] -%}
-        {%- if '</think>' in content -%}
-            {%- set content = content.split('</think>')[-1] -%}
-        {%- endif -%}
-        {{- 'Assistant: ' + content|trim + '\n\n' -}}
-    {%- endif -%}
-{%- endfor -%}
-{%- if add_generation_prompt -%}
-    {{- 'Assistant:' -}}
-    {%- if enable_thinking is defined and enable_thinking is false %}
-        {{- ' <think>\n</think>' }}
-    {%- endif %}
-    {%- if enable_thinking is defined and enable_thinking is true %}
-        {{- ' <think>' }}
-    {%- endif %}
-{%- endif -%}
@@ -3,7 +3,6 @@
 -r ../tools/server/tests/requirements.txt

 -r ./requirements-compare-llama-bench.txt
-r ./requirements-server-bench.txt
 -r ./requirements-pydantic.txt
 -r ./requirements-test-tokenizer-random.txt

@@ -1,5 +0,0 @@
-datasets~=3.2.0
-matplotlib~=3.10.0
-numpy~=1.26.4
-requests~=2.32.3
-tqdm~=4.67.1
@@ -1,196 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This script parses docs/ops/*.csv and creates the ops.md, which is a table documenting supported operations on various ggml backends.
-"""
-import csv
-import logging
-import sys
-from pathlib import Path
-from collections import defaultdict
-
-
-class DocsGenerator:
-    def __init__(self, ggml_root: str, output_filename: str = "ops.md"):
-        self.ggml_root = Path(ggml_root)
-        self.ops_dir = self.ggml_root / "docs" / "ops"
-        self.output_filename = output_filename
-        self.backend_support: dict[str, dict[str, list[bool]]] = defaultdict(
-            lambda: defaultdict(list)
-        )
-        self.all_operations: set[str] = set()
-        self.all_backends: set[str] = set()
-        self.logger = logging.getLogger(__name__)
-
-    def parse_support_files(self) -> None:
-        if not self.ops_dir.exists():
-            self.logger.warning(f"ops directory not found: {self.ops_dir}")
-            return
-
-        self.logger.info(f"Parsing support files from {self.ops_dir}...")
-
-        for support_file in self.ops_dir.glob("*.csv"):
-            self.logger.info(f"  Reading: {support_file.name}")
-            self._parse_support_file(support_file)
-
-    def _parse_support_file(self, file_path: Path) -> None:
-        try:
-            with open(file_path, "r", newline='') as f:
-                reader = csv.DictReader(f)
-
-                for row in reader:
-                    # Skip rows that don't have support mode
-                    if row.get('test_mode') != 'support':
-                        continue
-
-                    backend_name = row.get('backend_name', '').strip()
-                    operation = row.get('op_name', '').strip()
-                    supported_str = row.get('error_message', '').strip()  # "yes" or "no"
-                    backend_reg_name = row.get('backend_reg_name', '').strip()
-
-                    # Skip invalid or error operations
-                    if not operation or not backend_name or operation in [
-                        "CONTEXT_ERROR",
-                        "BUILD_ERROR",
-                    ]:
-                        continue
-
-                    is_supported = supported_str.lower() == "yes"
-
-                    # Use backend_reg_name for grouping, fallback to backend_name
-                    backend_key = backend_reg_name if backend_reg_name else backend_name
-
-                    self.all_backends.add(backend_key)
-                    self.backend_support[backend_key][operation].append(is_supported)
-                    self.all_operations.add(operation)
-
-        except Exception as e:
-            self.logger.error(f"    Error parsing {file_path}: {e}")
-
-    def get_backend_support_status(self, backend: str, operation: str) -> str:
-        support_list = self.backend_support[backend].get(operation, [])
-
-        if not support_list:
-            return "unsupported"
-
-        all_supported = all(support_list)
-        any_supported = any(support_list)
-
-        if all_supported:
-            return "supported"
-        elif any_supported:
-            return "partially supported"
-        else:
-            return "unsupported"
-
-    def get_support_status(self, operation: str) -> str:
-        if operation not in self.all_operations:
-            return "unsupported"
-
-        support_count = 0
-        total_backends = len(self.all_backends)
-
-        for backend in self.all_backends:
-            if self.backend_support[backend].get(operation, False):
-                support_count += 1
-
-        if support_count == 0:
-            return "unsupported"
-        elif support_count == total_backends:
-            return "supported"
-        else:
-            return "partially supported"
-
-    def get_support_symbol(self, status: str) -> str:
-        symbols = {"supported": "✅", "partially supported": "🟡", "unsupported": "❌"}
-        return symbols.get(status, "❓")
-
-    def generate_markdown(self) -> str:
-        lines = []
-
-        lines.append("# GGML Operations")
-        lines.append("")
-        lines.append("List of GGML operations and backend support status.")
-        lines.append("")
-        lines.append("Legend:")
-        lines.append("- ✅ Fully supported by this backend")
-        lines.append("- 🟡 Partially supported by this backend")
-        lines.append("- ❌ Not supported by this backend")
-        lines.append("")
-
-        backends = sorted(self.all_backends)
-        header = "| Operation |"
-        for backend in backends:
-            header += f" {backend} |"
-
-        separator = "|-----------|"
-        for _ in backends:
-            separator += "------|"
-
-        lines.append(header)
-        lines.append(separator)
-
-        sorted_operations = sorted(self.all_operations)
-
-        for operation in sorted_operations:
-            row = f"| {operation:>32} |"
-
-            for backend in backends:
-                status = self.get_backend_support_status(backend, operation)
-                if status == "supported":
-                    symbol = "✅"
-                elif status == "partially supported":
-                    symbol = "🟡"
-                else:
-                    symbol = "❌"
-                row += f" {symbol} |"
-
-            lines.append(row)
-
-        lines.append("")
-
-        return "\n".join(lines)
-
-    def run(self) -> None:
-        self.logger.info("Parsing GGML operation support files...")
-        self.parse_support_files()
-
-        if not self.all_operations:
-            self.logger.error(
-                "No operations found. Make sure to run test-backend-ops support --output csv > docs/ops/file.csv first."
-            )
-            return
-
-        self.logger.info(
-            f"Found {len(self.all_operations)} operations across {len(self.all_backends)} backends"
-        )
-
-        self.logger.info("Generating markdown...")
-        markdown_content = self.generate_markdown()
-
-        docs_dir = self.ggml_root / "docs"
-        docs_dir.mkdir(exist_ok=True)
-
-        ops_file = docs_dir / self.output_filename
-        with open(ops_file, "w") as f:
-            f.write(markdown_content)
-
-        self.logger.info(f"Generated: {ops_file}")
-        self.logger.info(f"Operations: {len(self.all_operations)}")
-        self.logger.info(f"Backends: {len(self.all_backends)}")
-
-
-def main():
-    logging.basicConfig(level=logging.INFO)
-
-    if len(sys.argv) > 1:
-        output_filename = sys.argv[1]
-    else:
-        output_filename = "ops.md"
-
-    generator = DocsGenerator(".", output_filename)
-    generator.run()
-
-
-if __name__ == "__main__":
-    main()
@@ -1,210 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import json
-import subprocess
-from time import sleep, time
-from typing import Optional
-
-import datasets
-import logging
-import matplotlib.pyplot as plt
-import numpy as np
-import requests
-from tqdm.contrib.concurrent import thread_map
-
-
-logging.basicConfig(level=logging.INFO, format='%(message)s')
-logger = logging.getLogger("server-bench")
-
-
-def get_prompts(n_prompts: int) -> list[str]:
-    logger.info("Loading MMLU dataset...")
-    ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]  # type: ignore
-    if n_prompts >= 0:
-        ret = ret[:n_prompts]
-    return ret
-
-
-def get_server(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int) -> dict:
-    logger.info("Starting the llama.cpp server...")
-    address = f"http://localhost:{port}"
-
-    popen_args: list[str] = [
-        path_server,
-        "--flash-attn",
-        "--n-gpu-layers", str(n_gpu_layers),
-        "--parallel", str(parallel),
-        "--ctx-size", str(parallel * ctx_size),
-        "--model", path_model,
-        "--port", str(port),
-        "--swa-full",  # FIXME performance bad otherwise
-        # "--attn-streams",
-    ]
-    fout = open("bench.log", "w") if path_log is not None else subprocess.DEVNULL
-    process = subprocess.Popen(popen_args, stdout=fout, stderr=subprocess.STDOUT)
-
-    n_failures: int = 0
-    while True:
-        try:
-            sleep(1.0)
-            exit_code = process.poll()
-            if exit_code is not None:
-                raise RuntimeError(f"llama.cpp server for {path_model} exited unexpectedly with exit code {exit_code}")
-            response = requests.get(f"{address}/health")
-            if response.status_code == 200:
-                break
-        except requests.ConnectionError:
-            n_failures += 1
-            if n_failures >= 10:
-                raise RuntimeError(f"llama.cpp server for {path_model} is not healthy after 10 seconds")
-
-    return {"process": process, "address": address, "fout": fout}
-
-
-def get_prompt_length(data: dict) -> int:
-    session = data["session"]
-    server_address: str = data["server_address"]
-
-    response = session.post(
-        f"{server_address}/apply-template",
-        json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
-    )
-    if response.status_code != 200:
-        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
-    prompt: str = json.loads(response.text)["prompt"]
-    response = session.post(
-        f"{server_address}/tokenize",
-        json={"content": prompt, "add_special": True}
-    )
-    if response.status_code != 200:
-        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
-    tokens: list[str] = json.loads(response.text)["tokens"]
-    return len(tokens)
-
-
-def send_prompt(data: dict) -> tuple[float, list[float]]:
-    session = data["session"]
-    server_address: str = data["server_address"]
-
-    response = session.post(
-        f"{server_address}/apply-template",
-        json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
-    )
-    if response.status_code != 200:
-        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
-    prompt: str = json.loads(response.text)["prompt"]
-
-    json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
-    response = session.post(f"{server_address}/completion", json=json_data, stream=True)
-
-    last_valid_line: str = ""
-    token_arrival_times: list[float] = []
-    for line in response.iter_lines(decode_unicode=True):
-        if not line.startswith("data: "):
-            continue
-        last_valid_line = line
-        token_arrival_times.append(time())
-    token_arrival_times = token_arrival_times[:-1]
-
-    if response.status_code != 200:
-        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
-    timings: dict = json.loads(last_valid_line[6:])["timings"]
-
-    return (timings["prompt_ms"], token_arrival_times)
-
-
-def benchmark(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int, n_prompts: int, n_predict: int):
-    num_workers: int = parallel + 1
-    prompts: list[str] = get_prompts(n_prompts)
-
-    server: Optional[dict] = None
-    session = None
-    try:
-        server = get_server(path_server, path_model, path_log, port, n_gpu_layers, parallel, ctx_size)
-        server_address: str = server["address"]
-
-        adapter = requests.adapters.HTTPAdapter(pool_connections=num_workers, pool_maxsize=num_workers)  # type: ignore
-        session = requests.Session()
-        session.mount("http://", adapter)
-        session.mount("https://", adapter)
-
-        data: list[dict] = []
-        for i, p in enumerate(prompts):
-            data.append({"session": session, "server_address": server_address, "prompt": p, "n_predict": n_predict, "seed": i})
-
-        logger.info("Getting the prompt lengths...")
-        prompt_n = [get_prompt_length(d) for d in data]
-
-        logger.info("Starting the benchmark...\n")
-        t0 = time()
-        results: list[tuple[int, list[float]]] = thread_map(send_prompt, data, max_workers=num_workers, chunksize=1)
-    finally:
-        if server is not None:
-            server["process"].terminate()
-            server["process"].wait()
-        if session is not None:
-            session.close()
-
-    prompt_ms = []
-    token_t = []
-    depth_sum: int = 0
-    for pn, (pms, tat) in zip(prompt_n, results):
-        prompt_ms.append(pms)
-        token_t += tat
-        n_tokens: int = len(tat)
-        depth_sum += n_tokens * pn
-        depth_sum += n_tokens * (n_tokens + 1) // 2
-    prompt_n = np.array(prompt_n, dtype=np.int64)
-    prompt_ms = np.array(prompt_ms, dtype=np.float64)
-    token_t = np.array(token_t, dtype=np.float64)
-
-    token_t -= t0
-    token_t_last = np.max(token_t)
-
-    logger.info("")
-    logger.info(f"Benchmark duration:                {token_t_last:.2f} s")
-    logger.info(f"Request throughput:                {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last/60):.2f} requests/min")
-    logger.info(f"Total prompt length:               {np.sum(prompt_n)} tokens")
-    logger.info(f"Average prompt length:             {np.mean(prompt_n):.2f} tokens")
-    logger.info(f"Average prompt latency:            {np.mean(prompt_ms):.2f} ms")
-    logger.info(f"Average prompt speed:              {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
-    logger.info(f"Total generated tokens:            {token_t.shape[0]}")
-    logger.info(f"Average generation depth:          {depth_sum / token_t.shape[0]:.2f} tokens")
-    logger.info(f"Average total generation speed:    {token_t.shape[0] / token_t_last:.2f} tokens/s")
-    logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")
-
-    plt.figure()
-    plt.scatter(prompt_n, prompt_ms, s=10.0, marker=".", alpha=0.25)
-    plt.xlim(0, 1.05 * np.max(prompt_n))
-    plt.ylim(0, 1.05 * np.max(prompt_ms))
-    plt.title(path_model)
-    plt.xlabel("Prompt length [tokens]")
-    plt.ylabel("Time to first token [ms]")
-    plt.savefig("prompt_time.png", dpi=240)
-
-    bin_max = np.ceil(token_t_last) + 1
-    plt.figure()
-    plt.hist(token_t, np.arange(0, bin_max))
-    plt.xlim(0, bin_max + 1)
-    plt.title(path_model)
-    plt.xlabel("Time [s]")
-    plt.ylabel("Num. tokens generated per second")
-    plt.savefig("gen_rate.png", dpi=240)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Tool for benchmarking the throughput of the llama.cpp HTTP server. "
-        "Results are printed to console and visualized as plots (saved to current working directory).")
-    parser.add_argument("--path_server", type=str, default="llama-server", help="Path to the llama.cpp server binary")
-    parser.add_argument("--path_model", type=str, required=True, help="Path to the model to use for the benchmark")
-    parser.add_argument("--path_log", type=str, default=None, help="Path to the model to use for the benchmark")
-    parser.add_argument("--port", type=int, default=18725, help="Port to use for the server during the benchmark")
-    parser.add_argument("--n_gpu_layers", type=int, default=999, help="Number of GPU layers for the server")
-    parser.add_argument("--parallel", type=int, default=16, help="Number of slots for the server")
-    parser.add_argument("--ctx_size", type=int, default=4096, help="Server context size per slot")
-    parser.add_argument("--n_prompts", type=int, default=1000, help="Number of prompts to evaluate")
-    parser.add_argument("--n_predict", type=int, default=2048, help="Max. number of tokens to predict per prompt")
-    args = parser.parse_args()
-    benchmark(**vars(args))
@@ -1 +1 @@
-d62df60a07ba3deeb85e5cfc9b1ee07645ff35e2
+0405219965324e11a29b6aadfe22a6d66131978f
@@ -46,8 +46,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_STARCODER2,       "starcoder2"       },
    { LLM_ARCH_MAMBA,            "mamba"            },
    { LLM_ARCH_MAMBA2,           "mamba2"           },
-    { LLM_ARCH_JAMBA,            "jamba"            },
-    { LLM_ARCH_FALCON_H1,        "falcon-h1"        },
    { LLM_ARCH_XVERSE,           "xverse"           },
    { LLM_ARCH_COMMAND_R,        "command-r"        },
    { LLM_ARCH_COHERE2,          "cohere2"          },
@@ -73,7 +71,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_ARWKV7,           "arwkv7"           },
    { LLM_ARCH_GRANITE,          "granite"          },
    { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
-    { LLM_ARCH_GRANITE_HYBRID,   "granitehybrid"    },
    { LLM_ARCH_CHAMELEON,        "chameleon"        },
    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
    { LLM_ARCH_PLM,              "plm"              },
@@ -81,9 +78,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_DOTS1,            "dots1"            },
    { LLM_ARCH_ARCEE,            "arcee"            },
    { LLM_ARCH_ERNIE4_5,         "ernie4_5"         },
-    { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"      },
-    { LLM_ARCH_SMOLLM3,          "smollm3"          },
-    { LLM_ARCH_LFM2,             "lfm2"             },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };

@@ -156,6 +150,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
+    { LLM_KV_ATTENTION_LAYER_INDICES,                "%s.attention.layer_indices"                },

    { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
    { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
@@ -189,8 +184,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

-    { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
-
    { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
    { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
    { LLM_KV_TOKENIZER_LIST,                 "tokenizer.ggml.tokens"                   },
@@ -1029,61 +1022,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
        },
    },
-    {
-        LLM_ARCH_JAMBA,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_SSM_IN,          "blk.%d.ssm_in" },
-            { LLM_TENSOR_SSM_CONV1D,      "blk.%d.ssm_conv1d" },
-            { LLM_TENSOR_SSM_X,           "blk.%d.ssm_x" },
-            { LLM_TENSOR_SSM_DT,          "blk.%d.ssm_dt" },
-            { LLM_TENSOR_SSM_DT_NORM,     "blk.%d.ssm_dt_norm" },
-            { LLM_TENSOR_SSM_A,           "blk.%d.ssm_a" },
-            { LLM_TENSOR_SSM_B_NORM,      "blk.%d.ssm_b_norm" },
-            { LLM_TENSOR_SSM_C_NORM,      "blk.%d.ssm_c_norm" },
-            { LLM_TENSOR_SSM_D,           "blk.%d.ssm_d" },
-            { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
-            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
-        },
-    },
-    {
-        LLM_ARCH_FALCON_H1,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_SSM_IN,          "blk.%d.ssm_in" },
-            { LLM_TENSOR_SSM_CONV1D,      "blk.%d.ssm_conv1d" },
-            { LLM_TENSOR_SSM_DT,          "blk.%d.ssm_dt" },
-            { LLM_TENSOR_SSM_A,           "blk.%d.ssm_a" },
-            { LLM_TENSOR_SSM_D,           "blk.%d.ssm_d" },
-            { LLM_TENSOR_SSM_NORM,        "blk.%d.ssm_norm" },
-            { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
    {
        LLM_ARCH_XVERSE,
        {
@@ -1644,43 +1582,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
        },
    },
-    {
-        LLM_ARCH_GRANITE_HYBRID,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
-            { LLM_TENSOR_OUTPUT,         "output" },
-            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
-            // mamba(2) ssm layers
-            { LLM_TENSOR_SSM_IN,         "blk.%d.ssm_in" },
-            { LLM_TENSOR_SSM_CONV1D,     "blk.%d.ssm_conv1d" },
-            { LLM_TENSOR_SSM_DT,         "blk.%d.ssm_dt" },
-            { LLM_TENSOR_SSM_A,          "blk.%d.ssm_a" },
-            { LLM_TENSOR_SSM_D,          "blk.%d.ssm_d" },
-            { LLM_TENSOR_SSM_NORM,       "blk.%d.ssm_norm" },
-            { LLM_TENSOR_SSM_OUT,        "blk.%d.ssm_out" },
-            // attention layers
-            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
-            // dense FFN
-            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
-            // moe FFN
-            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE_INP,   "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
-            { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
-            // shared expert
-            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
-            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
-            { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
-        },
-    },
    {
        LLM_ARCH_CHAMELEON,
        {
@@ -1793,67 +1694,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
        },
    },
-    {
-        LLM_ARCH_HUNYUAN_MOE,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
-            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
-            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
-            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
-            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
-        },
-    },
-    {
-        LLM_ARCH_SMOLLM3,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
-            { LLM_TENSOR_OUTPUT,         "output" },
-            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_LFM2,
-        {
-            { LLM_TENSOR_ATTN_NORM,         "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,            "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,            "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,            "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,          "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_K_NORM,       "blk.%d.attn_k_norm" },
-            { LLM_TENSOR_ATTN_Q_NORM,       "blk.%d.attn_q_norm" },
-            { LLM_TENSOR_FFN_DOWN,          "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_GATE,          "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_NORM,          "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_UP,            "blk.%d.ffn_up" },
-            { LLM_TENSOR_SHORTCONV_CONV,    "blk.%d.shortconv.conv" },
-            { LLM_TENSOR_SHORTCONV_INPROJ,  "blk.%d.shortconv.in_proj" },
-            { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
-            { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
-        }
-    },
    {
        LLM_ARCH_UNKNOWN,
        {
@@ -1938,9 +1778,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_FFN_ACT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
    {LLM_TENSOR_SSM_CONV1D,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
    {LLM_TENSOR_SSM_A,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
-    {LLM_TENSOR_SSM_DT_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_SSM_B_NORM,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_SSM_C_NORM,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_D,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_NORM,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_TIME_MIX_LERP_X,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -2021,9 +1858,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_CONVNEXT_PW1,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CONVNEXT_PW2,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CONVNEXT_GAMMA,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_SHORTCONV_CONV,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
-    {LLM_TENSOR_SHORTCONV_INPROJ,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_SHORTCONV_OUTPROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
 };

 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -2091,12 +1925,9 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
 }

 bool llm_arch_is_hybrid(const llm_arch & arch) {
+    // TODO: There are currently no hybrid models! Once there are, this will be
+    //  the place to identify them
    switch (arch) {
-        case LLM_ARCH_JAMBA:
-        case LLM_ARCH_FALCON_H1:
-        case LLM_ARCH_GRANITE_HYBRID:
-        case LLM_ARCH_LFM2:
-            return true;
        default:
            return false;
    }
@@ -50,8 +50,6 @@ enum llm_arch {
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
    LLM_ARCH_MAMBA2,
-    LLM_ARCH_JAMBA,
-    LLM_ARCH_FALCON_H1,
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_COHERE2,
@@ -77,7 +75,6 @@ enum llm_arch {
    LLM_ARCH_ARWKV7,
    LLM_ARCH_GRANITE,
    LLM_ARCH_GRANITE_MOE,
-    LLM_ARCH_GRANITE_HYBRID,
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_PLM,
@@ -85,9 +82,6 @@ enum llm_arch {
    LLM_ARCH_DOTS1,
    LLM_ARCH_ARCEE,
    LLM_ARCH_ERNIE4_5,
-    LLM_ARCH_HUNYUAN_MOE,
-    LLM_ARCH_SMOLLM3,
-    LLM_ARCH_LFM2,
    LLM_ARCH_UNKNOWN,
 };

@@ -160,6 +154,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_LAYER_INDICES,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -228,8 +223,6 @@ enum llm_kv {

    LLM_KV_CLASSIFIER_OUTPUT_LABELS,

-    LLM_KV_SHORTCONV_L_CACHE,
-
    // deprecated:
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -300,10 +293,7 @@ enum llm_tensor {
    LLM_TENSOR_SSM_CONV1D,
    LLM_TENSOR_SSM_X,
    LLM_TENSOR_SSM_DT,
-    LLM_TENSOR_SSM_DT_NORM,
    LLM_TENSOR_SSM_A,
-    LLM_TENSOR_SSM_B_NORM,
-    LLM_TENSOR_SSM_C_NORM,
    LLM_TENSOR_SSM_D,
    LLM_TENSOR_SSM_NORM,
    LLM_TENSOR_SSM_OUT,
@@ -399,9 +389,6 @@ enum llm_tensor {
    LLM_TENSOR_POS_NET_ATTN_K,
    LLM_TENSOR_POS_NET_ATTN_V,
    LLM_TENSOR_POS_NET_ATTN_OUT,
-    LLM_TENSOR_SHORTCONV_CONV,
-    LLM_TENSOR_SHORTCONV_INPROJ,
-    LLM_TENSOR_SHORTCONV_OUTPROJ,
 };

 enum llm_tensor_layer {
@@ -64,7 +64,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "bailing",           LLM_CHAT_TEMPLATE_BAILING           },
    { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
    { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM           },
-    { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -170,7 +169,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
        // EXAONE-3.0-7.8B-Instruct
        return LLM_CHAT_TEMPLATE_EXAONE_3;
-    } else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
+    } else if (tmpl_contains("rwkv-world")) {
        return LLM_CHAT_TEMPLATE_RWKV_WORLD;
    } else if (tmpl_contains("<|start_of_role|>")) {
        return LLM_CHAT_TEMPLATE_GRANITE;
@@ -186,8 +185,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_LLAMA4;
    } else if (tmpl_contains("<|endofuserprompt|>")) {
        return LLM_CHAT_TEMPLATE_DOTS1;
-    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
-        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -668,18 +665,6 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|response|>";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) {
-        // tencent/Hunyuan-A13B-Instruct
-        for (auto message : chat) {
-            std::string role(message->role);
-            if (role == "system") {
-                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
-            } else if (role == "assistant") {
-                ss << "<|startoftext|>" << message->content << "<|eos|>";
-            } else {
-                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
-            }
-        }
    } else {
        // template not supported
        return -1;
@@ -44,7 +44,6 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_LLAMA4,
    LLM_CHAT_TEMPLATE_SMOLVLM,
    LLM_CHAT_TEMPLATE_DOTS1,
-    LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };

@@ -731,8 +731,7 @@ int llama_context::encode(const llama_batch & batch_inp) {

    const auto & hparams = model.hparams;

-    const int64_t n_embd  = hparams.n_embd;
-    const int32_t n_vocab = model.vocab.n_tokens();
+    const int64_t n_embd = hparams.n_embd;

    // note: during encode, we always pass the full sequence starting from pos = 0
    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
@@ -792,20 +791,10 @@ int llama_context::encode(const llama_batch & batch_inp) {
        }
    }

-    auto * t_logits = res->get_logits();
    auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();

-    // extract logits
-   if (logits && t_logits) {
-        ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
-        GGML_ASSERT(backend_res != nullptr);
-        GGML_ASSERT(logits != nullptr);
-
-        ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float));
-    }
-
    // extract embeddings
-    if (embd && t_embd) {
+    if (t_embd) {
        ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
        GGML_ASSERT(backend_embd != nullptr);

@@ -336,8 +336,29 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
-    inp_attn->set_input(ubatch);
-    inp_rs->set_input(ubatch);
+    mctx->get_attn()->set_input_k_idxs(self_k_idxs, ubatch);
+    mctx->get_attn()->set_input_v_idxs(self_v_idxs, ubatch);
+
+    mctx->get_attn()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+
+    const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+    if (s_copy) {
+        GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer));
+        int32_t * data = (int32_t *) s_copy->data;
+
+        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+        for (uint32_t i = 0; i < n_rs; ++i) {
+            data[i] = mctx->get_recr()->s_copy(i);
+        }
+    }
+}
+
+void llm_graph_input_one::set_input(const llama_ubatch * ubatch) {
+    GGML_UNUSED(ubatch);
+    GGML_ASSERT(one && ggml_nelements(one) == 1);
+    float f_one = 1.0f;
+    ggml_backend_tensor_set(one, &f_one, 0, sizeof(float));
 }

 //
@@ -971,6 +992,35 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t
    return pos_bias;
 }

+llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
+    const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
+
+    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(hparams, cparams, mctx_cur);
+
+    {
+        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Hybrid recurrent is not supported with SWA attention layers");
+
+        const auto n_kv = inp->mctx->get_attn()->get_n_kv();
+
+        inp->self_k_idxs = mctx_cur->get_attn()->build_input_k_idxs(ctx0, ubatch);
+        inp->self_v_idxs = mctx_cur->get_attn()->build_input_v_idxs(ctx0, ubatch);
+
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        ggml_set_input(inp->self_kq_mask);
+
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    }
+
+    {
+        const auto n_rs = mctx_cur->get_recr()->get_n_rs();
+
+        inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
+        ggml_set_input(inp->s_copy);
+    }
+
+    return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
+}
+
 ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_cgraph * gf,
         ggml_tensor * q,
@@ -1144,12 +1194,8 @@ ggml_tensor * llm_graph_context::build_attn(
    return cur;
 }

-static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unified_impl(
-           ggml_context * ctx0,
-     const llama_ubatch & ubatch,
-    const llama_hparams & hparams,
-    const llama_cparams & cparams,
-    const llama_kv_cache_unified_context * mctx_cur) {
+llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
+    const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);

    auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, mctx_cur);

@@ -1157,7 +1203,6 @@ static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unifie
        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");

        const auto n_kv = mctx_cur->get_n_kv();
-        const auto n_tokens = ubatch.n_tokens;

        inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
        inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
@@ -1168,14 +1213,6 @@ static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unifie
        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
    }

-    return inp;
-}
-
-llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
-    const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);
-
-    auto inp = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
-
    return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
 }

@@ -1197,7 +1234,7 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_build_forward_expand(gf, k_cur);
    ggml_build_forward_expand(gf, v_cur);

-    const auto * mctx_cur = inp->mctx;
+    const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);

    // store to KV cache
    {
@@ -1256,7 +1293,7 @@ ggml_tensor * llm_graph_context::build_attn(
        ggml_build_forward_expand(gf, v_cur);
    }

-    const auto * mctx_iswa = inp->mctx;
+    const auto * mctx_iswa = static_cast<const llama_kv_cache_unified_iswa_context *>(mctx);

    const bool is_swa = hparams.is_swa(il);

@@ -1354,9 +1391,59 @@ ggml_tensor * llm_graph_context::build_attn(
    return cur;
 }

-// TODO: maybe separate the inner implementation into a separate function
-//       like with the non-sliding window equivalent
-//       once sliding-window hybrid caches are a thing.
+ggml_tensor * llm_graph_context::build_attn( // hybrid-memory overload: attention runs against the get_attn() part of the hybrid context
+        llm_graph_input_mem_hybrid * inp, // supplies the K/V index tensors and the KQ mask
+        ggml_cgraph * gf,
+        ggml_tensor * wo, // output projection weight; the projection is skipped when null
+        ggml_tensor * wo_b, // bias added to the result; skipped when null
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
+            float     kq_scale,
+            int       il) const { // il: layer index passed to the cache accessors and to cb()
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, k_cur);
+    ggml_build_forward_expand(gf, v_cur);
+
+    const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_attn(); // unchecked cast: mctx is assumed to be a hybrid context
+
+    // store the current K and V into the KV cache at the positions given by the input's index tensors
+    {
+        const auto & k_idxs = inp->get_k_idxs();
+        const auto & v_idxs = inp->get_v_idxs();
+
+        ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
+        ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, v_idxs, il));
+    }
+
+    const auto & kq_mask = inp->get_kq_mask();
+
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = mctx_cur->get_k(ctx0, il); // K and V are read back from the cache for layer il
+    ggml_tensor * v = mctx_cur->get_v(ctx0, il);
+
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    cb(cur, "kqv_out", il);
+
+    if (wo) {
+        cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
+    }
+
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
+
+    return cur;
+}
+
 llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
    const auto * mctx_cur = static_cast<const llama_kv_cache_unified_iswa_context *>(mctx);

@@ -1426,9 +1513,8 @@ ggml_tensor * llm_graph_context::build_rs(
    return output_states;
 }

-static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
-           ggml_context * ctx0,
-    const llama_memory_recurrent_context * mctx_cur) {
+llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
+    const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);

    auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);

@@ -1437,14 +1523,6 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
    inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
    ggml_set_input(inp->s_copy);

-    return inp;
-}
-
-llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
-    const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
-
-    auto inp = build_rs_inp_impl(ctx0, mctx_cur);
-
    return (llm_graph_input_rs *) res->add_input(std::move(inp));
 }

@@ -1455,7 +1533,19 @@ ggml_tensor * llm_graph_context::build_rs(
            int32_t   state_size,
            int32_t   n_seqs,
        const llm_graph_get_rows_fn & get_state_rows) const {
-    const auto * kv_state = inp->mctx;
+    const auto * kv_state = static_cast<const llama_memory_recurrent_context *>(mctx);
+
+    return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
+}
+
+ggml_tensor * llm_graph_context::build_rs( // hybrid-memory overload: recurrent state comes from the get_recr() part of the hybrid context
+        llm_graph_input_mem_hybrid * inp, // only inp->s_copy is read here
+        ggml_cgraph * gf,
+        ggml_tensor * s,
+            int32_t   state_size,
+            int32_t   n_seqs,
+        const llm_graph_get_rows_fn & get_state_rows) const {
+    const auto * kv_state = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr(); // unchecked cast: mctx is assumed to be a hybrid context

    return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
 }
@@ -1502,17 +1592,6 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
    );
 }

-llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
-    const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
-
-    auto inp_rs   = build_rs_inp_impl(ctx0, mctx_cur->get_recr());
-    auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
-
-    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
-
-    return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
-}
-
 void llm_graph_context::build_pooling(
        ggml_cgraph * gf,
        ggml_tensor * cls,
@@ -322,25 +322,47 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
    llm_graph_input_mem_hybrid(
-            std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
-            std::unique_ptr<llm_graph_input_rs>              inp_rs,
-            const llama_memory_hybrid_context *              mctx) :
-        inp_attn(std::move(inp_attn)),
-        inp_rs(std::move(inp_rs)),
-        mctx(mctx) { }
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_memory_hybrid_context * mctx) :
+        hparams(hparams),
+        cparams(cparams),
+        mctx(mctx) {
+    }
    virtual ~llm_graph_input_mem_hybrid() = default;

    void set_input(const llama_ubatch * ubatch) override;

-    std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
-    std::unique_ptr<llm_graph_input_rs>              inp_rs;
+    ggml_tensor * s_copy; // I32 [n_rs] (allocated as a 1-D tensor of n_rs elements by the graph builder); NOTE: no default initializer, unlike the members below

-    llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
-    llm_graph_input_rs              * get_recr() const { return inp_rs.get(); }
+    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+
+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } // returns the converted mask, not the raw F32 self_kq_mask
+
+    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch, 1, 1]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch, 1, 1]
+
+    const llama_hparams & hparams; // stored by reference, not copied
+    const llama_cparams & cparams; // stored by reference, not copied

    const llama_memory_hybrid_context * mctx;
 };

+// TODO: remove this when ggml_scale_add is implemented
+class llm_graph_input_one : public llm_graph_input_i { // graph input that carries a single tensor member, `one`
+public:
+    llm_graph_input_one() {} // nothing to set up; `one` keeps its nullptr default
+    virtual ~llm_graph_input_one() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared only; the definition is not part of this hunk
+
+    ggml_tensor * one = nullptr; // F32
+};
+
 //
 // llm_graph_result
 //
@@ -557,6 +579,8 @@ struct llm_graph_context {
    ggml_tensor * build_inp_pos_bucket_dec() const;
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;

+    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
+
    //
    // attention
    //
@@ -632,6 +656,18 @@ struct llm_graph_context {
                  float   kq_scale,
                    int   il) const;

+    ggml_tensor * build_attn(
+            llm_graph_input_mem_hybrid * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+                  float   kq_scale,
+                    int   il) const;
    //
    // recurrent
    //
@@ -664,6 +700,14 @@ struct llm_graph_context {
                int32_t   n_seqs,
            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;

+    ggml_tensor * build_rs(
+            llm_graph_input_mem_hybrid * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * s,
+                int32_t   state_size,
+                int32_t   n_seqs,
+            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
+
    ggml_tensor * build_rwkv_token_shift_load(
        llm_graph_input_rs * inp,
               ggml_cgraph * gf,
@@ -674,11 +718,6 @@ struct llm_graph_context {
             ggml_tensor * token_shift,
      const llama_ubatch & ubatch,
                     int   il) const;
-    //
-    // hybrid
-    //
-
-    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;

    //
    // pooling
@@ -71,11 +71,6 @@ uint32_t llama_hparams::n_embd_r() const {
        return token_shift_count * n_embd;
    }

-    if (n_shortconv_l_cache != 0) {
-        // for LFM2 models
-        return n_embd * (n_shortconv_l_cache - 1);
-    }
-
    // TODO: maybe support other convolution strides than 1
    // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
    // Corresponds to Mamba's conv_states size
@@ -55,8 +55,6 @@ struct llama_hparams {
    struct llama_hparams_posnet   posnet;
    struct llama_hparams_convnext convnext;

-    uint32_t n_shortconv_l_cache  = 0;
-
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -25,6 +25,9 @@ llama_memory_recurrent::llama_memory_recurrent(
                 uint32_t    n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
    const int32_t n_layer = hparams.n_layer;

+    LLAMA_LOG_INFO("%s: mem_size = %u, n_seq_max = %u, type_r = '%s', type_s = '%s', n_layer = %d\n",
+            __func__, mem_size, n_seq_max, ggml_type_name(type_r), ggml_type_name(type_s), n_layer);
+
    head = 0;
    size = mem_size;
    used = 0;
@@ -81,7 +84,7 @@ llama_memory_recurrent::llama_memory_recurrent(

        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
-            throw std::runtime_error("failed to create ggml context for rs cache");
+            throw std::runtime_error("failed to create ggml context for kv cache");
        }

        ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
@@ -99,10 +102,10 @@ llama_memory_recurrent::llama_memory_recurrent(

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
-            throw std::runtime_error("failed to allocate buffer for rs cache");
+            throw std::runtime_error("failed to allocate buffer for kv cache");
        }
        ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
        bufs.emplace_back(buf);
    }

@@ -110,8 +113,8 @@ llama_memory_recurrent::llama_memory_recurrent(
        const size_t memory_size_r = size_r_bytes();
        const size_t memory_size_s = size_s_bytes();

-        LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
-                (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
+        LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
+                (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f),
                ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
                ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
    }
@@ -32,21 +32,17 @@ enum llm_type {
    LLM_TYPE_190M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
-    LLM_TYPE_256M,
    LLM_TYPE_270M,
    LLM_TYPE_335M,
-    LLM_TYPE_350M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
-    LLM_TYPE_700M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_0_3B,
    LLM_TYPE_0_5B,
    LLM_TYPE_0_6B,
    LLM_TYPE_1B,
-    LLM_TYPE_1_2B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
@@ -98,7 +94,6 @@ enum llm_type {
    LLM_TYPE_57B_A14B,
    LLM_TYPE_17B_16E, // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
-    LLM_TYPE_A13B,
    LLM_TYPE_30B_A3B,
    LLM_TYPE_235B_A22B,
    LLM_TYPE_E2B,
@@ -158,12 +153,6 @@ struct llama_layer_convnext {
    struct ggml_tensor * gamma = nullptr;
 };

-struct llama_layer_shortconv {
-    struct ggml_tensor * in_proj  = nullptr;
-    struct ggml_tensor * conv     = nullptr;
-    struct ggml_tensor * out_proj = nullptr;
-};
-
 struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm       = nullptr;
@@ -184,9 +173,6 @@ struct llama_layer {
    struct ggml_tensor * attn_norm_cross = nullptr;
    struct ggml_tensor * attn_norm_enc   = nullptr;
    struct ggml_tensor * ssm_norm        = nullptr;
-    struct ggml_tensor * ssm_dt_norm     = nullptr;
-    struct ggml_tensor * ssm_b_norm      = nullptr;
-    struct ggml_tensor * ssm_c_norm      = nullptr;

    // attention
    struct ggml_tensor * wq        = nullptr;
@@ -350,8 +336,6 @@ struct llama_layer {
    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;
-
-    struct llama_layer_shortconv shortconv;
 };

 struct llama_model {
@@ -844,7 +844,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // do not quantize Mamba's small yet 2D weights
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
-        quantize &= name.find("shortconv.conv.weight") == std::string::npos;

        // do not quantize RWKV's small yet 2D weights
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
@@ -884,7 +883,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                            if  (qtype != new_type) {
                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
-                                new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
+                                new_type = qtype;
+                                break; // stop at the first matching override whose type differs from the current one (a match with the same type does not end the search)
                            }
                        }
                    }
@@ -351,7 +351,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                break;
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
-            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1523,10 +1522,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "llama-v3" ||
                    tokenizer_pre == "llama-bpe"||
                    tokenizer_pre == "falcon3"  ||
-                    tokenizer_pre == "falcon-h1" ||
-                    tokenizer_pre == "pixtral"  ||
-                    tokenizer_pre == "midm-2.0" ||
-                    tokenizer_pre == "lfm2") {
+                    tokenizer_pre == "pixtral") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                ignore_merges = true;
                add_bos = true;
@@ -1558,8 +1554,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "jina-de" ||
                    tokenizer_pre == "gigachat"   ||
                    tokenizer_pre == "jina-v2-es" ||
-                    tokenizer_pre == "jina-v2-de" ||
-                    tokenizer_pre == "a.x-4.0") {
+                    tokenizer_pre == "jina-v2-de") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
@@ -1661,10 +1656,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "seed-coder") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                clean_spaces = false;
-            } else if (
-                tokenizer_pre == "hunyuan") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
-                clean_spaces = false;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -1848,7 +1839,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<EOT>"
                        || t.first == "_<EOT>"
                        || t.first == "<｜end▁of▁sentence｜>" // DeepSeek
-                        || t.first == "<end_of_utterance>" // smoldocling
                   ) {
                    special_eot_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2008,7 +1998,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<EOT>"
                    || t.first == "_<EOT>"
                    || t.first == "<|end_of_text|>"
-                    || t.first == "<end_of_utterance>" // smoldocling
               ) {
                special_eog_ids.insert(t.second);
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -6,47 +6,6 @@
 #include <vector>
 #include <memory>

-// pre-tokenization types
-enum llama_vocab_pre_type {
-    LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-    LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-    LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-    LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-    LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
-    LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
-    LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-    LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
-    LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
-    LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
-    LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
-    LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-    LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
-    LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-    LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-    LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-    LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-    LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
-    LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-    LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
-    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
-    LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
-    LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-    LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-    LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-    LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
-    LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
-    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
-    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
-};
-
 struct LLM_KV;
 struct llama_model_loader;

@@ -317,11 +317,10 @@ enum test_mode {
    MODE_TEST,
    MODE_PERF,
    MODE_GRAD,
-    MODE_SUPPORT,
 };

 // Output format support similar to llama-bench
-enum output_formats { CONSOLE, SQL, CSV };
+enum output_formats { CONSOLE, SQL };

 static const char * output_format_str(output_formats format) {
    switch (format) {
@@ -329,8 +328,6 @@ static const char * output_format_str(output_formats format) {
            return "console";
        case SQL:
            return "sql";
-        case CSV:
-            return "csv";
        default:
            GGML_ABORT("invalid output format");
    }
@@ -341,8 +338,6 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
        format = CONSOLE;
    } else if (s == "sql") {
        format = SQL;
-    } else if (s == "csv") {
-        format = CSV;
    } else {
        return false;
    }
@@ -365,8 +360,6 @@ struct test_result {
    double      bandwidth_gb_s;
    size_t      memory_kb;
    int         n_runs;
-    std::string device_description;
-    std::string backend_reg_name;

    test_result() {
        // Initialize with default values
@@ -391,7 +384,7 @@ struct test_result {
    test_result(const std::string & backend_name, const std::string & op_name, const std::string & op_params,
                const std::string & test_mode, bool supported, bool passed, const std::string & error_message = "",
                double time_us = 0.0, double flops = 0.0, double bandwidth_gb_s = 0.0, size_t memory_kb = 0,
-                int n_runs = 0, const std::string & device_description = "", const std::string & backend_reg_name = "") :
+                int n_runs = 0) :
        backend_name(backend_name),
        op_name(op_name),
        op_params(op_params),
@@ -403,9 +396,7 @@ struct test_result {
        flops(flops),
        bandwidth_gb_s(bandwidth_gb_s),
        memory_kb(memory_kb),
-        n_runs(n_runs),
-        device_description(device_description),
-        backend_reg_name(backend_reg_name) {
+        n_runs(n_runs) {
        // Set test time
        time_t t = time(NULL);
        char   buf[32];
@@ -419,8 +410,7 @@ struct test_result {
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "test_time", "build_commit",  "backend_name", "op_name", "op_params",      "test_mode", "supported",
-            "passed",    "error_message", "time_us",      "flops",   "bandwidth_gb_s", "memory_kb", "n_runs",
-            "device_description", "backend_reg_name"
+            "passed",    "error_message", "time_us",      "flops",   "bandwidth_gb_s", "memory_kb", "n_runs"
        };
        return fields;
    }
@@ -454,9 +444,7 @@ struct test_result {
                 std::to_string(flops),
                 std::to_string(bandwidth_gb_s),
                 std::to_string(memory_kb),
-                 std::to_string(n_runs),
-                 device_description,
-                 backend_reg_name };
+                 std::to_string(n_runs) };
    }
 };

@@ -645,8 +633,6 @@ struct console_printer : public printer {
            print_test_console(result);
        } else if (result.test_mode == "perf") {
            print_perf_console(result);
-        } else if (result.test_mode == "support") {
-            print_support_console(result);
        }
    }

@@ -813,17 +799,6 @@ struct console_printer : public printer {
        }
        printf("\n");
    }
-
-    void print_support_console(const test_result & result) {
-        printf("  %s(%s): ", result.op_name.c_str(), result.op_params.c_str());
-        fflush(stdout);
-
-        if (result.supported) {
-            printf("\033[1;32mSUPPORTED\033[0m\n");
-        } else {
-            printf("\033[1;31mNOT SUPPORTED\033[0m\n");
-        }
-    }
 };

 struct sql_printer : public printer {
@@ -866,39 +841,12 @@ struct sql_printer : public printer {
    }
 };

-struct csv_printer : public printer {
-    void print_header() override {
-        std::vector<std::string> fields = test_result::get_fields();
-        for (size_t i = 0; i < fields.size(); i++) {
-            printf("\"%s\"%s", fields[i].c_str(), i < fields.size() - 1 ? "," : "");
-        }
-        printf("\n");
-    }
-
-    void print_test_result(const test_result & result) override {
-        std::vector<std::string> values = result.get_values();
-        for (size_t i = 0; i < values.size(); i++) {
-            // Escape quotes and wrap in quotes for CSV
-            std::string escaped_value = values[i];
-            size_t pos = 0;
-            while ((pos = escaped_value.find("\"", pos)) != std::string::npos) {
-                escaped_value.replace(pos, 1, "\"\"");
-                pos += 2;
-            }
-            printf("\"%s\"%s", escaped_value.c_str(), i < values.size() - 1 ? "," : "");
-        }
-        printf("\n");
-    }
-};
-
 static std::unique_ptr<printer> create_printer(output_formats format) {
    switch (format) {
        case CONSOLE:
            return std::make_unique<console_printer>();
        case SQL:
            return std::make_unique<sql_printer>();
-        case CSV:
-            return std::make_unique<csv_printer>();
    }
    GGML_ABORT("invalid output format");
 }
@@ -980,7 +928,7 @@ struct test_case {
    std::vector<ggml_tensor *> sentinels;

    void add_sentinel(ggml_context * ctx) {
-        if (mode == MODE_PERF || mode == MODE_GRAD || mode == MODE_SUPPORT) {
+        if (mode == MODE_PERF || mode == MODE_GRAD) {
            return;
        }
        ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
@@ -1205,12 +1153,15 @@ struct test_case {
            return true;
        }

+        // check if backends support op
        if (!ggml_backend_supports_op(backend, out)) {
            // Create test result for unsupported performance test
            test_result result(ggml_backend_name(backend), current_op_name, vars(), "perf", false, false,
                               "not supported");

-            output_printer->print_test_result(result);
+            if (output_printer) {
+                output_printer->print_test_result(result);
+            }

            return true;
        }
@@ -1315,38 +1266,6 @@ struct test_case {
        return true;
    }

-    bool eval_support(ggml_backend_t backend, const char * op_name, printer * output_printer) {
-        mode = MODE_SUPPORT;
-
-        static const size_t graph_nodes = 8192;
-
-        ggml_init_params params = {
-            /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
-            /* .mem_base = */ NULL,
-            /* .no_alloc = */ true,
-        };
-        ggml_context_ptr ctx(ggml_init(params)); // smart ptr
-        GGML_ASSERT(ctx);
-
-        ggml_tensor * out             = build_graph(ctx.get());
-        std::string   current_op_name = op_desc(out);
-        if (op_name != nullptr && current_op_name != op_name) {
-            return true;
-        }
-
-        bool supported = ggml_backend_supports_op(backend, out);
-
-        std::string device_desc = ggml_backend_dev_description(ggml_backend_get_device(backend));
-        std::string backend_reg_name = ggml_backend_reg_name(ggml_backend_dev_backend_reg(ggml_backend_get_device(backend)));
-
-        test_result result(ggml_backend_name(backend), current_op_name, vars(), "support", supported, supported,
-                           supported ? "yes" : "no", 0.0, 0.0, 0.0, 0, 0, device_desc, backend_reg_name);
-
-        output_printer->print_test_result(result);
-
-        return true;
-    }
-
    bool eval_grad(ggml_backend_t backend, const char * op_name, printer * output_printer) {
        mode = MODE_GRAD;
        const std::vector<float> expect = grad_expect();
@@ -2449,24 +2368,22 @@ struct test_scale : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    float scale;
-    float bias;

    std::string vars() override {
-        return VARS_TO_STR4(type, ne, scale, bias);
+        return VARS_TO_STR3(type, ne, scale);
    }

    test_scale(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            float scale = 2.0f,
-            float bias = 0.0f)
-        : type(type), ne(ne), scale(scale), bias(bias) {}
+            float scale = 2.0f)
+        : type(type), ne(ne), scale(scale) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_param(a);
        ggml_set_name(a, "a");

-        ggml_tensor * out = ggml_scale_bias(ctx, a, scale, bias);
+        ggml_tensor * out = ggml_scale(ctx, a, scale);
        ggml_set_name(out, "out");

        return out;
@@ -4114,32 +4031,6 @@ struct test_pad_reflect_1d : public test_case {
    }
 };

-// GGML_OP_ROLL
-struct test_roll : public test_case {
-    const int shift0;
-    const int shift1;
-    const int shift3;
-    const int shift4;
-
-    std::string vars() override {
-        return VARS_TO_STR4(shift0, shift1, shift3, shift4);
-    }
-
-    test_roll(int shift0 = 3, int shift1 = -2, int shift3 = 1, int shift4 = -1)
-        : shift0(shift0), shift1(shift1), shift3(shift3), shift4(shift4) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        int64_t ne[4] = {10, 5, 4, 3};
-        ggml_tensor * a = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-        ggml_set_name(a, "a");
-
-        ggml_tensor * out = ggml_roll(ctx, a, shift0, shift1, shift3, shift4);
-        ggml_set_name(out, "out");
-
-        return out;
-    }
-};
-
 // GGML_OP_ARANGE
 struct test_arange : public test_case {
    const ggml_type type;
@@ -5153,7 +5044,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {

    test_cases.emplace_back(new test_add1());
    test_cases.emplace_back(new test_scale());
-    test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
    test_cases.emplace_back(new test_silu_back());

    for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) {
@@ -5170,17 +5060,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {

    test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));

-    for (int64_t d_conv : {3, 4}) {
-        for (int64_t d_inner: {1024, 1536, 2048}) {
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
-            test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, d_inner, 4, 1}, {d_conv, d_inner, 1, 1}));
-        }
-    }
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));

    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1, 1024, 1, 32, 4)); // Mamba-1
    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 16, 2, 32, 4)); // Mamba-2
-    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 256, 64,  8, 2, 32, 4)); // Falcon-H1

    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
@@ -5438,12 +5323,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    for (bool fw : {true, false}) { // fw == forward
        bool all = true;

-        for (float fs : { 1.0f, 1.4245f }) {
-            for (float ef : { 0.0f, 0.7465f }) {
-                for (float af : { 1.0f, 1.4245f }) {
-                    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-                        for (bool ff : {false, true}) { // freq_factors
-                            for (float v : { 0, 1 }) {
+        for (float v : { 0, 1 }) {
+            for (float fs : { 1.0f, 1.4245f }) {
+                for (float ef : { 0.0f, 0.7465f }) {
+                    for (float af : { 1.0f, 1.4245f }) {
+                        for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+                            for (bool ff : {false, true}) { // freq_factors
                                test_cases.emplace_back(new test_rope(type, {128,  32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 7B

                                if (all) {
@@ -5456,21 +5341,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                    test_cases.emplace_back(new test_rope(type, { 64,   1, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
                                    test_cases.emplace_back(new test_rope(type, { 64,  71, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
                                    test_cases.emplace_back(new test_rope(type, { 64,   8, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
-
-                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  20, 0, 512, fs, ef, af, ff, v, fw));
-                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  32, 0, 512, fs, ef, af, ff, v, fw));
-                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 4, 1},  32, 0, 512, fs, ef, af, ff, v, fw));
-
                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  20, 2, 512, fs, ef, af, ff, v, fw)); // neox (stablelm)
                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  32, 2, 512, fs, ef, af, ff, v, fw)); // neox (phi-2)
-                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 4, 1},  32, 2, 512, fs, ef, af, ff, v, fw)); // neox (phi-2)
                                }

                                if (all) {
                                    test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B)
                                    test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B)
-                                    test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1},  20, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw));
-                                    test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1},  32, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw));
                                    test_cases.emplace_back(new test_rope(type, { 80,  16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
                                }

@@ -5514,7 +5391,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());
    test_cases.emplace_back(new test_pad_reflect_1d());
-    test_cases.emplace_back(new test_roll());
    test_cases.emplace_back(new test_arange());
    test_cases.emplace_back(new test_timestep_embedding());
    test_cases.emplace_back(new test_leaky_relu());
@@ -5711,27 +5587,17 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        return true;
    }

-    if (mode == MODE_SUPPORT) {
-        auto test_cases = make_test_cases_eval();
-        filter_test_cases(test_cases, params_filter);
-        for (auto & test : test_cases) {
-            test->eval_support(backend, op_name, output_printer);
-        }
-        return true;
-    }
-
    GGML_ABORT("fatal error");
 }

 static void usage(char ** argv) {
-    printf("Usage: %s [mode] [-o <op>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>]\n", argv[0]);
+    printf("Usage: %s [mode] [-o <op>] [-b <backend>] [-p <params regex>] [--output <console|sql>]\n", argv[0]);
    printf("    valid modes:\n");
    printf("      - test (default, compare with CPU backend for correctness)\n");
    printf("      - grad (compare gradients from backpropagation with method of finite differences)\n");
    printf("      - perf (performance evaluation)\n");
-    printf("      - support (probe backend operation support)\n");
    printf("    op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc)\n");
-    printf("    --output specifies output format (default: console, options: console, sql, csv)\n");
+    printf("    --output specifies output format (default: console)\n");
 }

 int main(int argc, char ** argv) {
@@ -5748,8 +5614,6 @@ int main(int argc, char ** argv) {
            mode = MODE_PERF;
        } else if (strcmp(argv[i], "grad") == 0) {
            mode = MODE_GRAD;
-        } else if (strcmp(argv[i], "support") == 0) {
-            mode = MODE_SUPPORT;
        } else if (strcmp(argv[i], "-o") == 0) {
            if (i + 1 < argc) {
                op_name_filter = argv[++i];
@@ -7,7 +7,8 @@ if (LLAMA_CURL)
    find_package(CURL REQUIRED)
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
-    set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARIES})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()

 install(TARGETS ${TARGET} RUNTIME)
@@ -2581,14 +2581,12 @@ struct server_context {
                continue;
            }

-            const float * embd = nullptr;
-            if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) {
+            const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            if (embd == NULL) {
                embd = llama_get_embeddings_ith(ctx, i);
-            } else {
-                embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
            }

-            if (embd == nullptr) {
+            if (embd == NULL) {
                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);

                res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
@@ -2596,12 +2594,12 @@ struct server_context {
            }

            // normalize only when there is pooling
+            // TODO: configurable
            if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
                common_embd_normalize(embd, embd_res.data(), n_embd, 2);
                res->embedding.push_back(embd_res);
-                break;
            } else {
-                res->embedding.emplace_back(embd, embd + n_embd);
+                res->embedding.push_back({ embd, embd + n_embd });
            }
        }

@@ -4808,14 +4806,14 @@ int main(int argc, char ** argv) {
        // register static assets routes
        if (!params.public_path.empty()) {
            // Set the base directory for serving static files
-            bool is_found = svr->set_mount_point(params.api_prefix + "/", params.public_path);
+            bool is_found = svr->set_mount_point("/", params.public_path);
            if (!is_found) {
                LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
                return 1;
            }
        } else {
            // using embedded static index.html
-            svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) {
+            svr->Get("/", [](const httplib::Request & req, httplib::Response & res) {
                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
                } else {
@@ -4831,37 +4829,37 @@ int main(int argc, char ** argv) {
    }

    // register API routes
-    svr->Get (params.api_prefix + "/health",              handle_health); // public endpoint (no API key check)
-    svr->Get (params.api_prefix + "/metrics",             handle_metrics);
-    svr->Get (params.api_prefix + "/props",               handle_props);
-    svr->Post(params.api_prefix + "/props",               handle_props_change);
-    svr->Post(params.api_prefix + "/api/show",            handle_api_show);
-    svr->Get (params.api_prefix + "/models",              handle_models); // public endpoint (no API key check)
-    svr->Get (params.api_prefix + "/v1/models",           handle_models); // public endpoint (no API key check)
-    svr->Get (params.api_prefix + "/api/tags",            handle_models); // ollama specific endpoint. public endpoint (no API key check)
-    svr->Post(params.api_prefix + "/completion",          handle_completions); // legacy
-    svr->Post(params.api_prefix + "/completions",         handle_completions);
-    svr->Post(params.api_prefix + "/v1/completions",      handle_completions_oai);
-    svr->Post(params.api_prefix + "/chat/completions",    handle_chat_completions);
-    svr->Post(params.api_prefix + "/v1/chat/completions", handle_chat_completions);
-    svr->Post(params.api_prefix + "/api/chat",            handle_chat_completions); // ollama specific endpoint
-    svr->Post(params.api_prefix + "/infill",              handle_infill);
-    svr->Post(params.api_prefix + "/embedding",           handle_embeddings); // legacy
-    svr->Post(params.api_prefix + "/embeddings",          handle_embeddings);
-    svr->Post(params.api_prefix + "/v1/embeddings",       handle_embeddings_oai);
-    svr->Post(params.api_prefix + "/rerank",              handle_rerank);
-    svr->Post(params.api_prefix + "/reranking",           handle_rerank);
-    svr->Post(params.api_prefix + "/v1/rerank",           handle_rerank);
-    svr->Post(params.api_prefix + "/v1/reranking",        handle_rerank);
-    svr->Post(params.api_prefix + "/tokenize",            handle_tokenize);
-    svr->Post(params.api_prefix + "/detokenize",          handle_detokenize);
-    svr->Post(params.api_prefix + "/apply-template",      handle_apply_template);
+    svr->Get ("/health",              handle_health); // public endpoint (no API key check)
+    svr->Get ("/metrics",             handle_metrics);
+    svr->Get ("/props",               handle_props);
+    svr->Post("/props",               handle_props_change);
+    svr->Post("/api/show",            handle_api_show);
+    svr->Get ("/models",              handle_models); // public endpoint (no API key check)
+    svr->Get ("/v1/models",           handle_models); // public endpoint (no API key check)
+    svr->Get ("/api/tags",            handle_models); // ollama specific endpoint. public endpoint (no API key check)
+    svr->Post("/completion",          handle_completions); // legacy
+    svr->Post("/completions",         handle_completions);
+    svr->Post("/v1/completions",      handle_completions_oai);
+    svr->Post("/chat/completions",    handle_chat_completions);
+    svr->Post("/v1/chat/completions", handle_chat_completions);
+    svr->Post("/api/chat",            handle_chat_completions); // ollama specific endpoint
+    svr->Post("/infill",              handle_infill);
+    svr->Post("/embedding",           handle_embeddings); // legacy
+    svr->Post("/embeddings",          handle_embeddings);
+    svr->Post("/v1/embeddings",       handle_embeddings_oai);
+    svr->Post("/rerank",              handle_rerank);
+    svr->Post("/reranking",           handle_rerank);
+    svr->Post("/v1/rerank",           handle_rerank);
+    svr->Post("/v1/reranking",        handle_rerank);
+    svr->Post("/tokenize",            handle_tokenize);
+    svr->Post("/detokenize",          handle_detokenize);
+    svr->Post("/apply-template",      handle_apply_template);
    // LoRA adapters hotswap
-    svr->Get (params.api_prefix + "/lora-adapters",       handle_lora_adapters_list);
-    svr->Post(params.api_prefix + "/lora-adapters",       handle_lora_adapters_apply);
+    svr->Get ("/lora-adapters",       handle_lora_adapters_list);
+    svr->Post("/lora-adapters",       handle_lora_adapters_apply);
    // Save & load slots
-    svr->Get (params.api_prefix + "/slots",               handle_slots);
-    svr->Post(params.api_prefix + "/slots/:id_slot",      handle_slots_action);
+    svr->Get ("/slots",               handle_slots);
+    svr->Post("/slots/:id_slot",      handle_slots_action);

    //
    // Start the server
@@ -11,8 +11,6 @@

 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
-// increase backlog size to avoid connection resets for >> 1 slots
-#define CPPHTTPLIB_LISTEN_BACKLOG 512
 // disable Nagle's algorithm
 #define CPPHTTPLIB_TCP_NODELAY true
 #include <cpp-httplib/httplib.h>