server : headers cleanup

move enum stop_type to server-task
rm redundant includes
2026-07-03 02:57:42 +02:00 · 2025-11-24 12:50:50 +02:00 · 2025-11-18 17:28:21 +01:00 · 2025-11-18 15:01:17 +01:00 · 2025-11-18 14:24:46 +01:00 · 2025-11-18 14:15:14 +01:00
65 changed files with 3599 additions and 7354 deletions
@@ -1390,10 +1390,14 @@ jobs:
    strategy:
      matrix:
        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
+        cann:
+          - '8.3.rc1.alpha001-910b-openeuler22.03-py3.11'
+        chip_type:
+          - '910b'
+        build:
+          - 'Release'
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
+    container: ascendai/cann:${{ matrix.cann }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -698,9 +698,10 @@ jobs:
      matrix:
        arch: [x86, aarch64]
        chip_type: ['910b', '310p']
-        build: ['Release']
+        build:
+          - 'Release'
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
+    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.3.rc1.alpha001-310p-openeuler22.03-py3.11' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -736,7 +737,7 @@ jobs:
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
+          name: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -20,40 +20,52 @@
 *.so
 *.swp
 *.tmp
-*.DS_Store

 # IDE / OS

-/.cache/
-/.ccls-cache/
-/.direnv/
-/.envrc
-/.idea/
-/.swiftpm
-/.vs/
-/.vscode/
-/nppBackup
+.cache/
+.ccls-cache/
+.direnv/
+.DS_Store
+.envrc
+.idea/
+.swiftpm
+.vs/
+.vscode/
+nppBackup


 # Coverage

-/gcovr-report/
-/lcov-report/
+gcovr-report/
+lcov-report/

 # Build Artifacts

-/tags
-/.build/
-/build*
-/release
-/debug
+tags
+.build/
+build*
+release
+debug
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
+!build.zig
+!docs/build.md
 /libllama.so
 /llama-*
 /vulkan-shaders-gen
+android-ndk-*
+arm_neon.h
+cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
 /rpc-server
-/out/
-/tmp/
-/autogen-*.md
+out/
+tmp/
+autogen-*.md

 # Deprecated

@@ -62,38 +74,44 @@

 # CI

-!/.github/workflows/*.yml
+!.github/workflows/*.yml

 # Models

-/models/*
-/models-mnt
-!/models/.editorconfig
-!/models/ggml-vocab-*.gguf*
-!/models/templates
+models/*
+models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*
+!models/templates

 # Zig
-/zig-out/
-/zig-cache/
+zig-out/
+zig-cache/
+
+# Logs
+
+ppl-*.txt
+qnt-*.txt
+perf-*.txt

 # Examples

-/examples/jeopardy/results.txt
-/tools/server/*.css.hpp
-/tools/server/*.html.hpp
-/tools/server/*.js.hpp
-/tools/server/*.mjs.hpp
-/tools/server/*.gz.hpp
-!/build_64.sh
-!/examples/*.bat
-!/examples/*/*.kts
-!/examples/*/*/*.kts
-!/examples/sycl/*.bat
-!/examples/sycl/*.sh
+examples/jeopardy/results.txt
+tools/server/*.css.hpp
+tools/server/*.html.hpp
+tools/server/*.js.hpp
+tools/server/*.mjs.hpp
+tools/server/*.gz.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh

 # Server Web UI temporary files
-/tools/server/webui/node_modules
-/tools/server/webui/dist
+node_modules
+tools/server/webui/dist

 # Python

@@ -129,8 +147,8 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
-/.ccache/
+.ccache/

 # IDE
-/*.code-workspace
-/.windsurf/
+*.code-workspace
+.windsurf/
@@ -50,8 +50,6 @@ add_library(${TARGET} STATIC
    base64.hpp
    chat-parser.cpp
    chat-parser.h
-    chat-parser-xml-toolcall.h
-    chat-parser-xml-toolcall.cpp
    chat.cpp
    chat.h
    common.cpp
@@ -1,861 +0,0 @@
-#include "chat.h"
-#include "chat-parser.h"
-#include "common.h"
-#include "json-partial.h"
-#include "json-schema-to-grammar.h"
-#include "log.h"
-#include "regex-partial.h"
-
-using json = nlohmann::ordered_json;
-
-class xml_toolcall_syntax_exception : public std::runtime_error {
-  public:
-    xml_toolcall_syntax_exception(const std::string & message) : std::runtime_error(message) {}
-};
-
-template<typename T>
-inline void sort_uniq(std::vector<T> &vec) {
-    std::sort(vec.begin(), vec.end());
-    vec.erase(std::unique(vec.begin(), vec.end()), vec.end());
-}
-
-template<typename T>
-inline bool all_space(const T &str) {
-    return std::all_of(str.begin(), str.end(), [](unsigned char ch) { return std::isspace(ch); });
-}
-
-static size_t utf8_truncate_safe(const std::string_view s) {
-    size_t len = s.size();
-    if (len == 0) return 0;
-    size_t i = len;
-    for (size_t back = 0; back < 4 && i > 0; ++back) {
-        --i;
-        unsigned char c = s[i];
-        if ((c & 0x80) == 0) {
-            return len;
-        } else if ((c & 0xC0) == 0xC0) {
-            size_t expected_len = 0;
-            if ((c & 0xE0) == 0xC0) expected_len = 2;
-            else if ((c & 0xF0) == 0xE0) expected_len = 3;
-            else if ((c & 0xF8) == 0xF0) expected_len = 4;
-            else return i;
-            if (len - i >= expected_len) {
-                return len;
-            } else {
-                return i;
-            }
-        }
-    }
-    return len - std::min(len, size_t(3));
-}
-
-inline void utf8_truncate_safe_resize(std::string &s) {
-    s.resize(utf8_truncate_safe(s));
-}
-
-inline std::string_view utf8_truncate_safe_view(const std::string_view s) {
-    return s.substr(0, utf8_truncate_safe(s));
-}
-
-static std::optional<common_chat_msg_parser::find_regex_result> try_find_2_literal_splited_by_spaces(common_chat_msg_parser & builder, const std::string & literal1, const std::string & literal2) {
-    if (literal1.size() == 0) return builder.try_find_literal(literal2);
-    const auto saved_pos = builder.pos();
-    while (auto res = builder.try_find_literal(literal1)) {
-        builder.consume_spaces();
-        const auto match_len = std::min(literal2.size(), builder.input().size() - builder.pos());
-        if (builder.input().compare(builder.pos(), match_len, literal2, 0, match_len) == 0) {
-            if (res->prelude.size() != res->groups[0].begin - saved_pos) {
-                res->prelude = builder.str({saved_pos, res->groups[0].begin});
-            }
-            builder.move_to(builder.pos() + match_len);
-            res->groups[0].end = builder.pos();
-            GGML_ASSERT(res->groups[0].begin != res->groups[0].end);
-            return res;
-        }
-        builder.move_to(res->groups[0].begin + 1);
-    }
-    builder.move_to(saved_pos);
-    return std::nullopt;
-}
-
-/**
- * make a GBNF that accept any strings except those containing any of the forbidden strings.
- */
-std::string make_gbnf_excluding(std::vector<std::string> forbids) {
-    constexpr auto charclass_escape = [](unsigned char c) -> std::string {
-        if (c == '\\' || c == ']' || c == '^' || c == '-') {
-            std::string s = "\\";
-            s.push_back((char)c);
-            return s;
-        }
-        if (isprint(c)) {
-            return std::string(1, (char)c);
-        }
-        char buf[16];
-        snprintf(buf, 15, "\\x%02X", c);
-        return std::string(buf);
-    };
-    constexpr auto build_expr = [charclass_escape](auto self, const std::vector<std::string>& forbids, int l, int r, int depth) -> std::string {
-        std::vector<std::pair<unsigned char, std::pair<int,int>>> children;
-        int i = l;
-        while (i < r) {
-            const std::string &s = forbids[i];
-            if ((int)s.size() == depth) {
-                ++i;
-                continue;
-            }
-            unsigned char c = (unsigned char)s[depth];
-            int j = i;
-            while (j < r && (int)forbids[j].size() > depth &&
-                   (unsigned char)forbids[j][depth] == c) {
-                ++j;
-            }
-            children.push_back({c, {i, j}});
-            i = j;
-        }
-        std::vector<std::string> alts;
-        if (!children.empty()) {
-            std::string cls;
-            for (auto &ch : children) cls += charclass_escape(ch.first);
-            alts.push_back(std::string("[^") + cls + "]");
-        }
-        for (auto &ch : children) {
-            std::string childExpr = self(self, forbids, ch.second.first, ch.second.second, depth+1);
-            if (!childExpr.empty()) {
-                std::string quoted_ch = "\"";
-                if (ch.first == '\\') quoted_ch += "\\\\";
-                else if (ch.first == '"') quoted_ch += "\\\"";
-                else if (isprint(ch.first)) quoted_ch.push_back(ch.first);
-                else {
-                    char buf[16];
-                    snprintf(buf, 15, "\\x%02X", ch.first);
-                    quoted_ch += buf;
-                }
-                quoted_ch += "\"";
-                std::string branch = quoted_ch + std::string(" ") + childExpr;
-                alts.push_back(branch);
-            }
-        }
-        if (alts.empty()) return "";
-        std::ostringstream oss;
-        oss << "( ";
-        for (size_t k = 0; k < alts.size(); ++k) {
-            if (k) oss << " | ";
-            oss << alts[k];
-        }
-        oss << " )";
-        return oss.str();
-    };
-    if (forbids.empty()) return "( . )*";
-    sort(forbids.begin(), forbids.end());
-    std::string expr = build_expr(build_expr, forbids, 0, forbids.size(), 0);
-    if (expr.empty()) {
-        std::string cls;
-        for (auto &s : forbids) if (!s.empty()) cls += charclass_escape((unsigned char)s[0]);
-        expr = std::string("( [^") + cls + "] )";
-    }
-    if (forbids.size() == 1)
-        return expr + "*";
-    else
-        return std::string("( ") + expr + " )*";
-}
-
-/**
- * Build grammar for xml-style tool call
- * form.scope_start and form.scope_end can be empty.
- * Requires data.format for model-specific hacks.
- */
-void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, const struct xml_tool_call_format & form) {
-    GGML_ASSERT(!form.tool_start.empty());
-    GGML_ASSERT(!form.tool_sep.empty());
-    GGML_ASSERT(!form.key_start.empty());
-    GGML_ASSERT(!form.val_end.empty());
-    GGML_ASSERT(!form.tool_end.empty());
-
-    std::string key_val_sep = form.key_val_sep;
-    if (form.key_val_sep2) {
-        key_val_sep += "\n";
-        key_val_sep += *form.key_val_sep2;
-    }
-    GGML_ASSERT(!key_val_sep.empty());
-
-    if (tools.is_array() && !tools.empty()) {
-        data.grammar = build_grammar([&](const common_grammar_builder &builder) {
-            auto string_arg_val = form.last_val_end ?
-                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end, *form.last_val_end})) :
-                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end}));
-
-            std::vector<std::string> tool_rules;
-            for (const auto & tool : tools) {
-                if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
-                    LOG_WRN("Skipping tool without function: %s", tool.dump(2).c_str());
-                    continue;
-                }
-                const auto & function = tool.at("function");
-                if (!function.contains("name") || !function.at("name").is_string()) {
-                    LOG_WRN("Skipping invalid function (invalid name): %s", function.dump(2).c_str());
-                    continue;
-                }
-                if (!function.contains("parameters") || !function.at("parameters").is_object()) {
-                    LOG_WRN("Skipping invalid function (invalid parameters): %s", function.dump(2).c_str());
-                    continue;
-                }
-                std::string name = function.at("name");
-                auto parameters = function.at("parameters");
-                builder.resolve_refs(parameters);
-
-                struct parameter_rule {
-                    std::string symbol_name;
-                    bool is_required;
-                };
-                std::vector<parameter_rule> arg_rules;
-                if (!parameters.contains("properties") || !parameters.at("properties").is_object()) {
-                    LOG_WRN("Skipping invalid function (invalid properties): %s", function.dump(2).c_str());
-                    continue;
-                } else {
-                    std::vector<std::string> requiredParameters;
-                    if (parameters.contains("required")) {
-                        try { parameters.at("required").get_to(requiredParameters); }
-                        catch (const std::runtime_error&) {
-                            LOG_WRN("Invalid function required parameters, ignoring: %s", function.at("required").dump(2).c_str());
-                        }
-                    }
-                    sort_uniq(requiredParameters);
-                    for (const auto & [key, value] : parameters.at("properties").items()) {
-                        std::string quoted_key = key;
-                        bool required = std::binary_search(requiredParameters.begin(), requiredParameters.end(), key);
-                        if (form.key_start.back() == '"' && key_val_sep[0] == '"') {
-                            quoted_key = gbnf_format_literal(key);
-                            quoted_key = quoted_key.substr(1, quoted_key.size() - 2);
-                        }
-                        arg_rules.push_back(parameter_rule {builder.add_rule("func-" + name + "-kv-" + key,
-                            gbnf_format_literal(form.key_start) + " " +
-                            gbnf_format_literal(quoted_key) + " " +
-                            gbnf_format_literal(key_val_sep) + " " +
-                            ((value.contains("type") && value["type"].is_string() && value["type"] == "string" && (!form.raw_argval || *form.raw_argval)) ?
-                                    (form.raw_argval ?
-                                            string_arg_val :
-                                            "( " + string_arg_val + " | " + builder.add_schema(name + "-arg-" + key, value) + " )"
-                                    ) :
-                                    builder.add_schema(name + "-arg-" + key, value)
-                            )
-                        ), required});
-                    }
-                }
-
-                auto next_arg_with_sep = builder.add_rule(name + "-last-arg-end", form.last_val_end ? gbnf_format_literal(*form.last_val_end) : gbnf_format_literal(form.val_end));
-                decltype(next_arg_with_sep) next_arg = "\"\"";
-                for (auto i = arg_rules.size() - 1; /* i >= 0 && */ i < arg_rules.size(); --i) {
-                    std::string include_this_arg = arg_rules[i].symbol_name + " " + next_arg_with_sep;
-                    next_arg = builder.add_rule(name + "-arg-after-" + std::to_string(i), arg_rules[i].is_required ?
-                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg
-                    );
-                    include_this_arg = gbnf_format_literal(form.val_end) + " " + include_this_arg;
-                    next_arg_with_sep = builder.add_rule(name + "-arg-after-" + std::to_string(i) + "-with-sep", arg_rules[i].is_required ?
-                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg_with_sep
-                    );
-                }
-
-                std::string quoted_name = name;
-                if (form.tool_start.back() == '"' && form.tool_sep[0] == '"') {
-                    quoted_name = gbnf_format_literal(name);
-                    quoted_name = quoted_name.substr(1, quoted_name.size() - 2);
-                }
-                quoted_name = gbnf_format_literal(quoted_name);
-                // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
-                if (data.format == COMMON_CHAT_FORMAT_KIMI_K2) {
-                    quoted_name = "\"functions.\" " + quoted_name + " \":\" [0-9]+";
-                }
-                tool_rules.push_back(builder.add_rule(name + "-call",
-                        gbnf_format_literal(form.tool_start) + " " +
-                        quoted_name + " " +
-                        gbnf_format_literal(form.tool_sep) + " " +
-                        next_arg
-                ));
-            }
-
-            auto tool_call_once = builder.add_rule("root-tool-call-once", string_join(tool_rules, " | "));
-            auto tool_call_more = builder.add_rule("root-tool-call-more", gbnf_format_literal(form.tool_end) + " " + tool_call_once);
-            auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end));
-            auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end);
-            builder.add_rule("root",
-                (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") +
-                tool_call_multiple_with_end  + "?" +
-                (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end))
-            );
-        });
-
-        // grammar trigger for tool call
-        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, form.scope_start + form.tool_start });
-    }
-}
-
-/**
- * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
- * Throws xml_toolcall_syntax_exception if there is invalid syntax and cannot recover the original status for common_chat_msg_parser.
- * form.scope_start, form.tool_sep and form.scope_end can be empty.
- */
-inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form) {
-    GGML_ASSERT(!form.tool_start.empty());
-    GGML_ASSERT(!form.key_start.empty());
-    GGML_ASSERT(!form.key_val_sep.empty());
-    GGML_ASSERT(!form.val_end.empty());
-    GGML_ASSERT(!form.tool_end.empty());
-
-    // Helper to choose return false or throw error
-    constexpr auto return_error = [](common_chat_msg_parser & builder, auto &start_pos, const bool &recovery) {
-        LOG_DBG("Failed to parse XML-Style tool call at position: %s\n", gbnf_format_literal(builder.consume_rest().substr(0, 20)).c_str());
-        if (recovery) {
-            builder.move_to(start_pos);
-            return false;
-        } else throw xml_toolcall_syntax_exception("Tool call parsing failed with unrecoverable errors. Try using a grammar to constrain the model’s output.");
-    };
-    // Drop substring from needle to end from a JSON
-    constexpr auto partial_json = [](std::string &json_str, std::string_view needle = "XML_TOOL_CALL_PARTIAL_FLAG") {
-        auto pos = json_str.rfind(needle);
-        if (pos == std::string::npos) {
-            return false;
-        }
-        for (auto i = pos + needle.size(); i < json_str.size(); ++i) {
-            unsigned char ch = static_cast<unsigned char>(json_str[i]);
-            if (ch != '\'' && ch != '"' && ch != '}' && ch != ':' && !std::isspace(ch)) {
-                return false;
-            }
-        }
-        if (pos != 0 && json_str[pos - 1] == '"') {
-            --pos;
-        }
-        json_str.resize(pos);
-        return true;
-    };
-    // Helper to generate a partial argument JSON
-    constexpr auto gen_partial_json = [partial_json](auto set_partial_arg, auto &arguments, auto &builder, auto &function_name) {
-        auto rest = builder.consume_rest();
-        utf8_truncate_safe_resize(rest);
-        set_partial_arg(rest, "XML_TOOL_CALL_PARTIAL_FLAG");
-        auto tool_str = arguments.dump();
-        if (partial_json(tool_str)) {
-            if (builder.add_tool_call(function_name, "", tool_str)) {
-                return;
-            }
-        }
-        LOG_DBG("Failed to parse partial XML-Style tool call, fallback to non-partial: %s\n", tool_str.c_str());
-    };
-    // Helper to find a close (because there may be form.last_val_end or form.last_tool_end)
-    constexpr auto try_find_close = [](
-            common_chat_msg_parser & builder,
-            const std::string & end,
-            const std::optional<std::string> & alt_end,
-            const std::string & end_next,
-            const std::optional<std::string> & alt_end_next
-    ) {
-        auto saved_pos = builder.pos();
-        auto tc = builder.try_find_literal(end);
-        auto val_end_size = end.size();
-        if (alt_end) {
-            auto pos_1 = builder.pos();
-            builder.move_to(saved_pos);
-            auto tc2 = try_find_2_literal_splited_by_spaces(builder, *alt_end, end_next);
-            if (alt_end_next) {
-                builder.move_to(saved_pos);
-                auto tc3 = try_find_2_literal_splited_by_spaces(builder, *alt_end, *alt_end_next);
-                if (tc3 && (!tc2 || tc2->prelude.size() > tc3->prelude.size())) {
-                    tc2 = tc3;
-                }
-            }
-            if (tc2 && (!tc || tc->prelude.size() > tc2->prelude.size())) {
-                tc = tc2;
-                tc->groups[0].end = std::min(builder.input().size(), tc->groups[0].begin + alt_end->size());
-                builder.move_to(tc->groups[0].end);
-                val_end_size = alt_end->size();
-            } else {
-                builder.move_to(pos_1);
-            }
-        }
-        return std::make_pair(val_end_size, tc);
-    };
-    // Helper to find a val_end or last_val_end, returns matched pattern size
-    const auto try_find_val_end = [try_find_close, &builder, &form]() {
-        return try_find_close(builder, form.val_end, form.last_val_end, form.tool_end, form.last_tool_end);
-    };
-    // Helper to find a tool_end or last_tool_end, returns matched pattern size
-    const auto try_find_tool_end = [try_find_close, &builder, &form]() {
-        return try_find_close(builder, form.tool_end, form.last_tool_end, form.scope_end, std::nullopt);
-    };
-
-    bool recovery = true;
-    const auto start_pos = builder.pos();
-    if (!all_space(form.scope_start)) {
-        if (auto tc = builder.try_find_literal(form.scope_start)) {
-            if (all_space(tc->prelude)) {
-                if (form.scope_start.size() != tc->groups[0].end - tc->groups[0].begin)
-                    throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.scope_start));
-            } else {
-                builder.move_to(start_pos);
-                return false;
-            }
-        } else return false;
-    }
-    while (auto tc = builder.try_find_literal(form.tool_start)) {
-        if (!all_space(tc->prelude)) {
-            LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
-                    gbnf_format_literal(form.tool_start).c_str(),
-                    gbnf_format_literal(tc->prelude).c_str()
-            );
-            builder.move_to(tc->groups[0].begin - tc->prelude.size());
-            break;
-        }
-
-        // Find tool name
-        auto func_name = builder.try_find_literal(all_space(form.tool_sep) ? form.key_start : form.tool_sep);
-        if (!func_name) {
-            auto [sz, tc] = try_find_tool_end();
-            func_name = tc;
-        }
-        if (!func_name) {
-            // Partial tool name not supported
-            throw common_chat_msg_partial_exception("incomplete tool_call");
-        }
-        // If the model generate multiple tool call and the first tool call has no argument
-        if (func_name->prelude.find(form.tool_end) != std::string::npos || (form.last_tool_end ? func_name->prelude.find(*form.last_tool_end) != std::string::npos : false)) {
-            builder.move_to(func_name->groups[0].begin - func_name->prelude.size());
-            auto [sz, tc] = try_find_tool_end();
-            func_name = tc;
-        }
-
-        // Parse tool name
-        builder.move_to(all_space(form.tool_sep) ? func_name->groups[0].begin : func_name->groups[0].end);
-        std::string function_name = string_strip(func_name->prelude);
-        // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
-        if (builder.syntax().format == COMMON_CHAT_FORMAT_KIMI_K2) {
-            if (string_starts_with(function_name, "functions.")) {
-                static const std::regex re(":\\d+$");
-                if (std::regex_search(function_name, re)) {
-                    function_name = function_name.substr(10, function_name.rfind(":") - 10);
-                }
-            }
-        }
-
-        // Argument JSON
-        json arguments = json::object();
-
-        // Helper to generate a partial argument JSON
-        const auto gen_partial_args = [&](auto set_partial_arg) {
-            gen_partial_json(set_partial_arg, arguments, builder, function_name);
-        };
-
-        // Parse all arg_key/arg_value pairs
-        while (auto tc = builder.try_find_literal(form.key_start)) {
-            if (!all_space(tc->prelude)) {
-                LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
-                        gbnf_format_literal(form.key_start).c_str(),
-                        gbnf_format_literal(tc->prelude).c_str()
-                );
-                builder.move_to(tc->groups[0].begin - tc->prelude.size());
-                break;
-            }
-            if (tc->groups[0].end - tc->groups[0].begin != form.key_start.size()) {
-                auto tool_call_arg = arguments.dump();
-                if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
-                    tool_call_arg.resize(tool_call_arg.size() - 1);
-                }
-                builder.add_tool_call(function_name, "", tool_call_arg);
-                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_start));
-            }
-
-            // Parse arg_key
-            auto key_res = builder.try_find_literal(form.key_val_sep);
-            if (!key_res) {
-                gen_partial_args([&](auto &rest, auto &needle) {arguments[rest + needle] = "";});
-                throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.key_val_sep) + " after " + gbnf_format_literal(form.key_start));
-            }
-            if (key_res->groups[0].end - key_res->groups[0].begin != form.key_val_sep.size()) {
-                gen_partial_args([&](auto &, auto &needle) {arguments[key_res->prelude + needle] = "";});
-                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_val_sep));
-            }
-            auto &key = key_res->prelude;
-            recovery = false;
-
-            // Parse arg_value
-            if (form.key_val_sep2) {
-                if (auto tc = builder.try_find_literal(*form.key_val_sep2)) {
-                    if (!all_space(tc->prelude)) {
-                        LOG_DBG("Failed to parse XML-Style tool call: Unexcepted %s between %s and %s\n",
-                                gbnf_format_literal(tc->prelude).c_str(),
-                                gbnf_format_literal(form.key_val_sep).c_str(),
-                                gbnf_format_literal(*form.key_val_sep2).c_str()
-                        );
-                        return return_error(builder, start_pos, false);
-                    }
-                    if (tc->groups[0].end - tc->groups[0].begin != form.key_val_sep2->size()) {
-                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
-                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(*form.key_val_sep2));
-                    }
-                } else {
-                    gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
-                    throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(*form.key_val_sep2) + " after " + gbnf_format_literal(form.key_val_sep));
-                }
-            }
-            auto val_start = builder.pos();
-
-            // Test if arg_val is a partial JSON
-            std::optional<common_json> value_json = std::nullopt;
-            if (!form.raw_argval || !*form.raw_argval) {
-                try { value_json = builder.try_consume_json(); }
-                catch (const std::runtime_error&) { builder.move_to(val_start); }
-                // TODO: Delete this when json_partial adds top-level support for null/true/false
-                if (builder.pos() == val_start) {
-                    const static std::regex number_regex(R"([0-9-][0-9]*(\.\d*)?([eE][+-]?\d*)?)");
-                    builder.consume_spaces();
-                    std::string_view sv = utf8_truncate_safe_view(builder.input());
-                    sv.remove_prefix(builder.pos());
-                    std::string rest = "a";
-                    if (sv.size() < 6) rest = sv;
-                    if (string_starts_with("null", rest) || string_starts_with("true", rest) || string_starts_with("false", rest) || std::regex_match(sv.begin(), sv.end(), number_regex)) {
-                        value_json = {123, {"123", "123"}};
-                        builder.consume_rest();
-                    } else {
-                        builder.move_to(val_start);
-                    }
-                }
-            }
-
-            // If it is a JSON and followed by </arg_value>, parse as json
-            // cannot support streaming because it may be a plain text starting with JSON
-            if (value_json) {
-                auto json_end = builder.pos();
-                builder.consume_spaces();
-                if (builder.pos() == builder.input().size()) {
-                    if (form.raw_argval && !*form.raw_argval && (value_json->json.is_string() || value_json->json.is_object() || value_json->json.is_array())) {
-                        arguments[key] = value_json->json;
-                        auto json_str = arguments.dump();
-                        if (!value_json->healing_marker.json_dump_marker.empty()) {
-                            GGML_ASSERT(std::string::npos != json_str.rfind(value_json->healing_marker.json_dump_marker));
-                            json_str.resize(json_str.rfind(value_json->healing_marker.json_dump_marker));
-                        } else {
-                            GGML_ASSERT(json_str.back() == '}');
-                            json_str.resize(json_str.size() - 1);
-                        }
-                        builder.add_tool_call(function_name, "", json_str);
-                    } else {
-                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
-                    }
-                    LOG_DBG("Possible JSON arg_value: %s\n", value_json->json.dump().c_str());
-                    throw common_chat_msg_partial_exception("JSON arg_value detected. Waiting for more tokens for validations.");
-                }
-                builder.move_to(json_end);
-                auto [val_end_size, tc] = try_find_val_end();
-                if (tc && all_space(tc->prelude) && value_json->healing_marker.marker.empty()) {
-                    if (tc->groups[0].end - tc->groups[0].begin != val_end_size) {
-                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
-                        LOG_DBG("Possible terminated JSON arg_value: %s\n", value_json->json.dump().c_str());
-                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.val_end) + (form.last_val_end ? gbnf_format_literal(*form.last_val_end) : ""));
-                    } else arguments[key] = value_json->json;
-                } else builder.move_to(val_start);
-            }
-
-            // If not, parse as plain text
-            if (val_start == builder.pos()) {
-                if (auto [val_end_size, value_plain] = try_find_val_end(); value_plain) {
-                    auto &value_str = value_plain->prelude;
-                    if (form.trim_raw_argval) value_str = string_strip(value_str);
-                    if (value_plain->groups[0].end - value_plain->groups[0].begin != val_end_size) {
-                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = value_str + needle;});
-                        throw common_chat_msg_partial_exception(
-                                "Expected " + gbnf_format_literal(form.val_end) +
-                                " after " + gbnf_format_literal(form.key_val_sep) +
-                                (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
-                        );
-                    }
-                    arguments[key] = value_str;
-                } else {
-                    if (form.trim_raw_argval) {
-                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = string_strip(rest) + needle;});
-                    } else {
-                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = rest + needle;});
-                    }
-                    throw common_chat_msg_partial_exception(
-                            "Expected " + gbnf_format_literal(form.val_end) +
-                            " after " + gbnf_format_literal(form.key_val_sep) +
-                            (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
-                    );
-                }
-            }
-        }
-
-        // Consume closing tag
-        if (auto [tool_end_size, tc] = try_find_tool_end(); tc) {
-            if (!all_space(tc->prelude)) {
-                LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
-                        gbnf_format_literal(form.tool_end).c_str(),
-                        gbnf_format_literal(tc->prelude).c_str()
-                );
-                return return_error(builder, start_pos, recovery);
-            }
-            if (tc->groups[0].end - tc->groups[0].begin == tool_end_size) {
-                // Add the parsed tool call
-                if (!builder.add_tool_call(function_name, "", arguments.dump())) {
-                    throw common_chat_msg_partial_exception("Failed to add XML-Style tool call");
-                }
-                recovery = false;
-                continue;
-            }
-        }
-
-        auto tool_call_arg = arguments.dump();
-        if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
-            tool_call_arg.resize(tool_call_arg.size() - 1);
-        }
-        builder.add_tool_call(function_name, "", tool_call_arg);
-        throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.tool_end) + " after " + gbnf_format_literal(form.val_end));
-    }
-    if (auto tc = builder.try_find_literal(form.scope_end)) {
-        if (!all_space(tc->prelude)) {
-            LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
-                    gbnf_format_literal(form.scope_end).c_str(),
-                    gbnf_format_literal(tc->prelude).c_str()
-            );
-            return return_error(builder, start_pos, recovery);
-        }
-    } else {
-        if (all_space(form.scope_end)) return true;
-        builder.consume_spaces();
-        if (builder.pos() == builder.input().size())
-            throw common_chat_msg_partial_exception("incomplete tool calls");
-        LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
-                gbnf_format_literal(form.scope_end).c_str(),
-                gbnf_format_literal(builder.consume_rest()).c_str()
-        );
-        return return_error(builder, start_pos, recovery);
-    }
-
-    return true;
-}
-
-/**
- * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
- * May cause std::runtime_error if there is invalid syntax because partial valid tool call is already sent out to client.
- * form.scope_start, form.tool_sep and form.scope_end can be empty.
- */
-bool common_chat_msg_parser::try_consume_xml_tool_calls(const struct xml_tool_call_format & form) {
-    auto pos = pos_;
-    auto tsize = result_.tool_calls.size();
-    try { return parse_xml_tool_calls(*this, form); }
-    catch (const xml_toolcall_syntax_exception&) {}
-    move_to(pos);
-    result_.tool_calls.resize(tsize);
-    return false;
-}
-
-/**
- * Parse content uses reasoning and XML-Style tool call
- * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
- */
-inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>") {
-    constexpr auto rstrip = [](std::string &s) {
-        s.resize(std::distance(s.begin(), std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base()));
-    };
-    // Erase substring from l to r, along with additional spaces nearby
-    constexpr auto erase_spaces = [](auto &str, size_t l, size_t r) {
-        while (/* l > -1 && */ --l < str.size() && std::isspace(static_cast<unsigned char>(str[l])));
-        ++l;
-        while (++r < str.size() && std::isspace(static_cast<unsigned char>(str[r])));
-        if (l < r) str[l] = '\n';
-        if (l + 1 < r) str[l + 1] = '\n';
-        if (l != 0) l += 2;
-        str.erase(l, r - l);
-        return l;
-    };
-    constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) {
-        auto best_match = content.size();
-        for (auto pattern: list) {
-            if (pattern.size() == 0) continue;
-            for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
-                auto match_len = content.size() - match_idx;
-                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
-                    best_match = match_idx;
-                }
-            }
-        }
-        if (content.size() > best_match) {
-            content.erase(best_match);
-        }
-    };
-    const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) {
-        return trim_suffix(content, {
-            start_think, end_think, form.scope_start, form.tool_start, form.tool_sep, form.key_start,
-            form.key_val_sep, form.key_val_sep2 ? form.key_val_sep2->c_str() : "",
-            form.val_end, form.last_val_end ? form.last_val_end->c_str() : "",
-            form.tool_end, form.last_tool_end ? form.last_tool_end->c_str() : "",
-            form.scope_end
-        });
-    };
-
-
-    // Trim leading spaces without affecting keyword matching
-    static const common_regex spaces_regex("\\s*");
-    {
-        auto tc = builder.consume_regex(spaces_regex);
-        auto spaces = builder.str(tc.groups[0]);
-        auto s1 = spaces.size();
-        trim_potential_partial_word(spaces);
-        auto s2 = spaces.size();
-        builder.move_to(builder.pos() - (s1 - s2));
-    }
-
-    // Parse content
-    bool reasoning_unclosed = builder.syntax().thinking_forced_open;
-    std::string unclosed_reasoning_content("");
-    for (;;) {
-        auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
-        std::string content;
-        std::string tool_call_start;
-
-        if (tc) {
-            content = std::move(tc->prelude);
-            tool_call_start = builder.str(tc->groups[0]);
-            LOG_DBG("Matched tool start: %s\n", gbnf_format_literal(tool_call_start).c_str());
-        } else {
-            content = builder.consume_rest();
-            utf8_truncate_safe_resize(content);
-        }
-
-        // Handle unclosed think block
-        if (reasoning_unclosed) {
-            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
-                unclosed_reasoning_content += content;
-                if (form.allow_toolcall_in_think) {
-                    builder.move_to(tc->groups[0].begin);
-                    if (!builder.try_consume_xml_tool_calls(form)) {
-                        unclosed_reasoning_content += tool_call_start;
-                        builder.move_to(tc->groups[0].end);
-                    }
-                } else {
-                    unclosed_reasoning_content += tool_call_start;
-                }
-                continue;
-            } else {
-                reasoning_unclosed = false;
-                std::string reasoning_content;
-                if (pos == std::string::npos) {
-                    reasoning_content = std::move(content);
-                } else {
-                    reasoning_content = content.substr(0, pos);
-                    content.erase(0, pos + end_think.size());
-                }
-                if (builder.pos() == builder.input().size() && all_space(content)) {
-                    rstrip(reasoning_content);
-                    trim_potential_partial_word(reasoning_content);
-                    rstrip(reasoning_content);
-                    if (reasoning_content.empty()) {
-                        rstrip(unclosed_reasoning_content);
-                        trim_potential_partial_word(unclosed_reasoning_content);
-                        rstrip(unclosed_reasoning_content);
-                        if (unclosed_reasoning_content.empty()) continue;
-                    }
-                }
-                if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
-                    builder.add_content(start_think);
-                    builder.add_content(unclosed_reasoning_content);
-                    builder.add_content(reasoning_content);
-                    if (builder.pos() != builder.input().size() || !all_space(content))
-                        builder.add_content(end_think);
-                } else {
-                    builder.add_reasoning_content(unclosed_reasoning_content);
-                    builder.add_reasoning_content(reasoning_content);
-                }
-                unclosed_reasoning_content.clear();
-            }
-        }
-
-        // Handle multiple think block
-        bool toolcall_in_think = false;
-        for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
-            if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
-                if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
-                    auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
-                    builder.add_reasoning_content(reasoning_content);
-                    think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
-                } else {
-                    think_start = think_end + end_think.size() - 1;
-                }
-            } else {
-                // This <tool_call> start is in thinking block, skip this tool call
-                auto pos = think_start + start_think.size();
-                unclosed_reasoning_content = content.substr(pos) + tool_call_start;
-                reasoning_unclosed = true;
-                content.resize(think_start);
-                toolcall_in_think = true;
-            }
-        }
-
-        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
-            rstrip(content);
-            // Handle unclosed </think> token from content: delete all </think> token
-            if (auto pos = content.rfind(end_think); pos != std::string::npos) {
-                while (pos != std::string::npos) {
-                    pos = erase_spaces(content, pos, pos + end_think.size() - 1);
-                    pos = content.rfind(end_think, pos);
-                }
-            }
-            // Strip if needed
-            if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
-                content = string_strip(content);
-            }
-        }
-
-        // remove potential partial suffix
-        if (content.size() > 0 && builder.pos() == builder.input().size() && unclosed_reasoning_content.empty()) {
-            rstrip(content);
-            trim_potential_partial_word(content);
-            rstrip(content);
-        }
-
-        // Add content
-        if (content.size() != 0) {
-            // If there are multiple content blocks
-            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
-                builder.add_content("\n\n");
-            }
-            builder.add_content(content);
-        }
-
-        // This <tool_call> start is in thinking block, skip this tool call
-        if (toolcall_in_think && !form.allow_toolcall_in_think) {
-            continue;
-        }
-
-        // There is no tool call and all content is parsed
-        if (!tc) {
-            GGML_ASSERT(builder.pos() == builder.input().size());
-            GGML_ASSERT(unclosed_reasoning_content.empty());
-            GGML_ASSERT(!reasoning_unclosed);
-            break;
-        }
-
-        builder.move_to(tc->groups[0].begin);
-        if (builder.try_consume_xml_tool_calls(form)) {
-            auto end_of_tool = builder.pos();
-            builder.consume_spaces();
-            if (builder.pos() != builder.input().size()) {
-                builder.move_to(end_of_tool);
-                if (!builder.result().content.empty()) {
-                    builder.add_content("\n\n");
-                }
-            }
-        } else {
-            static const common_regex next_char_regex(".");
-            auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
-            rstrip(c);
-            builder.add_content(c);
-        }
-    }
-}
-
-/**
- * Parse content uses reasoning and XML-Style tool call
- * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
- */
-void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
-    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
-}
@@ -1,45 +0,0 @@
-#pragma once
-
-#include "chat.h"
-
-#include <nlohmann/json.hpp>
-
-#include <optional>
-#include <string>
-#include <vector>
-
-
-// Sample config:
-// MiniMax-M2 (left): <minimax:tool_call>\n<invoke name="tool-name">\n<parameter name="key">value</parameter>\n...</invoke>\n...</minimax:tool_call>
-// GLM 4.5   (right): <tool_call>function_name\n<arg_key>key</arg_key>\n<arg_value>value</arg_value>\n</tool_call>
-struct xml_tool_call_format {
-    std::string scope_start; // <minimax:tool_call>\n  // \n                      // can be empty
-    std::string tool_start;  // <invoke name=\"        // <tool_call>
-    std::string tool_sep;    // \">\n                  // \n                      // can be empty only for parse_xml_tool_calls
-    std::string key_start;   // <parameter name=\"     // <arg_key>
-    std::string key_val_sep; // \">                    // </arg_key>\n<arg_value>
-    std::string val_end;     // </parameter>\n         // </arg_value>\n
-    std::string tool_end;    // </invoke>\n            // </tool_call>\n
-    std::string scope_end;   // </minimax:tool_call>   //                         // can be empty
-    // Set this if there can be dynamic spaces inside key_val_sep.
-    // e.g. key_val_sep=</arg_key> key_val_sep2=<arg_value> for GLM4.5
-    std::optional<std::string> key_val_sep2 = std::nullopt;
-    // Set true if argval should only be raw string. e.g. Hello "world" hi
-    // Set false if argval should only be json string. e.g. "Hello \"world\" hi"
-    // Defaults to std::nullopt, both will be allowed.
-    std::optional<bool> raw_argval = std::nullopt;
-    std::optional<std::string> last_val_end = std::nullopt;
-    std::optional<std::string> last_tool_end = std::nullopt;
-    bool trim_raw_argval = false;
-    bool allow_toolcall_in_think = false; // TODO: UNTESTED!!!
-};
-
-// make a GBNF that accept any strings except those containing any of the forbidden strings.
-std::string make_gbnf_excluding(std::vector<std::string> forbids);
-
-/**
- * Build grammar for xml-style tool call
- * form.scope_start and form.scope_end can be empty.
- * Requires data.format for model-specific hacks.
- */
-void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
@@ -1,7 +1,6 @@
 #pragma once

 #include "chat.h"
-#include "chat-parser-xml-toolcall.h"
 #include "json-partial.h"
 #include "regex-partial.h"

@@ -120,14 +119,5 @@ class common_chat_msg_parser {
        const std::vector<std::vector<std::string>> & content_paths = {}
    );

-    /**
-     * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
-     * form.scope_start, form.tool_sep and form.scope_end can be empty.
-     */
-    bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);
-
-    // Parse content uses reasoning and XML-Style tool call
-    void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>");
-
    void clear_tools();
 };
@@ -643,12 +643,6 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
        case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
-        case COMMON_CHAT_FORMAT_MINIMAX_M2: return "MiniMax-M2";
-        case COMMON_CHAT_FORMAT_GLM_4_5: return "GLM 4.5";
-        case COMMON_CHAT_FORMAT_KIMI_K2: return "Kimi K2";
-        case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
-        case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
-        case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
        default:
            throw std::runtime_error("Unknown chat format");
    }
@@ -1813,278 +1807,6 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
    }
 }

-
-static common_chat_params common_chat_params_init_minimax_m2(const common_chat_template & tmpl, const struct templates_params & params) {
-    common_chat_params data;
-    data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    data.prompt = apply(tmpl, params);
-    data.format = COMMON_CHAT_FORMAT_MINIMAX_M2;
-
-    // Handle thinking tags based on prompt ending
-    if (string_ends_with(data.prompt, "<think>\n")) {
-        if (!params.enable_thinking) {
-            // Close the thinking tag immediately if thinking is disabled
-            data.prompt += "</think>\n\n";
-        } else {
-            // Mark thinking as forced open (template started with <think>)
-            data.thinking_forced_open = true;
-        }
-    }
-
-    // Preserve MiniMax-M2 special tokens
-    data.preserved_tokens = {
-        "<think>",
-        "</think>",
-        "<minimax:tool_call>",
-        "</minimax:tool_call>",
-    };
-
-    // build grammar for tool call
-    static const xml_tool_call_format form {
-        /* form.scope_start = */ "<minimax:tool_call>\n",
-        /* form.tool_start  = */ "<invoke name=\"",
-        /* form.tool_sep    = */ "\">\n",
-        /* form.key_start   = */ "<parameter name=\"",
-        /* form.key_val_sep = */ "\">",
-        /* form.val_end     = */ "</parameter>\n",
-        /* form.tool_end    = */ "</invoke>\n",
-        /* form.scope_end   = */ "</minimax:tool_call>",
-    };
-    build_grammar_xml_tool_call(data, params.tools, form);
-
-    return data;
-}
-
-static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
-    static const xml_tool_call_format form {
-        /* form.scope_start = */ "<minimax:tool_call>",
-        /* form.tool_start  = */ "<invoke name=\"",
-        /* form.tool_sep    = */ "\">",
-        /* form.key_start   = */ "<parameter name=\"",
-        /* form.key_val_sep = */ "\">",
-        /* form.val_end     = */ "</parameter>",
-        /* form.tool_end    = */ "</invoke>",
-        /* form.scope_end   = */ "</minimax:tool_call>",
-    };
-    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
-}
-
-static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) {
-    common_chat_params data;
-    data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    data.prompt = apply(tmpl, params);
-    data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML;
-
-    data.preserved_tokens = {
-        "<tool_call>",
-        "</tool_call>",
-        "<function=",
-        "</function>",
-        "<parameter=",
-        "</parameter>",
-    };
-
-    // build grammar for tool call
-    static const xml_tool_call_format form {
-        /* form.scope_start = */ "<tool_call>\n",
-        /* form.tool_start  = */ "<function=",
-        /* form.tool_sep    = */ ">\n",
-        /* form.key_start   = */ "<parameter=",
-        /* form.key_val_sep = */ ">\n",
-        /* form.val_end     = */ "\n</parameter>\n",
-        /* form.tool_end    = */ "</function>\n",
-        /* form.scope_end   = */ "</tool_call>",
-    };
-    build_grammar_xml_tool_call(data, params.tools, form);
-
-    return data;
-}
-
-static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) {
-    static const xml_tool_call_format form = ([]() {
-        xml_tool_call_format form {};
-        form.scope_start = "<tool_call>";
-        form.tool_start  = "<function=";
-        form.tool_sep    = ">";
-        form.key_start   = "<parameter=";
-        form.key_val_sep = ">";
-        form.val_end     = "</parameter>";
-        form.tool_end    = "</function>";
-        form.scope_end   = "</tool_call>";
-        form.trim_raw_argval = true;
-        return form;
-    })();
-    builder.consume_reasoning_with_xml_tool_calls(form);
-}
-
-static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
-    common_chat_params data;
-    data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    data.prompt = apply(tmpl, params);
-    data.format = COMMON_CHAT_FORMAT_KIMI_K2;
-
-    data.preserved_tokens = {
-        "<think>",
-        "</think>",
-        "<|tool_calls_section_begin|>",
-        "<|tool_call_begin|>",
-        "<|tool_call_argument_begin|>",
-        "<|tool_call_end|>",
-        "<|tool_calls_section_end|>",
-        "<|im_end|>",
-        "<|im_system|>",
-        "<|im_middle|>",
-    };
-
-    data.additional_stops.insert(data.additional_stops.end(), {
-        "<|im_end|>",
-        "<|im_middle|>"
-    });
-    // build grammar for tool call
-    static const xml_tool_call_format form = ([]() {
-        xml_tool_call_format form {};
-        form.scope_start = "<|tool_calls_section_begin|>";
-        form.tool_start  = "<|tool_call_begin|>";
-        form.tool_sep    = "<|tool_call_argument_begin|>{";
-        form.key_start   = "\"";
-        form.key_val_sep = "\": ";
-        form.val_end     = ", ";
-        form.tool_end    = "}<|tool_call_end|>";
-        form.scope_end   = "<|tool_calls_section_end|>";
-        form.raw_argval  = false;
-        form.last_val_end = "";
-        return form;
-    })();
-    build_grammar_xml_tool_call(data, params.tools, form);
-
-    return data;
-}
-
-static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
-    static const xml_tool_call_format form = ([]() {
-        xml_tool_call_format form {};
-        form.scope_start = "<|tool_calls_section_begin|>";
-        form.tool_start  = "<|tool_call_begin|>";
-        form.tool_sep    = "<|tool_call_argument_begin|>{";
-        form.key_start   = "\"";
-        form.key_val_sep = "\": ";
-        form.val_end     = ", ";
-        form.tool_end    = "}<|tool_call_end|>";
-        form.scope_end   = "<|tool_calls_section_end|>";
-        form.raw_argval  = false;
-        form.last_val_end = "";
-        return form;
-    })();
-    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
-}
-
-static common_chat_params common_chat_params_init_apriel_1_5(const common_chat_template & tmpl, const struct templates_params & params) {
-    common_chat_params data;
-    data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    data.prompt = apply(tmpl, params);
-    data.format = COMMON_CHAT_FORMAT_APRIEL_1_5;
-
-    data.preserved_tokens = {
-        "<thinking>",
-        "</thinking>",
-        "<tool_calls>",
-        "</tool_calls>",
-    };
-
-    // build grammar for tool call
-    static const xml_tool_call_format form = ([]() {
-        xml_tool_call_format form {};
-        form.scope_start = "<tool_calls>[";
-        form.tool_start  = "{\"name\": \"";
-        form.tool_sep    = "\", \"arguments\": {";
-        form.key_start   = "\"";
-        form.key_val_sep = "\": ";
-        form.val_end     = ", ";
-        form.tool_end    = "}, ";
-        form.scope_end   = "]</tool_calls>";
-        form.raw_argval  = false;
-        form.last_val_end = "";
-        form.last_tool_end = "}";
-        return form;
-    })();
-    build_grammar_xml_tool_call(data, params.tools, form);
-
-    return data;
-}
-
-static void common_chat_parse_apriel_1_5(common_chat_msg_parser & builder) {
-    static const xml_tool_call_format form = ([]() {
-        xml_tool_call_format form {};
-        form.scope_start = "<tool_calls>[";
-        form.tool_start  = "{\"name\": \"";
-        form.tool_sep    = "\", \"arguments\": {";
-        form.key_start   = "\"";
-        form.key_val_sep = "\": ";
-        form.val_end     = ", ";
-        form.tool_end    = "}, ";
-        form.scope_end   = "]</tool_calls>";
-        form.raw_argval  = false;
-        form.last_val_end = "";
-        form.last_tool_end = "}";
-        return form;
-    })();
-    builder.consume_reasoning_with_xml_tool_calls(form, "<thinking>", "</thinking>");
-}
-
-static common_chat_params common_chat_params_init_xiaomi_mimo(const common_chat_template & tmpl, const struct templates_params & params) {
-    common_chat_params data;
-    data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    data.prompt = apply(tmpl, params);
-    data.format = COMMON_CHAT_FORMAT_XIAOMI_MIMO;
-
-    data.preserved_tokens = {
-        "<tool_call>",
-        "</tool_call>",
-    };
-
-    // build grammar for tool call
-    static const xml_tool_call_format form = ([]() {
-        xml_tool_call_format form {};
-        form.scope_start = "\n";
-        form.tool_start  = "<tool_call>\n{\"name\": \"";
-        form.tool_sep    = "\", \"arguments\": {";
-        form.key_start   = "\"";
-        form.key_val_sep = "\": ";
-        form.val_end     = ", ";
-        form.tool_end    = "}\n</tool_call>";
-        form.scope_end   = "";
-        form.raw_argval  = false;
-        form.last_val_end = "";
-        return form;
-    })();
-    build_grammar_xml_tool_call(data, params.tools, form);
-
-    return data;
-}
-
-static void common_chat_parse_xiaomi_mimo(common_chat_msg_parser & builder) {
-    static const xml_tool_call_format form = ([]() {
-        xml_tool_call_format form {};
-        form.scope_start = "";
-        form.tool_start  = "<tool_call>\n{\"name\": \"";
-        form.tool_sep    = "\", \"arguments\": {";
-        form.key_start   = "\"";
-        form.key_val_sep = "\": ";
-        form.val_end     = ", ";
-        form.tool_end    = "}\n</tool_call>";
-        form.scope_end   = "";
-        form.raw_argval  = false;
-        form.last_val_end = "";
-        return form;
-    })();
-    builder.consume_reasoning_with_xml_tool_calls(form);
-}
-
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

@@ -2319,100 +2041,6 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
    }
 }

-static common_chat_params common_chat_params_init_glm_4_5(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-    data.grammar_lazy = inputs.tools.is_array() && !inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    std::string prompt = apply(tmpl, inputs);
-
-    // match the existing trimming behavior
-    if (inputs.add_bos && string_starts_with(prompt, tmpl.bos_token())) {
-        prompt.erase(0, tmpl.bos_token().size());
-    }
-    if (inputs.add_eos && string_ends_with(prompt, tmpl.eos_token())) {
-        prompt.erase(prompt.size() - tmpl.eos_token().size());
-    }
-    if (string_ends_with(prompt, "<think>")) {
-        if (!inputs.enable_thinking) {
-            prompt += "</think>";
-        } else {
-            data.thinking_forced_open = true;
-        }
-    }
-
-    // add GLM preserved tokens
-    data.preserved_tokens = {
-        "<|endoftext|>",
-        "[MASK]",
-        "[gMASK]",
-        "[sMASK]",
-        "<sop>",
-        "<eop>",
-        "<|system|>",
-        "<|user|>",
-        "<|assistant|>",
-        "<|observation|>",
-        "<|begin_of_image|>",
-        "<|end_of_image|>",
-        "<|begin_of_video|>",
-        "<|end_of_video|>",
-        "<|begin_of_audio|>",
-        "<|end_of_audio|>",
-        "<|begin_of_transcription|>",
-        "<|end_of_transcription|>",
-        "<|code_prefix|>",
-        "<|code_middle|>",
-        "<|code_suffix|>",
-        "/nothink",
-        "<think>",
-        "</think>",
-        "<tool_call>",
-        "</tool_call>",
-        "<arg_key>",
-        "</arg_key>",
-        "<arg_value>",
-        "</arg_value>"
-    };
-
-    // extra GLM 4.5 stop word
-    data.additional_stops.insert(data.additional_stops.end(), {
-        "<|user|>",
-        "<|observation|>"
-    });
-
-    // build grammar for tool call
-    static const xml_tool_call_format form {
-        /* form.scope_start = */ "",
-        /* form.tool_start  = */ "\n<tool_call>",
-        /* form.tool_sep    = */ "\n",
-        /* form.key_start   = */ "<arg_key>",
-        /* form.key_val_sep = */ "</arg_key>\n<arg_value>",
-        /* form.val_end     = */ "</arg_value>\n",
-        /* form.tool_end    = */ "</tool_call>\n",
-        /* form.scope_end   = */ "",
-    };
-    build_grammar_xml_tool_call(data, inputs.tools, form);
-
-    data.prompt = prompt;
-    data.format = COMMON_CHAT_FORMAT_GLM_4_5;
-    return data;
-}
-
-static void common_chat_parse_glm_4_5(common_chat_msg_parser & builder) {
-    static const xml_tool_call_format form {
-        /* form.scope_start  = */ "",
-        /* form.tool_start   = */ "<tool_call>",
-        /* form.tool_sep     = */ "",
-        /* form.key_start    = */ "<arg_key>",
-        /* form.key_val_sep  = */ "</arg_key>",
-        /* form.val_end      = */ "</arg_value>",
-        /* form.tool_end     = */ "</tool_call>",
-        /* form.scope_end    = */ "",
-        /* form.key_val_sep2 = */ "<arg_value>",
-    };
-    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
-}
-
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
    LOG_DBG("%s\n", __func__);
    common_chat_params data;
@@ -3076,17 +2704,91 @@ static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
 }

 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
-    static const xml_tool_call_format form {
-        /* form.scope_start = */ "<seed:tool_call>",
-        /* form.tool_start  = */ "<function=",
-        /* form.tool_sep    = */ ">",
-        /* form.key_start   = */ "<parameter=",
-        /* form.key_val_sep = */ ">",
-        /* form.val_end     = */ "</parameter>",
-        /* form.tool_end    = */ "</function>",
-        /* form.scope_end   = */ "</seed:tool_call>",
-    };
-    builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
+    // Parse thinking tags first - this handles the main reasoning content
+    builder.try_parse_reasoning("<seed:think>", "</seed:think>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Parse tool calls - Seed-OSS uses <seed:tool_call> format
+    static const common_regex tool_call_begin_regex("<seed:tool_call>");
+    static const common_regex tool_call_end_regex("</seed:tool_call>");
+    static const common_regex function_regex("<function=([^>]+)>");
+    static const common_regex param_regex("<parameter=([^>]+)>");
+
+    while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
+        builder.consume_spaces();  // Consume whitespace after <seed:tool_call>
+
+        // Look for function call inside tool call, ignore any content before it
+        if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
+            auto function_name = builder.str(func_res->groups[1]);
+
+            // Parse Seed-OSS parameters <parameter=name>value</parameter>
+            json args = json::object();
+            // Parse all parameters
+            while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
+                // again, ignore noise around parameters
+                auto param_name = builder.str(param_res->groups[1]);
+                builder.move_to(param_res->groups[0].end);
+                builder.consume_spaces();  // Consume whitespace after parameter
+                auto savedPos = builder.pos();
+                if (auto param_parse = builder.try_find_literal("</parameter>")) {
+                    auto param = param_parse->prelude;
+                    builder.move_to(savedPos);
+                    try {
+                        if (auto param_res = builder.try_consume_json()) {
+                            args[param_name] = param_res->json;
+                        } else {
+                            args[param_name] = param;
+                        }
+                    } catch (json::exception &) {
+                        args[param_name] = param;
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool parameter");
+                }
+            }
+            // Look for closing function tag
+            auto end_func = builder.try_find_literal("</function>");
+            if (end_func) {
+                builder.move_to(end_func->groups[0].end);
+                builder.consume_spaces();  // Consume whitespace after </function>
+
+                // Add the tool call with parsed arguments, but only if we REALLY got the literal
+                auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
+                auto funlen = std::string("</function>").length();
+                if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
+                    if (!builder.add_tool_call(function_name, "", args.dump())) {
+                        throw common_chat_msg_partial_exception("Incomplete tool call");
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool call");
+                }
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            // Look for closing tool call tag
+            if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
+                builder.move_to(end_tool->groups[0].end);
+                builder.consume_spaces();  // Consume trailing whitespace after tool call
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+        } else {
+            // No function found - don't consume content here, let it be handled at the end
+            break;
+        }
+    }
+
+    // Consume any remaining whitespace after all tool call processing
+    builder.consume_spaces();
+    auto remaining = builder.consume_rest();
+    // If there's any non-whitespace content remaining, add it as content
+    if (!string_strip(remaining).empty()) {
+        builder.add_content(remaining);
+    }
 }

 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -3225,35 +2927,6 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_granite(tmpl, params);
    }

-    // GLM 4.5: detect by <arg_key> and <arg_value> tags (check before Hermes since both use <tool_call>)
-    if (src.find("[gMASK]<sop>") != std::string::npos &&
-        src.find("<arg_key>") != std::string::npos &&
-        src.find("<arg_value>") != std::string::npos &&
-        params.json_schema.is_null()) {
-        return common_chat_params_init_glm_4_5(tmpl, params);
-    }
-
-    // Qwen3-Coder XML format detection (must come before Hermes 2 Pro)
-    // Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates.
-    // Require presence of <tool_call>, <function=...>, and <parameter=...> blocks.
-    if (src.find("<tool_call>") != std::string::npos &&
-        src.find("<function>") != std::string::npos &&
-        src.find("<function=") != std::string::npos &&
-        src.find("<parameters>") != std::string::npos &&
-        src.find("<parameter=") != std::string::npos) {
-        return common_chat_params_init_qwen3_coder_xml(tmpl, params);
-    }
-
-    // Xiaomi MiMo format detection (must come before Hermes 2 Pro)
-    if (src.find("<tools>") != std::string::npos &&
-        src.find("# Tools") != std::string::npos &&
-        src.find("</tools>") != std::string::npos &&
-        src.find("<tool_calls>") != std::string::npos &&
-        src.find("</tool_calls>") != std::string::npos &&
-        src.find("<tool_response>") != std::string::npos) {
-        return common_chat_params_init_xiaomi_mimo(tmpl, params);
-    }
-
    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_hermes_2_pro(tmpl, params);
@@ -3285,29 +2958,6 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_lfm2(tmpl, params);
    }

-    // MiniMax-M2 format detection
-    if (src.find("]~!b[") != std::string::npos && src.find("]~b]") != std::string::npos) {
-        return common_chat_params_init_minimax_m2(tmpl, params);
-    }
-
-    // Kimi K2 format detection
-    if (src.find("<|im_system|>tool_declare<|im_middle|>") != std::string::npos &&
-        src.find("<|tool_calls_section_begin|>") != std::string::npos &&
-        src.find("## Return of") != std::string::npos) {
-        return common_chat_params_init_kimi_k2(tmpl, params);
-    }
-
-    // Apriel 1.5 format detection
-    if (src.find("<thinking>") != std::string::npos &&
-        src.find("</thinking>") != std::string::npos &&
-        src.find("<available_tools>") != std::string::npos &&
-        src.find("<|assistant|>") != std::string::npos &&
-        src.find("<|tool_result|>") != std::string::npos &&
-        src.find("<tool_calls>[") != std::string::npos &&
-        src.find("]</tool_calls>") != std::string::npos) {
-        return common_chat_params_init_apriel_1_5(tmpl, params);
-    }
-
    // Use generic handler when mixing tools + JSON schema.
    // TODO: support that mix in handlers below.
    if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -3359,7 +3009,7 @@ static common_chat_params common_chat_templates_apply_legacy(
    const struct common_chat_templates * tmpls,
    const struct common_chat_templates_inputs & inputs)
 {
-    size_t alloc_size = 0;
+    int alloc_size = 0;
    std::vector<llama_chat_message> chat;
    std::vector<std::string> contents;

@@ -3381,8 +3031,7 @@ static common_chat_params common_chat_templates_apply_legacy(
        const auto & msg = inputs.messages[i];
        const auto & content = contents[i];
        chat.push_back({msg.role.c_str(), content.c_str()});
-        size_t msg_size = msg.role.size() + content.size();
-        alloc_size += msg_size + (msg_size / 4); // == msg_size * 1.25 but avoiding float ops
+        alloc_size += (msg.role.size() + content.size()) * 1.25;
    }

    std::vector<char> buf(alloc_size);
@@ -3404,11 +3053,6 @@ static common_chat_params common_chat_templates_apply_legacy(
        res = llama_chat_apply_template(src.c_str(), chat.data(), chat.size(), inputs.add_generation_prompt, buf.data(), buf.size());
    }

-    // for safety, we check the result again
-    if (res < 0 || (size_t) res > buf.size()) {
-        throw std::runtime_error("failed to apply chat template, try using --jinja");
-    }
-
    common_chat_params params;
    params.prompt = std::string(buf.data(), res);
    if (!inputs.json_schema.empty()) {
@@ -3495,24 +3139,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
            common_chat_parse_lfm2(builder);
            break;
-        case COMMON_CHAT_FORMAT_MINIMAX_M2:
-            common_chat_parse_minimax_m2(builder);
-            break;
-        case COMMON_CHAT_FORMAT_GLM_4_5:
-            common_chat_parse_glm_4_5(builder);
-            break;
-        case COMMON_CHAT_FORMAT_KIMI_K2:
-            common_chat_parse_kimi_k2(builder);
-            break;
-        case COMMON_CHAT_FORMAT_QWEN3_CODER_XML:
-            common_chat_parse_qwen3_coder_xml(builder);
-            break;
-        case COMMON_CHAT_FORMAT_APRIEL_1_5:
-            common_chat_parse_apriel_1_5(builder);
-            break;
-        case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
-            common_chat_parse_xiaomi_mimo(builder);
-            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
@@ -117,12 +117,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
    COMMON_CHAT_FORMAT_APERTUS,
    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
-    COMMON_CHAT_FORMAT_GLM_4_5,
-    COMMON_CHAT_FORMAT_MINIMAX_M2,
-    COMMON_CHAT_FORMAT_KIMI_K2,
-    COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
-    COMMON_CHAT_FORMAT_APRIEL_1_5,
-    COMMON_CHAT_FORMAT_XIAOMI_MIMO,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -297,25 +297,8 @@ bool common_json_parse(
            it = temptative_end;
            return true;
        }
-        // handle unclosed top-level primitive
-        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
-            std::string str(it, temptative_end);
-            const auto & magic_seed = out.healing_marker.marker = healing_marker;
-            if (can_parse(str + "\"")) {
-                // Was inside an string
-                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
-            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
-                // Was inside an string after an escape
-                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
-            } else {
-                // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
-                // fprintf(stderr, "Closing: TODO\n");
-                return false;
-            }
-            out.json = json::parse(str);
-            it = temptative_end;
-            return true;
-        }
+        // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+        // fprintf(stderr, "Closing: TODO\n");
        return false;
    }
    out.json = json::parse(it, end);
@@ -303,8 +303,6 @@ static std::string format_literal(const std::string & literal) {
    return "\"" + escaped + "\"";
 }

-std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
-
 class SchemaConverter {
 private:
    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
@@ -18,6 +18,4 @@ struct common_grammar_options {
    bool dotall = false;
 };

-std::string gbnf_format_literal(const std::string & literal);
-
 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
@@ -1673,9 +1673,11 @@ class GPTNeoXModel(TextModel):
    model_arch = gguf.MODEL_ARCH.GPTNEOX

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
@@ -1733,7 +1735,7 @@ class BloomModel(TextModel):
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -1796,9 +1798,10 @@ class MPTModel(TextModel):
            self.gguf_writer.add_unk_token_id(0)

    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
        self.gguf_writer.add_head_count(self.hparams["n_heads"])
        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
@@ -1831,6 +1834,7 @@ class OrionModel(TextModel):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

@@ -1848,7 +1852,7 @@ class OrionModel(TextModel):
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
@@ -1865,6 +1869,7 @@ class BaichuanModel(TextModel):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

@@ -1881,7 +1886,7 @@ class BaichuanModel(TextModel):
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
@@ -1988,6 +1993,7 @@ class XverseModel(TextModel):
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        head_count = self.hparams["num_attention_heads"]
        head_count_kv = self.hparams.get("num_key_value_heads", head_count)

@@ -2004,7 +2010,7 @@ class XverseModel(TextModel):
        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
        self.gguf_writer.add_context_length(ctx_length)
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(head_count)
@@ -2047,6 +2053,10 @@ class FalconModel(TextModel):
    model_arch = gguf.MODEL_ARCH.FALCON

    def set_gguf_parameters(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name
@@ -2059,7 +2069,7 @@ class FalconModel(TextModel):
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -2097,10 +2107,12 @@ class StarCoderModel(TextModel):
    model_arch = gguf.MODEL_ARCH.STARCODER

    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -2130,12 +2142,14 @@ class RefactModel(TextModel):
        multiple_of = 256
        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

+        block_count = self.hparams["n_layer"]
+
        # refact uses Alibi. So this is from config.json which might be used by training.
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])

        self.gguf_writer.add_feed_forward_length(ff_dim)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(1)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
@@ -2182,10 +2196,11 @@ class StableLMModel(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
@@ -3136,7 +3151,7 @@ class DbrxModel(TextModel):
    def set_gguf_parameters(self):
        ffn_config = self.hparams["ffn_config"]
        attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["n_layers"])

        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
@@ -3338,7 +3353,7 @@ class QwenModel(TextModel):

    def set_gguf_parameters(self):
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
@@ -4369,7 +4384,7 @@ class GPT2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.GPT2

    def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -4401,6 +4416,8 @@ class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
        rot_pct = self.find_hparam(["partial_rotary_factor"])
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
@@ -4409,7 +4426,7 @@ class Phi2Model(TextModel):

        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(4 * n_embd)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head)
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
@@ -4527,6 +4544,8 @@ class Phi3MiniModel(TextModel):
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
@@ -4540,7 +4559,7 @@ class Phi3MiniModel(TextModel):
        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
        self.gguf_writer.add_embedding_length(n_embd)
        self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
@@ -4660,11 +4679,12 @@ class PlamoModel(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_context_length(4096)  # not in config.json
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
@@ -4787,6 +4807,7 @@ class Plamo2Model(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

        # Which layers are Mamba layers
@@ -4798,10 +4819,10 @@ class Plamo2Model(TextModel):
        num_attention_heads = []

        if mamba_enabled:
-            for i in range(self.block_count):
-                if self.block_count <= (mamba_step // 2):
+            for i in range(block_count):
+                if block_count <= (mamba_step // 2):
                    # use attention in last layer
-                    is_mamba = (i != self.block_count - 1)
+                    is_mamba = (i != block_count - 1)
                else:
                    is_mamba = (i % mamba_step) != (mamba_step // 2)
                if is_mamba:
@@ -4819,7 +4840,7 @@ class Plamo2Model(TextModel):
        self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
        self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128))
        self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))

@@ -4876,10 +4897,12 @@ class CodeShellModel(TextModel):
    model_arch = gguf.MODEL_ARCH.CODESHELL

    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -5021,7 +5044,7 @@ class InternLM2Model(TextModel):

    def set_gguf_parameters(self):
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
@@ -5642,10 +5665,11 @@ class GemmaModel(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
@@ -5681,10 +5705,11 @@ class Gemma2Model(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
@@ -5728,11 +5753,12 @@ class Gemma3Model(TextModel):

    def set_gguf_parameters(self):
        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]

        # some default values are not specified in the hparams
        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
@@ -6008,6 +6034,7 @@ class Rwkv6Model(TextModel):
        self._set_vocab_rwkv_world()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        head_size = self.hparams["head_size"]
        hidden_size = self.hparams["hidden_size"]
        layer_norm_eps = self.hparams["layer_norm_epsilon"]
@@ -6019,7 +6046,7 @@ class Rwkv6Model(TextModel):
        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
        self.gguf_writer.add_wkv_head_size(head_size)
@@ -6083,6 +6110,7 @@ class RWKV6Qwen2Model(Rwkv6Model):
            self._set_vocab_gpt2()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        num_attention_heads = self.hparams["num_attention_heads"]
        num_key_value_heads = self.hparams["num_key_value_heads"]
        hidden_size = self.hparams["hidden_size"]
@@ -6095,7 +6123,7 @@ class RWKV6Qwen2Model(Rwkv6Model):
        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_wkv_head_size(head_size)
        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
@@ -6136,6 +6164,7 @@ class Rwkv7Model(TextModel):
        return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        try:
            head_size = self.hparams["head_size"]
            layer_norm_eps = self.hparams["layer_norm_epsilon"]
@@ -6160,7 +6189,7 @@ class Rwkv7Model(TextModel):
        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
        self.gguf_writer.add_wkv_head_size(head_size)
        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
@@ -6254,6 +6283,7 @@ class ARwkv7Model(Rwkv7Model):
            self._set_vocab_gpt2()

    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
        hidden_size = self.hparams["hidden_size"]
        head_size = self.hparams["head_size"]
        rms_norm_eps = self.hparams["rms_norm_eps"]
@@ -6270,7 +6300,7 @@ class ARwkv7Model(Rwkv7Model):
        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
        self.gguf_writer.add_wkv_head_size(head_size)
        self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
@@ -7494,7 +7524,7 @@ class T5Model(TextModel):
        self.gguf_writer.add_context_length(n_ctx)
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
        if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
            self.gguf_writer.add_decoder_block_count(dec_n_layer)
        self.gguf_writer.add_head_count(self.hparams["num_heads"])
@@ -7633,7 +7663,7 @@ class T5EncoderModel(TextModel):
        self.gguf_writer.add_context_length(n_ctx)
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
        self.gguf_writer.add_head_count(self.hparams["num_heads"])
        self.gguf_writer.add_key_length(self.hparams["d_kv"])
        self.gguf_writer.add_value_length(self.hparams["d_kv"])
@@ -7696,7 +7726,7 @@ class JaisModel(TextModel):
        self._set_vocab_gpt2()

    def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
@@ -8038,7 +8068,7 @@ class ChatGLMModel(TextModel):
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
@@ -8120,6 +8150,7 @@ class ExaoneModel(TextModel):
        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
        layer_norm_eps = hparams["layer_norm_epsilon"]
        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
        # attention_dropout_rate = hparams["attention_dropout"]
        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
@@ -8130,7 +8161,7 @@ class ExaoneModel(TextModel):
        self.gguf_writer.add_context_length(max_position_embeddings)
        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
        self.gguf_writer.add_feed_forward_length(intermediate_size)
-        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_block_count(num_layers)
        self.gguf_writer.add_file_type(self.ftype)

        if (rope_theta := self.hparams.get("rope_theta")) is not None:
@@ -17,12 +17,12 @@ Legend:
 |                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ |
-|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
@@ -43,9 +43,9 @@ Legend:
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                             FILL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
-|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
@@ -87,7 +87,7 @@ Legend:
 |                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -99,7 +99,7 @@ Legend:
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
 |                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
 |                        SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -107,7 +107,7 @@ Legend:
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
 |                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
-|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                              SUM | ❌ | ✅ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
 |                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
@@ -116,6 +116,6 @@ Legend:
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -5,8 +5,8 @@
 "Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","STEP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","STEP","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","TANH","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","TANH","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
 "Vulkan0","ELU","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
@@ -29,18 +29,18 @@
 "Vulkan0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
 "Vulkan0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
 "Vulkan0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
-"Vulkan0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan"
 "Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
 "Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan"
@@ -89,8 +89,8 @@
 "Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","STEP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","STEP","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","TANH","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","TANH","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
 "Vulkan0","ELU","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
@@ -113,18 +113,18 @@
 "Vulkan0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
 "Vulkan0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
 "Vulkan0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
-"Vulkan0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
 "Vulkan0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
 "Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan"
 "Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
 "Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan"
@@ -5654,7 +5654,7 @@
 "Vulkan0","SUB","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
 "Vulkan0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
-"Vulkan0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
+"Vulkan0","ADD1","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","Vulkan"
 "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","Vulkan"
@@ -8632,10 +8632,10 @@
 "Vulkan0","COS","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
 "Vulkan0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","0","no","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","0","no","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne=[10,2,2,2]","support","0","no","Vulkan"
 "Vulkan0","SQR","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","LOG","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
@@ -8643,10 +8643,10 @@
 "Vulkan0","COS","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","0","no","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","Vulkan"
-"Vulkan0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f16,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
 "Vulkan0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","Vulkan"
 "Vulkan0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
@@ -8654,10 +8654,10 @@
 "Vulkan0","COS","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
 "Vulkan0","CLAMP","type=f32,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne=[10,2,2,2]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f32,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f32,ne=[10,2,2,2]","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne=[10,2,2,2]","support","0","no","Vulkan"
 "Vulkan0","SQR","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
 "Vulkan0","SQRT","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
 "Vulkan0","LOG","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
@@ -8665,10 +8665,10 @@
 "Vulkan0","COS","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
 "Vulkan0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","Vulkan"
-"Vulkan0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
-"Vulkan0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
+"Vulkan0","FLOOR","type=f32,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","CEIL","type=f32,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","ROUND","type=f32,ne=[7,1,5,3]","support","0","no","Vulkan"
+"Vulkan0","TRUNC","type=f32,ne=[7,1,5,3]","support","0","no","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","Vulkan"
 "Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","Vulkan"
@@ -9478,7 +9478,7 @@
 "Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","Vulkan"
 "Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","Vulkan"
 "Vulkan0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","Vulkan"
-"Vulkan0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","Vulkan"
+"Vulkan0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","0","no","Vulkan"
 "Vulkan0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","Vulkan"
 "Vulkan0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","Vulkan"
 "Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
@@ -9487,9 +9487,9 @@
 "Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","Vulkan"
 "Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","Vulkan"
 "Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","Vulkan"
-"Vulkan0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","Vulkan"
-"Vulkan0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","Vulkan"
-"Vulkan0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","Vulkan"
+"Vulkan0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","Vulkan"
+"Vulkan0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","Vulkan"
+"Vulkan0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","Vulkan"
 "Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","Vulkan"
 "Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","Vulkan"
 "Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","Vulkan"
@@ -392,9 +392,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
-                list(APPEND ARCH_FLAGS -mcpu=power10)
+                list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
            elseif (EXTRACTED_NUMBER EQUAL 9)
-                list(APPEND ARCH_FLAGS -mcpu=power9)
+                list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
                list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
            else()
@@ -384,8 +384,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    char * src1_ddc = (char *) src1->data;

    const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
-    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
-        src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
+    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1;

    if (src0->type == src1->type && contiguous_srcs) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
@@ -3001,10 +3001,6 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 static bool ggml_cuda_should_fuse_rope_set_rows(const ggml_tensor * rope,
                                                const ggml_tensor * view,
                                                const ggml_tensor * set_rows) {
-
-    if (rope->op != GGML_OP_ROPE || view->op != GGML_OP_VIEW || set_rows->op != GGML_OP_SET_ROWS) {
-        return false;
-    }
    // ne3 not tested
    if (rope->src[0]->ne[3] != 1) {
        return false;
@@ -406,8 +406,8 @@ enum shader_reduction_mode {
    SHADER_REDUCTION_MODE_COUNT,
 };

-// argsort pipelines for up to 1<<10 invocations per workgroup
 static constexpr uint32_t num_argsort_pipelines = 11;
+static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1);
 static constexpr uint32_t num_topk_moe_pipelines = 10;

 static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE,  GGML_OP_ARGSORT,
@@ -526,7 +526,6 @@ struct vk_device_struct {
    bool multi_add;
    bool shader_int64;
    bool buffer_device_address;
-    bool vulkan_memory_model;

    bool add_rms_fusion;
    uint32_t partials_binding_alignment;
@@ -540,9 +539,6 @@ struct vk_device_struct {
    uint32_t subgroup_max_size;
    bool subgroup_require_full_support;

-    // floor(log2(maxComputeWorkGroupInvocations))
-    uint32_t max_workgroup_size_log2 {};
-
    bool coopmat_support;
    bool coopmat_acc_f32_support {};
    bool coopmat_acc_f16_support {};
@@ -642,7 +638,6 @@ struct vk_device_struct {
    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
    vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
    vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
-    vk_pipeline pipeline_cpy_transpose_16, pipeline_cpy_transpose_32;
    vk_pipeline pipeline_set_rows_i32[GGML_TYPE_COUNT];
    vk_pipeline pipeline_set_rows_i64[GGML_TYPE_COUNT];
    vk_pipeline pipeline_norm_f32;
@@ -669,20 +664,6 @@ struct vk_device_struct {
    vk_pipeline pipeline_hardsigmoid[2];
    vk_pipeline pipeline_hardswish[2];
    vk_pipeline pipeline_abs[2];
-    vk_pipeline pipeline_softplus[2];
-    vk_pipeline pipeline_step[2];
-    vk_pipeline pipeline_round[2];
-    vk_pipeline pipeline_ceil[2];
-    vk_pipeline pipeline_floor[2];
-    vk_pipeline pipeline_trunc[2];
-
-    vk_pipeline pipeline_add1_f16_f16;
-    vk_pipeline pipeline_add1_f16_f32;
-    vk_pipeline pipeline_add1_f32_f32;
-
-    vk_pipeline pipeline_arange_f32;
-
-    vk_pipeline pipeline_fill_f32;

    vk_pipeline pipeline_geglu[2];
    vk_pipeline pipeline_reglu[2];
@@ -702,7 +683,6 @@ struct vk_device_struct {
    vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
    vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16;
    vk_pipeline pipeline_argsort_f32[num_argsort_pipelines];
-    vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines];
    vk_pipeline pipeline_sum_rows_f32;
    vk_pipeline pipeline_argmax_f32;
    vk_pipeline pipeline_count_equal_i32;
@@ -1193,14 +1173,8 @@ struct vk_op_soft_max_push_constants {

 struct vk_op_argsort_push_constants {
    uint32_t ncols;
-    uint32_t ncols_padded;
-    uint32_t ncols_padded_log2;
    uint32_t nrows;
-    uint32_t order;
-    uint32_t outer_start;
-    uint32_t outer_end;
-    uint32_t inner_start;
-    uint32_t inner_end;
+    int32_t order;
 };

 struct vk_op_im2col_push_constants {
@@ -2927,15 +2901,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
            if (path == FAPATH) { \
                if (aligned) { \
                    if (f32acc) { \
-                        ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ##            SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ##            SUFFIX ## _data,  "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_align(FAPATH,HSK,HSV,TYPE,small_rows), true, true, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+                        ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ##            SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ##            SUFFIX ## _data,  "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_align(FAPATH,HSK,HSV,TYPE,small_rows), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
                    } else { \
-                        ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_align(FAPATH,HSK,HSV,TYPE,small_rows), true, true, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+                        ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_align(FAPATH,HSK,HSV,TYPE,small_rows), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
                    } \
                } else { \
                    if (f32acc) { \
-                        ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc"         #NAMELC, flash_attn_f32_f16_ ## NAMELC ##            SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ##            SUFFIX ## _data,  "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows), 1,                                        true, true, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+                        ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc"         #NAMELC, flash_attn_f32_f16_ ## NAMELC ##            SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ##            SUFFIX ## _data,  "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows), 1,                                        true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
                    } else { \
-                        ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc"         #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows), 1,                                        true, true, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+                        ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc"         #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows), 1,                                        true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
                    } \
                } \
            } \
@@ -3723,9 +3697,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

-    ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
-
    if (device->float_controls_rte_fp16) {
        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
@@ -3855,12 +3826,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    CREATE_UNARY(hardsigmoid)
    CREATE_UNARY(hardswish)
    CREATE_UNARY(abs)
-    CREATE_UNARY(softplus)
-    CREATE_UNARY(step)
-    CREATE_UNARY(round)
-    CREATE_UNARY(ceil)
-    CREATE_UNARY(floor)
-    CREATE_UNARY(trunc)
 #undef CREATE_UNARY

 #define CREATE_UNARY_RTE(name)  \
@@ -3874,14 +3839,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
    CREATE_UNARY_RTE(exp)
 #undef CREATE_UNARY_RTE

-    ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
-
 #define CREATE_GLU(name)  \
    if (device->float_controls_rte_fp16) {  \
        ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true);   \
@@ -3934,15 +3891,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    }

    for (uint32_t i = 0; i < num_argsort_pipelines; ++i) {
-        uint32_t BLOCK_SIZE = 1u << std::min(i, device->max_workgroup_size_log2);
-        if (i <= device->max_workgroup_size_log2 &&
-            2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) {
-            const uint32_t NCOLS_PADDED_LOG2 = i;
-            ggml_vk_create_pipeline2(device, device->pipeline_argsort_f32[i], "argsort_f32_"+std::to_string(i), argsort_f32_len, argsort_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true);
-        }
-        const uint32_t WG_UNROLL_FACTOR = BLOCK_SIZE > 1 ? 2 : 1;
-        BLOCK_SIZE /= WG_UNROLL_FACTOR;
-        ggml_vk_create_pipeline2(device, device->pipeline_argsort_large_f32[i], "argsort_large_f32_"+std::to_string(i), argsort_large_f32_len, argsort_large_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE * WG_UNROLL_FACTOR, 1, 1}, {BLOCK_SIZE, WG_UNROLL_FACTOR}, 1, true);
+        ggml_vk_create_pipeline2(device, device->pipeline_argsort_f32[i], "argsort_f32_"+std::to_string(i), argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1u<<i, 1, 1}, {1u<<i, i}, 1, true);
    }

    ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -4343,8 +4292,6 @@ static vk_device ggml_vk_get_device(size_t idx) {

        device->integer_dot_product = device->integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated;

-        device->max_workgroup_size_log2 = uint32_t(log2f(float(device->properties.limits.maxComputeWorkGroupInvocations)));
-
        std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();

        // Try to find a non-graphics compute queue and transfer-focused queues
@@ -4484,7 +4431,6 @@ static vk_device ggml_vk_get_device(size_t idx) {

        device->shader_int64 = device_features2.features.shaderInt64;
        device->buffer_device_address = vk12_features.bufferDeviceAddress;
-        device->vulkan_memory_model = vk12_features.vulkanMemoryModel;

        if (device->subgroup_size_control) {
            device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
@@ -6301,17 +6247,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
    // Choose "contiguous copy" shader if src/dst are contiguous
    bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));

-    // Use optimized "transpose" shader if src dim1 is the innermost dimension.
-    bool transpose = dst && src->nb[1] == ggml_type_size(to) && ggml_are_same_shape(dst, src);
-
-    if (transpose && src->type == to) {
-        if (ggml_type_size(to) == 4) {
-            return ctx->device->pipeline_cpy_transpose_32;
-        } else if (ggml_type_size(to) == 2) {
-            return ctx->device->pipeline_cpy_transpose_16;
-        }
-    }
-
    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
        if (contig) {
            return ctx->device->pipeline_contig_cpy_f32_f32;
@@ -8307,18 +8242,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
                return ctx->device->pipeline_hardswish[dst->type == GGML_TYPE_F16];
            case GGML_UNARY_OP_ABS:
                return ctx->device->pipeline_abs[dst->type == GGML_TYPE_F16];
-            case GGML_UNARY_OP_SOFTPLUS:
-                return ctx->device->pipeline_softplus[dst->type == GGML_TYPE_F16];
-            case GGML_UNARY_OP_STEP:
-                return ctx->device->pipeline_step[dst->type == GGML_TYPE_F16];
-            case GGML_UNARY_OP_ROUND:
-                return ctx->device->pipeline_round[dst->type == GGML_TYPE_F16];
-            case GGML_UNARY_OP_CEIL:
-                return ctx->device->pipeline_ceil[dst->type == GGML_TYPE_F16];
-            case GGML_UNARY_OP_FLOOR:
-                return ctx->device->pipeline_floor[dst->type == GGML_TYPE_F16];
-            case GGML_UNARY_OP_TRUNC:
-                return ctx->device->pipeline_trunc[dst->type == GGML_TYPE_F16];
            default:
                break;
        }
@@ -8421,6 +8344,19 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            }
            return nullptr;
        }
+    case GGML_OP_ARGSORT:
+        if (ctx->num_additional_fused_ops) {
+            uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
+            GGML_ASSERT(idx < num_topk_moe_pipelines);
+            topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
+            return ctx->device->pipeline_topk_moe[idx][mode];
+        }
+
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) {
+            uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
+            return ctx->device->pipeline_argsort_f32[idx];
+        }
+        return nullptr;
    case GGML_OP_SUM:
    case GGML_OP_SUM_ROWS:
    case GGML_OP_MEAN:
@@ -8513,7 +8449,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
    case GGML_OP_CONV_TRANSPOSE_2D:
        if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
            ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
-            std::array<uint32_t, 3> elements{};
+            std::array<uint32_t, 3> elements;
            if (op == GGML_OP_CONV_2D) elements = ggml_vk_get_conv_elements(dst);
            else if (op == GGML_OP_CONV_TRANSPOSE_2D) elements = ggml_vk_get_conv_transpose_2d_elements(dst);
            vk_conv_shapes shape;
@@ -8591,27 +8527,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            }
        }
        return nullptr;
-    case GGML_OP_ADD1:
-        if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->pipeline_add1_f16_f16;
-        }
-        if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
-            return ctx->device->pipeline_add1_f16_f32;
-        }
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_add1_f32_f32;
-        }
-        return nullptr;
-    case GGML_OP_ARANGE:
-        if (dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_arange_f32;
-        }
-        return nullptr;
-    case GGML_OP_FILL:
-        if (dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_fill_f32;
-        }
-        return nullptr;
    default:
        return nullptr;
    }
@@ -8833,7 +8748,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
        break;
    case GGML_OP_ARGSORT:
-        GGML_ASSERT(0);
+        elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
+        elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
        break;
    case GGML_OP_IM2COL:
        {
@@ -8901,9 +8817,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    case GGML_OP_SUB:
    case GGML_OP_DIV:
    case GGML_OP_MUL:
-    case GGML_OP_ADD1:
-    case GGML_OP_ARANGE:
-    case GGML_OP_FILL:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SQRT:
@@ -8945,17 +8858,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
            } else {
                elements = { ne, 1, 1 };
            }
-
-            if (pipeline == ctx->device->pipeline_cpy_transpose_32 ||
-                pipeline == ctx->device->pipeline_cpy_transpose_16) {
-                // 32x32 tiles
-                elements[0] = (uint32_t)CEIL_DIV(dst->ne[0], 32);
-                elements[1] = (uint32_t)CEIL_DIV(dst->ne[1], 32);
-                elements[2] = (uint32_t)(dst->ne[2]*dst->ne[3]);
-                elements[0] = std::min(elements[0], ctx->device->properties.limits.maxComputeWorkGroupCount[0]);
-                elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
-                elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
-            }
        } break;
    case GGML_OP_ADD_ID:
        {
@@ -9521,63 +9423,6 @@ static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, cons
    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst));
 }

-static void ggml_vk_add1(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t src1_type_size = ggml_type_size(src1->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
-    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD1, {
-        (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
-        0,
-        0.0f, 0.0f, 0,
-    });
-}
-
-static void ggml_vk_arange(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
-    VK_LOG_DEBUG("ggml_vk_arange(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")");
-
-    vk_op_push_constants pc = {
-        (uint32_t)ggml_nelements(dst),
-        1,
-        ggml_get_op_params_f32(dst, 0),
-        ggml_get_op_params_f32(dst, 2),
-    };
-
-    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_ARANGE);
-    GGML_ASSERT(pipeline != nullptr);
-
-    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false);
-
-    std::array<uint32_t, 3> elements = { (uint32_t)ggml_nelements(dst), 1, 1 };
-
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements);
-}
-
-static void ggml_vk_fill(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
-    VK_LOG_DEBUG("ggml_vk_fill(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")");
-
-    vk_op_push_constants pc = {
-        (uint32_t)ggml_nelements(dst),
-        1,
-        ggml_get_op_params_f32(dst, 0),
-        0.0f,
-    };
-
-    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_FILL);
-    GGML_ASSERT(pipeline != nullptr);
-
-    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false);
-
-    std::array<uint32_t, 3> elements = { (uint32_t)ggml_nelements(dst), 1, 1 };
-
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements);
-}
-
 static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst));
 }
@@ -10020,89 +9865,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
 }

 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    const uint32_t * op_params = (const uint32_t *)dst->op_params;
+    int32_t * op_params = (int32_t *)dst->op_params;

    uint32_t ncols = src0->ne[0];
    uint32_t nrows = ggml_nrows(src0);

-    uint32_t ncols_pad_log2 = (uint32_t)ceilf(log2f(float(ncols)));
-    uint32_t ncolsp2 = 1 << ncols_pad_log2;
-
-    vk_op_argsort_push_constants pc { ncols, ncolsp2, ncols_pad_log2, nrows, op_params[0], 0, 0, 0, 0, };
-
-    // Pick the largest workgroup size <= ncolsp2
-    uint32_t pipeline_idx = std::min(ncols_pad_log2, num_argsort_pipelines - 1);
-
-    // Use the "small" argsort shader if the whole sort can be done by a single workgroup.
-    bool use_small = ncols_pad_log2 <= ctx->device->max_workgroup_size_log2 &&
-                     ctx->device->pipeline_argsort_f32[pipeline_idx] != nullptr;
-
-    vk_pipeline pipeline = use_small ? ctx->device->pipeline_argsort_f32[pipeline_idx]
-                                     : ctx->device->pipeline_argsort_large_f32[pipeline_idx];
-
-    vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0);
-    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
-    vk_subbuffer subbuf1 = dst_buf;
-
-    // Reserve space for ivec2 per element, with rows padded to a power of two
-    if (!use_small) {
-        const size_t x_sz = size_t{ncolsp2} * nrows * 2 * sizeof(int);
-
-        if (ctx->prealloc_size_x < x_sz) {
-            ctx->prealloc_size_x = x_sz;
-            ggml_vk_preallocate_buffers(ctx, subctx);
-        }
-        if (ctx->prealloc_x_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-        subbuf1 = { ctx->prealloc_x, 0, ctx->prealloc_x->size };
-    }
-
-    std::array<uint32_t, 3> elements;
-
-    elements[0] = ncolsp2;
-    elements[1] = std::min((uint32_t)ggml_nrows(src0), ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
-    elements[2] = 1;
-
-    // First dispatch initializes tmp_idx and does the first N passes where
-    // there is only communication between threads in the same workgroup.
-    {
-        vk_op_argsort_push_constants pc2 = pc;
-        pc2.outer_start = 0;
-        pc2.outer_end = std::min(ncols_pad_log2, ctx->device->max_workgroup_size_log2);
-        pc2.inner_start = 0;
-        pc2.inner_end = 100;
-        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements);
-    }
-    if (!use_small) {
-        ggml_vk_sync_buffers(ctx, subctx);
-        // Loop over outer/inner passes, synchronizing between each pass.
-        for (uint32_t outer = ctx->device->max_workgroup_size_log2; outer < ncols_pad_log2; ++outer) {
-            for (uint32_t inner = 0; inner < outer + 1; ++inner) {
-                vk_op_argsort_push_constants pc2 = pc;
-                pc2.outer_start = outer;
-                pc2.outer_end = outer + 1;
-                pc2.inner_start = inner;
-                pc2.inner_end = inner + 1;
-                // When the inner idx is large enough, there's only communication
-                // within a workgroup. So the remaining inner iterations can all
-                // run in the same dispatch.
-                if (outer - inner < pipeline_idx) {
-                    pc2.inner_end = 100;
-                    inner = outer;
-                    pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx];
-                } else {
-                    // Smaller workgroup empirically seems to perform better
-                    pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx - 2];
-                }
-                ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
-                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements);
-                ggml_vk_sync_buffers(ctx, subctx);
-            }
-        }
-        ctx->prealloc_x_need_sync = true;
-    }
+    ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
+        ncols,
+        nrows,
+        op_params[0],
+    });
 }

 static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -11410,12 +11182,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
        case GGML_UNARY_OP_HARDSIGMOID:
        case GGML_UNARY_OP_HARDSWISH:
        case GGML_UNARY_OP_ABS:
-        case GGML_UNARY_OP_SOFTPLUS:
-        case GGML_UNARY_OP_STEP:
-        case GGML_UNARY_OP_ROUND:
-        case GGML_UNARY_OP_CEIL:
-        case GGML_UNARY_OP_FLOOR:
-        case GGML_UNARY_OP_TRUNC:
            break;
        default:
            return false;
@@ -11457,9 +11223,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
    case GGML_OP_SUB:
    case GGML_OP_MUL:
    case GGML_OP_DIV:
-    case GGML_OP_ADD1:
-    case GGML_OP_ARANGE:
-    case GGML_OP_FILL:
    case GGML_OP_CONCAT:
    case GGML_OP_UPSCALE:
    case GGML_OP_SCALE:
@@ -11672,18 +11435,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
    case GGML_OP_UPSCALE:
        ggml_vk_upscale(ctx, compute_ctx, src0, node);

-        break;
-    case GGML_OP_ADD1:
-        ggml_vk_add1(ctx, compute_ctx, src0, src1, node);
-
-        break;
-    case GGML_OP_ARANGE:
-        ggml_vk_arange(ctx, compute_ctx, node);
-
-        break;
-    case GGML_OP_FILL:
-        ggml_vk_fill(ctx, compute_ctx, node);
-
        break;
    case GGML_OP_SCALE:
        ggml_vk_scale(ctx, compute_ctx, src0, node);
@@ -11768,12 +11519,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
        case GGML_UNARY_OP_HARDSIGMOID:
        case GGML_UNARY_OP_HARDSWISH:
        case GGML_UNARY_OP_ABS:
-        case GGML_UNARY_OP_SOFTPLUS:
-        case GGML_UNARY_OP_STEP:
-        case GGML_UNARY_OP_ROUND:
-        case GGML_UNARY_OP_CEIL:
-        case GGML_UNARY_OP_FLOOR:
-        case GGML_UNARY_OP_TRUNC:
            ggml_vk_unary(ctx, compute_ctx, src0, node);
            break;
        default:
@@ -11976,9 +11721,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
    case GGML_OP_SUB:
    case GGML_OP_MUL:
    case GGML_OP_DIV:
-    case GGML_OP_ADD1:
-    case GGML_OP_ARANGE:
-    case GGML_OP_FILL:
    case GGML_OP_ADD_ID:
    case GGML_OP_CONCAT:
    case GGML_OP_UPSCALE:
@@ -12050,12 +11792,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
        case GGML_UNARY_OP_HARDSIGMOID:
        case GGML_UNARY_OP_HARDSWISH:
        case GGML_UNARY_OP_ABS:
-        case GGML_UNARY_OP_SOFTPLUS:
-        case GGML_UNARY_OP_STEP:
-        case GGML_UNARY_OP_ROUND:
-        case GGML_UNARY_OP_CEIL:
-        case GGML_UNARY_OP_FLOOR:
-        case GGML_UNARY_OP_TRUNC:
            buf = tensor->buffer;
            break;
        default:
@@ -13658,12 +13394,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_ABS:
-                case GGML_UNARY_OP_SOFTPLUS:
-                case GGML_UNARY_OP_STEP:
-                case GGML_UNARY_OP_ROUND:
-                case GGML_UNARY_OP_CEIL:
-                case GGML_UNARY_OP_FLOOR:
-                case GGML_UNARY_OP_TRUNC:
                    return ggml_is_contiguous(op->src[0]) &&
                           (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                           (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
@@ -13965,25 +13695,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_LOG:
            return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_ARGSORT:
-            {
-                if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) {
-                    return false;
-                }
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
-                // pipeline_argsort_large_f32 requires vulkan memory model.
-                if (device->vulkan_memory_model) {
-                    return true;
-                } else {
-                    return op->ne[0] <= (1 << device->max_workgroup_size_log2);
-                }
-            }
+            return op->ne[0] <= max_argsort_cols;
        case GGML_OP_UPSCALE:
        case GGML_OP_ACC:
        case GGML_OP_CONCAT:
-        case GGML_OP_ADD1:
-        case GGML_OP_ARANGE:
-        case GGML_OP_FILL:
        case GGML_OP_SCALE:
        case GGML_OP_PAD:
        case GGML_OP_ROLL:
@@ -14466,16 +14181,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
        } else if (tensor->op == GGML_OP_SCALE) {
            const float * params = (const float *)tensor->op_params;
            tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
-        } else if (tensor->op == GGML_OP_ADD1) {
-            tensor_clone = ggml_add1(ggml_ctx, src_clone[0], src_clone[1]);
-        } else if (tensor->op == GGML_OP_ARANGE) {
-            const float start = ggml_get_op_params_f32(tensor, 0);
-            const float stop = ggml_get_op_params_f32(tensor, 1);
-            const float step = ggml_get_op_params_f32(tensor, 2);
-            tensor_clone = ggml_arange(ggml_ctx, start, stop, step);
-        } else if (tensor->op == GGML_OP_FILL) {
-            const float value = ggml_get_op_params_f32(tensor, 0);
-            tensor_clone = ggml_fill(ggml_ctx, tensor_clone, value);
        } else if (tensor->op == GGML_OP_SQR) {
            tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
        } else if (tensor->op == GGML_OP_SQRT) {
@@ -14589,24 +14294,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
            case GGML_UNARY_OP_ABS:
                tensor_clone = ggml_abs(ggml_ctx, src_clone[0]);
                break;
-            case GGML_UNARY_OP_SOFTPLUS:
-                tensor_clone = ggml_softplus(ggml_ctx, src_clone[0]);
-                break;
-            case GGML_UNARY_OP_STEP:
-                tensor_clone = ggml_step(ggml_ctx, src_clone[0]);
-                break;
-            case GGML_UNARY_OP_ROUND:
-                tensor_clone = ggml_round(ggml_ctx, src_clone[0]);
-                break;
-            case GGML_UNARY_OP_CEIL:
-                tensor_clone = ggml_ceil(ggml_ctx, src_clone[0]);
-                break;
-            case GGML_UNARY_OP_FLOOR:
-                tensor_clone = ggml_floor(ggml_ctx, src_clone[0]);
-                break;
-            case GGML_UNARY_OP_TRUNC:
-                tensor_clone = ggml_trunc(ggml_ctx, src_clone[0]);
-                break;
            default:
                std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
                GGML_ABORT("fatal error");
@@ -1,28 +0,0 @@
-#version 450
-
-#extension GL_EXT_shader_16bit_storage : require
-
-#include "types.glsl"
-#include "generic_binary_head.glsl"
-
-const uint num_threads = 256;
-
-layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
-
-void main() {
-    uint idx = get_idx();
-
-    const uint num_iter = 2;
-
-    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
-        if (idx >= p.ne) {
-            continue;
-        }
-        uint i00, i01, i02, i03;
-        get_indices(idx, i00, i01, i02, i03);
-
-        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset()]));
-
-        idx += num_threads;
-    }
-}
@@ -1,20 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    // p.param1 = start, p.param2 = step
-    float value = p.param1 + p.param2 * float(i);
-    data_d[i] = D_TYPE(value);
-}
@@ -4,27 +4,28 @@
 #include "types.glsl"

 layout(constant_id = 0) const int BLOCK_SIZE = 1024;
-layout(constant_id = 1) const int NCOLS_PADDED_LOG2 = 10;
+layout(constant_id = 1) const int BLOCK_SIZE_LOG2 = 10;
 #define ASC 0

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 2) writeonly buffer D {int data_d[];};
+layout (binding = 1)          buffer D {int data_d[];};

 layout (push_constant) uniform parameter {
    uint ncols;
-    uint ncols_padded;
-    uint ncols_padded_log2;
    uint nrows;
    uint order;
-    uint outer_start;
-    uint outer_end;
-    uint inner_start;
-    uint inner_end;
 } p;

-shared ivec2 dst_row[BLOCK_SIZE];
+shared int dst_row[BLOCK_SIZE];
+shared A_TYPE a_sh[BLOCK_SIZE];
+
+void swap(uint idx0, uint idx1) {
+    int tmp = dst_row[idx0];
+    dst_row[idx0] = dst_row[idx1];
+    dst_row[idx1] = tmp;
+}

 void argsort(bool needs_bounds_check, const uint row) {
    // bitonic sort
@@ -33,10 +34,11 @@ void argsort(bool needs_bounds_check, const uint row) {
    const uint row_offset = row * p.ncols;

    // initialize indices
-    dst_row[col] = ivec2(col, floatBitsToInt(data_a[row_offset + col]));
+    dst_row[col] = col;
+    a_sh[col] = data_a[row_offset + col];
    barrier();

-    uint num_outer_loop_iters = NCOLS_PADDED_LOG2;
+    uint num_outer_loop_iters = BLOCK_SIZE_LOG2;
    [[unroll]] for (uint k = 2, outer_idx = 0; outer_idx < num_outer_loop_iters; k *= 2, outer_idx++) {
        uint num_inner_loop_iters = outer_idx + 1;
        [[unroll]] for (uint j = k / 2, inner_idx = 0; inner_idx < num_inner_loop_iters; j /= 2, inner_idx++) {
@@ -45,15 +47,14 @@ void argsort(bool needs_bounds_check, const uint row) {
            int idx_0 = (col & k) == 0 ? col : ixj;
            int idx_1 = (col & k) == 0 ? ixj : col;

-            ivec2 sh_idx_0 = dst_row[idx_0];
-            ivec2 sh_idx_1 = dst_row[idx_1];
-            bool idx_0_oob = needs_bounds_check ? sh_idx_0.x >= p.ncols : false;
-            bool idx_1_oob = needs_bounds_check ? sh_idx_1.x >= p.ncols : false;
+            int sh_idx_0 = dst_row[idx_0];
+            int sh_idx_1 = dst_row[idx_1];
+            bool idx_0_oob = needs_bounds_check ? sh_idx_0 >= p.ncols : false;
+            bool idx_1_oob = needs_bounds_check ? sh_idx_1 >= p.ncols : false;

            if ((idx_0_oob ||
-                (!idx_1_oob && intBitsToFloat(sh_idx_0.y) > intBitsToFloat(sh_idx_1.y))) && (ixj > col)) {
-                dst_row[idx_0] = sh_idx_1;
-                dst_row[idx_1] = sh_idx_0;
+                (!idx_1_oob && a_sh[sh_idx_0] > a_sh[sh_idx_1])) && (ixj > col)) {
+                swap(idx_0, idx_1);
            }

            barrier();
@@ -62,9 +63,9 @@ void argsort(bool needs_bounds_check, const uint row) {

    if (col < p.ncols) {
        if (p.order == ASC) {
-            data_d[row_offset + col] = dst_row[col].x;
+            data_d[row_offset + col] = dst_row[col];
        } else {
-            data_d[row_offset + p.ncols - col - 1] = dst_row[col].x;
+            data_d[row_offset + p.ncols - col - 1] = dst_row[col];
        }
    }
 }
@@ -1,114 +0,0 @@
-#version 450
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_KHR_memory_scope_semantics : enable
-#pragma use_vulkan_memory_model
-
-#include "types.glsl"
-
-layout(constant_id = 0) const int BLOCK_SIZE = 1024;
-layout(constant_id = 1) const int WG_UNROLL_FACTOR = 2;
-#define ASC 0
-
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) workgroupcoherent buffer B {ivec2 tmp_idx[];};
-layout (binding = 2) workgroupcoherent buffer D {int data_d[];};
-
-layout (push_constant) uniform parameter {
-    uint ncols;
-    uint ncols_padded;
-    uint ncols_padded_log2;
-    uint nrows;
-    uint order;
-    uint outer_start;
-    uint outer_end;
-    uint inner_start;
-    uint inner_end;
-} p;
-
-void argsort(bool needs_bounds_check, const uint row) {
-    // bitonic sort
-    int col = int(gl_GlobalInvocationID.x);
-    col = (col % BLOCK_SIZE) + (col / BLOCK_SIZE) * BLOCK_SIZE * WG_UNROLL_FACTOR;
-
-    const uint row_offset = row * p.ncols;
-    uint idx_offset = row * p.ncols_padded;
-
-    bool need_barrier = false;
-
-    // initialize indices
-    if (p.outer_start == 0 && p.inner_start == 0) {
-        [[unroll]] for (int u = 0; u < WG_UNROLL_FACTOR; ++u) {
-            uint c = u*BLOCK_SIZE + col;
-            if (c < p.ncols_padded) {
-                ivec2 v = ivec2(c, floatBitsToInt(data_a[row_offset + c]));
-                tmp_idx[idx_offset + c] = v;
-            }
-        }
-        need_barrier = true;
-    }
-
-    [[unroll]] for (uint outer_idx = p.outer_start, k = (2 << outer_idx); outer_idx < p.outer_end; k *= 2, outer_idx++) {
-        uint inner_end = min(p.inner_end, outer_idx + 1);
-        for (uint j = k >> (p.inner_start + 1), inner_idx = p.inner_start; inner_idx < inner_end; j /= 2, inner_idx++) {
-            if (need_barrier) {
-                controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup, gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease);
-            }
-            need_barrier = true;
-            [[unroll]] for (int u = 0; u < WG_UNROLL_FACTOR; ++u) {
-                int c = u*BLOCK_SIZE + col;
-                const int ixj = int(c ^ j);
-
-                if (ixj < c) {
-                    continue;
-                }
-
-                int idx_0 = (c & k) == 0 ? c : ixj;
-                int idx_1 = (c & k) == 0 ? ixj : c;
-
-                ivec2 sh_idx_0 = tmp_idx[idx_offset + idx_0];
-                ivec2 sh_idx_1 = tmp_idx[idx_offset + idx_1];
-                bool idx_0_oob = needs_bounds_check ? sh_idx_0.x >= p.ncols : false;
-                bool idx_1_oob = needs_bounds_check ? sh_idx_1.x >= p.ncols : false;
-
-                if ((idx_0_oob ||
-                    (!idx_1_oob && intBitsToFloat(sh_idx_0.y) > intBitsToFloat(sh_idx_1.y)))) {
-                    tmp_idx[idx_offset + idx_0] = sh_idx_1;
-                    tmp_idx[idx_offset + idx_1] = sh_idx_0;
-                }
-            }
-        }
-    }
-
-    if (p.outer_end == p.ncols_padded_log2 &&
-        p.inner_end >= p.ncols_padded_log2 + 1) {
-        controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup, gl_StorageSemanticsBuffer, gl_SemanticsAcquireRelease);
-        [[unroll]] for (int u = 0; u < WG_UNROLL_FACTOR; ++u) {
-            uint c = u*BLOCK_SIZE + col;
-            if (c < p.ncols) {
-                if (p.order == ASC) {
-                    data_d[row_offset + c] = tmp_idx[idx_offset + c].x;
-                } else {
-                    data_d[row_offset + p.ncols - c - 1] = tmp_idx[idx_offset + c].x;
-                }
-            }
-        }
-    }
-}
-
-void main() {
-    if (p.ncols == p.ncols_padded) {
-        uint row = gl_WorkGroupID.y;
-        while (row < p.nrows) {
-            argsort(false, row);
-            row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
-        }
-    } else {
-        uint row = gl_WorkGroupID.y;
-        while (row < p.nrows) {
-            argsort(true, row);
-            row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
-        }
-    }
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(ceil(x));
-}
@@ -1,67 +0,0 @@
-#version 450
-
-#include "types.glsl"
-#include "generic_unary_head.glsl"
-
-// workgroup does 32x32 tile, but uses 32x8 threads
-#define TILE_DIM 32
-layout(local_size_x = 32, local_size_y = 8, local_size_z = 1) in;
-
-shared uint sh[TILE_DIM][TILE_DIM + 1];
-
-void iter(uvec3 wg_id) {
-    const uint tile_col = wg_id.x;
-    const uint tile_row = wg_id.y;
-
-    const uint tid_col = gl_LocalInvocationID.x;
-    const uint tid_row = gl_LocalInvocationID.y;
-
-    const uint i2 = wg_id.z % p.ne12;
-    const uint i3 = wg_id.z / p.ne12;
-    const uint i02 = i2;
-    const uint i03 = i3;
-
-    // The workgroup does TILE_DIM x TILE_DIM, but swaps the LSBs of the
-    // src coords to make memory accesses contiguous, dst has tid.x in i0,
-    // src has tid.x in i01
-
-    [[unroll]] for (uint y = 0; y < 4; ++y) {
-        const uint i00 = tile_col * TILE_DIM + tid_row + 8 * y;
-        const uint i01 = tile_row * TILE_DIM + tid_col;
-        if (i00 < p.ne00 && i01 < p.ne01 && i02 < p.ne02 && i03 < p.ne03) {
-            const uint src_idx = i00 * p.nb00 + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
-            sh[tid_row + 8 * y][tid_col] = uint(data_a[get_aoffset() + src_idx]);
-        }
-    }
-
-    barrier();
-
-    [[unroll]] for (uint y = 0; y < 4; ++y) {
-        const uint i0 = tile_col * TILE_DIM + tid_col;
-        const uint i1 = tile_row * TILE_DIM + tid_row + 8 * y;
-        if (i0 < p.ne10 && i1 < p.ne11 && i2 < p.ne12 && i3 < p.ne13) {
-            const uint dst_idx = i0 * p.nb10 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
-            // load transposed
-            data_d[get_doffset() + dst_idx] = D_TYPE(sh[tid_col][tid_row + 8 * y]);
-        }
-    }
-}
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-
-void main() {
-    uint z = gl_WorkGroupID.z;
-    uint y = gl_WorkGroupID.y;
-    bool need_barrier = false;
-    for (uint z = gl_WorkGroupID.z; z < p.ne12 * p.ne13; z += gl_NumWorkGroups.z) {
-        for (uint y = gl_WorkGroupID.y; y < CEIL_DIV(p.ne11, TILE_DIM); y += gl_NumWorkGroups.y) {
-            for (uint x = gl_WorkGroupID.x; x < CEIL_DIV(p.ne10, TILE_DIM); x += gl_NumWorkGroups.x) {
-                if (need_barrier) {
-                    barrier();
-                }
-                need_barrier = true;
-                iter(uvec3(x, y, z));
-            }
-        }
-    }
-}
@@ -1,19 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    // p.param1 = fill value
-    data_d[i] = D_TYPE(p.param1);
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(floor(x));
-}
@@ -1,29 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    float result;
-    // Round halfway cases away from zero as roundf does.
-    if (x >= 0.0) {
-        result = floor(x + 0.5);
-    } else {
-        result = ceil(x - 0.5);
-    }
-    data_d[i] = D_TYPE(result);
-}
@@ -1,23 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    const float result = (x > 20.0f) ? x : log(1.0f + exp(x));
-    data_d[i] = D_TYPE(result);
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(x >= 0.0f ? 1.0f : 0.0f);
-}
@@ -1,22 +0,0 @@
-#version 450
-
-#include "generic_head.glsl"
-#include "types.glsl"
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float x = float(data_a[i]);
-    data_d[i] = D_TYPE(trunc(x));
-}
@@ -734,9 +734,6 @@ void process_shaders() {
    string_to_spv("cpy_f32_i32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
    string_to_spv("cpy_i32_f32", "copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});

-    string_to_spv("cpy_transpose_16", "copy_transpose.comp", {{"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
-    string_to_spv("cpy_transpose_32", "copy_transpose.comp", {{"A_TYPE", "uint"}, {"D_TYPE", "uint"}});
-
    for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
        string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
        string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
@@ -846,25 +843,6 @@ void process_shaders() {
    string_to_spv("abs_f16",        "abs.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("abs_f32",        "abs.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});

-    string_to_spv("softplus_f16",   "softplus.comp",    {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("softplus_f32",   "softplus.comp",    {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-
-    string_to_spv("add1_f16_f16",   "add1.comp",        {{"A_TYPE", "float16_t"},   {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
-    string_to_spv("add1_f16_f32",   "add1.comp",        {{"A_TYPE", "float16_t"},   {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
-    string_to_spv("add1_f32_f32",   "add1.comp",        {{"A_TYPE", "float"},       {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    string_to_spv("arange_f32",     "arange.comp",      {{"A_TYPE", "float"},       {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
-    string_to_spv("fill_f32",       "fill.comp",        {{"D_TYPE", "float"},       {"FLOAT_TYPE", "float"}});
-    string_to_spv("step_f16",       "step.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("step_f32",       "step.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("round_f16",      "round.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("round_f32",      "round.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("ceil_f16",       "ceil.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("ceil_f32",       "ceil.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("floor_f16",      "floor.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("floor_f32",      "floor.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-    string_to_spv("trunc_f16",      "trunc.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
-    string_to_spv("trunc_f32",      "trunc.comp",       {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
-
    for (auto rte : {false, true}) {
        std::string suffix = rte ? "_rte" : "";
        string_to_spv("geglu_f16" + suffix,      "geglu.comp",       {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"},   {"RTE16", rte ? "1" : "0"}});
@@ -911,7 +889,6 @@ void process_shaders() {
    string_to_spv("rope_vision_f16_rte", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});

    string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
-    string_to_spv("argsort_large_f32", "argsort_large.comp", {{"A_TYPE", "float"}});

    string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}}));
    string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
@@ -1,106 +0,0 @@
-[gMASK]<sop>
-{%- if tools -%}
-<|system|>
-# Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{% for tool in tools %}
-{{ tool | tojson(ensure_ascii=False) }}
-{% endfor %}
-</tools>
-
-For each function call, output the function name and arguments within the following XML format:
-<tool_call>{function-name}
-<arg_key>{arg-key-1}</arg_key>
-<arg_value>{arg-value-1}</arg_value>
-<arg_key>{arg-key-2}</arg_key>
-<arg_value>{arg-value-2}</arg_value>
-...
-</tool_call>{%- endif -%}
-{%- macro visible_text(content) -%}
-    {%- if content is string -%}
-        {{- content }}
-    {%- elif content is iterable and content is not mapping -%}
-        {%- for item in content -%}
-            {%- if item is mapping and item.type == 'text' -%}
-                {{- item.text }}
-            {%- elif item is string -%}
-                {{- item }}
-            {%- endif -%}
-        {%- endfor -%}
-    {%- else -%}
-        {{- content }}
-    {%- endif -%}
-{%- endmacro -%}
-{%- set ns = namespace(last_user_index=-1) %}
-{%- for m in messages %}
-    {%- if m.role == 'user' %}
-        {% set ns.last_user_index = loop.index0 -%}
-    {%- endif %}
-{%- endfor %}
-{% for m in messages %}
-{%- if m.role == 'user' -%}<|user|>
-{{ visible_text(m.content) }}
-{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}
-{%- elif m.role == 'assistant' -%}
-<|assistant|>
-{%- set reasoning_content = '' %}
-{%- set content = visible_text(m.content) %}
-{%- if m.reasoning_content is string %}
-    {%- set reasoning_content = m.reasoning_content %}
-{%- else %}
-    {%- if '</think>' in content %}
-        {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
-        {%- set content = content.split('</think>')[-1].lstrip('\n') %}
-    {%- endif %}
-{%- endif %}
-{%- if loop.index0 > ns.last_user_index and reasoning_content -%}
-{{ '\n<think>' + reasoning_content.strip() +  '</think>'}}
-{%- else -%}
-{{ '\n<think></think>' }}
-{%- endif -%}
-{%- if content.strip() -%}
-{{ '\n' + content.strip() }}
-{%- endif -%}
-{% if m.tool_calls %}
-{% for tc in m.tool_calls %}
-{%- if tc.function %}
-    {%- set tc = tc.function %}
-{%- endif %}
-{{ '\n<tool_call>' + tc.name }}
-{% set _args = tc.arguments or {} %}
-{% if _args is not mapping %}
-    {{ raise_exception("Invalid tool call arguments passed: " + _args | string) }}
-{% endif %}
-{% for k, v in _args.items() %}
-<arg_key>{{ k }}</arg_key>
-<arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>
-{% endfor %}
-</tool_call>{% endfor %}
-{% endif %}
-{%- elif m.role == 'tool' -%}
-{%- if m.content is string -%}
-{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
-    {{- '<|observation|>' }}
-{%- endif %}
-{{- '\n<tool_response>\n' }}
-{{- m.content }}
-{{- '\n</tool_response>' }}
-{%- else -%}
-<|observation|>{% for tr in m.content %}
-
-<tool_response>
-{{ tr.output if tr.output is defined else tr }}
-</tool_response>{% endfor -%}
-{% endif -%}
-{%- elif m.role == 'system' -%}
-<|system|>
-{{ visible_text(m.content) }}
-{%- endif -%}
-{%- endfor -%}
-{%- if add_generation_prompt -%}
-    <|assistant|>{{- '\n<think></think>' if (enable_thinking is defined and not enable_thinking) else '' -}}
-{%- endif -%}
@@ -1,64 +0,0 @@
-{% macro render_content(msg) -%}
-    {%- set c = msg.get('content') -%}
-    {%- if c is string -%}
-      {{ c }}
-    {%- elif c is not none -%}
-      {% for content in c -%}
-        {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
-          <|media_start|>image<|media_content|><|media_pad|><|media_end|>
-        {% else -%}
-          {{ content['text'] }}
-        {%- endif -%}
-      {%- endfor -%}
-    {%- endif -%}
-{%- endmacro %}
-
-{%- set tool_response_queue = namespace(ids=[]) -%}
-{%- set tool_call_counter = namespace(value=1) -%}
-
-{%- if tools -%}
-  <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>
-{%- endif -%}
-{% for message in messages %}
-  {%- if loop.first and messages[0]['role'] != 'system' -%}
-  <|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|>
-  {% endif %}
-
-  {%- set role_name =  message.get('name') or  message['role'] -%}
-  {%- if message['role'] == 'user' -%}
-    <|im_user|>{{role_name}}<|im_middle|>
-  {%- elif message['role'] == 'assistant' -%}
-    <|im_assistant|>{{role_name}}<|im_middle|>
-  {%- else -%}
-    <|im_system|>{{role_name}}<|im_middle|>
-  {%- endif -%}
-
-  {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
-    {{render_content(message)}}<|tool_calls_section_begin|>
-    {%- for tool_call in message['tool_calls'] -%}
-      {%- if tool_call['id'] is defined -%}
-        {%- set formatted_id = tool_call['id'] -%}
-      {%- else -%}
-        {%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
-        {%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
-      {%- endif -%}
-      {%- set _ = tool_response_queue.ids.append(formatted_id) -%}
-      <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
-    {%- endfor -%}
-    <|tool_calls_section_end|>
-  {%- elif message['role'] == 'tool' -%}
-    {%- if tool_response_queue.ids -%}
-      {%- set tool_call_id = tool_response_queue.ids.pop(0) -%}
-    {%- else -%}
-      {%- set tool_call_id = 'functions.' + message.get('name', 'unknown') + ':' + (tool_call_counter.value | string) -%}
-    {%- endif -%}
-    ## Return of {{ tool_call_id }}
-{{render_content(message)}}
-  {%- elif message['content'] is not none -%}
-    {{render_content(message)}}
-  {%- endif -%}
-  <|im_end|>
-{%- endfor -%}
-{%- if add_generation_prompt -%}
-  <|im_assistant|>assistant<|im_middle|>
-{%- endif -%}
@@ -1,112 +0,0 @@
-{%- macro render_content(msg) -%}
-    {%- set c = msg.get('content') -%}
-    {%- if c is string -%}
-      {{ c }}
-    {%- elif c is not none -%}
-      {% for content in c -%}
-        {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
-          <|media_start|>image<|media_content|><|media_pad|><|media_end|>
-        {% else -%}
-          {{ content['text'] }}
-        {%- endif -%}
-      {%- endfor -%}
-    {%- endif -%}
-{%- endmacro -%}
-
-{% macro set_roles(message) -%}
-  {%- set role_name =  message.get('name') or  message['role'] -%}
-  {%- if message['role'] == 'user' -%}
-    <|im_user|>{{role_name}}<|im_middle|>
-  {%- elif message['role'] == 'assistant' -%}
-    <|im_assistant|>{{role_name}}<|im_middle|>
-  {%- else -%}
-    <|im_system|>{{role_name}}<|im_middle|>
-  {%- endif -%}
-{%- endmacro -%}
-
-{%- set tool_response_queue = namespace(ids=[]) -%}
-{%- set tool_call_counter = namespace(value=1) -%}
-
-{%- macro render_toolcalls(message) -%}
-  <|tool_calls_section_begin|>
-  {%- for tool_call in message['tool_calls'] -%}
-    {%- if tool_call['id'] is defined -%}
-      {%- set formatted_id = tool_call['id'] -%}
-    {%- else -%}
-      {%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
-      {%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
-    {%- endif -%}
-    {%- set _ = tool_response_queue.ids.append(formatted_id) -%}
-    <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
-  {%- endfor -%}
-  <|tool_calls_section_end|>
-{%- endmacro -%}
-
-
-{# Find last non-tool-call assisitant message #}
-{%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
-{%- for idx in range(messages|length-1, -1, -1) -%}
-    {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
-        {%- set ns.last_non_tool_call_assistant_msg = idx -%}
-    {%- endif -%}
-{%- endfor -%}
-
-{# split all messages into history & suffix, reasoning_content in suffix should be reserved.#}
-{%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
-{%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
-
-{%- if tools -%}
-  <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>
-{%- endif -%}
-
-{%- if messages|length == 0 or messages[0]['role'] != 'system' -%}
-  <|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|>
-{%- endif -%}
-
-{%- for message in hist_msgs -%}
-  {{set_roles(message)}}
-  {%- if message['role'] == 'assistant' -%}
-    <think></think>{{render_content(message)}}
-    {%- if message.get('tool_calls') -%}
-      {{render_toolcalls(message)}}
-    {%- endif -%}
-  {%- elif message['role'] == 'tool' -%}
-    {%- if tool_response_queue.ids -%}
-      {%- set tool_call_id = tool_response_queue.ids.pop(0) -%}
-    {%- else -%}
-      {%- set tool_call_id = 'functions.' + message.get('name', 'unknown') + ':' + (tool_call_counter.value | string) -%}
-    {%- endif -%}
-    ## Return of {{ tool_call_id }}
-{{render_content(message)}}
-  {%- elif message['content'] is not none -%}
-    {{render_content(message)}}
-  {%- endif -%}
-  <|im_end|>
-{%- endfor -%}
-
-{%- for message in suffix_msgs -%}
-  {{set_roles(message)}}
-  {%- if message['role'] == 'assistant' -%}
-    {%- set rc = message.get('reasoning_content', '') -%}
-    <think>{{rc}}</think>{{render_content(message)}}
-    {%- if message.get('tool_calls') -%}
-     {{render_toolcalls(message)}}
-    {%- endif -%}
-  {%- elif message['role'] == 'tool' -%}
-    {%- if tool_response_queue.ids -%}
-      {%- set tool_call_id = tool_response_queue.ids.pop(0) -%}
-    {%- else -%}
-      {%- set tool_call_id = 'functions.' + message.get('name', 'unknown') + ':' + (tool_call_counter.value | string) -%}
-    {%- endif -%}
-    ## Return of {{ tool_call_id }}
-{{render_content(message)}}
-  {%- elif message['content'] is not none -%}
-    {{render_content(message)}}
-  {%- endif -%}
-  <|im_end|>
-{%- endfor -%}
-
-
-{%- if add_generation_prompt -%}
-  <|im_assistant|>assistant<|im_middle|>
-{%- endif -%}
@@ -1,54 +0,0 @@
-{%- if tools %}
-    {{- '<|im_start|>system\n' }}
-    {%- if messages[0]['role'] == 'system' %}
-        {{- messages[0]['content'] }}
-    {%- else %}
-        {{- 'You are MiMo, an AI assistant developed by Xiaomi.' }}
-    {%- endif %}
-    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
-    {%- for tool in tools %}
-        {{- "\n" }}
-        {{- tool | tojson }}
-    {%- endfor %}
-    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
-{%- else %}
-    {%- if messages[0]['role'] == 'system' %}
-        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
-    {%- else %}
-        {{- '<|im_start|>system\nYou are MiMo, an AI assistant developed by Xiaomi.<|im_end|>\n' }}
-    {%- endif %}
-{%- endif %}
-{%- for message in messages %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
-        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
-    {%- elif message.role == "assistant" %}
-        {{- '<|im_start|>' + message.role }}
-        {%- if message.content %}
-            {{- '\n' + message.content }}
-        {%- endif %}
-        {%- for tool_call in message.tool_calls %}
-            {%- if tool_call.function is defined %}
-                {%- set tool_call = tool_call.function %}
-            {%- endif %}
-            {{- '\n<tool_call>\n{"name": "' }}
-            {{- tool_call.name }}
-            {{- '", "arguments": ' }}
-            {{- tool_call.arguments | tojson }}
-            {{- '}\n</tool_call>' }}
-        {%- endfor %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
-            {{- '<|im_start|>user' }}
-        {%- endif %}
-        {{- '\n<tool_response>\n' }}
-        {{- message.content }}
-        {{- '\n</tool_response>' }}
-        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-            {{- '<|im_end|>\n' }}
-        {%- endif %}
-    {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-{%- endif %}
@@ -1,159 +0,0 @@
-{# ----------‑‑‑ special token variables ‑‑‑---------- #}
-{%- set toolcall_begin_token   = '<minimax:tool_call>'         -%}
-{%- set toolcall_end_token     = '</minimax:tool_call>'        -%}
-{#- Tool Rendering Functions ============================================== -#}
-{%- macro render_tool_namespace(namespace_name, tool_list) -%}
-{%- for tool in tool_list -%}
-<tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>
-{% endfor -%}
-{%- endmacro -%}
-{%- macro visible_text(content) -%}
-    {%- if content is string -%}
-        {{ content }}
-    {%- elif content is iterable and content is not mapping -%}
-        {%- for item in content -%}
-            {%- if item is mapping and item.type == 'text' -%}
-                {{- item.text }}
-            {%- elif item is string -%}
-                {{- item }}
-            {%- endif -%}
-        {%- endfor -%}
-    {%- else -%}
-        {{- content }}
-    {%- endif -%}
-{%- endmacro -%}
-{#- System Message Construction ============================================ -#}
-{%- macro build_system_message(system_message) -%}
-    {%- if system_message and system_message.content -%}
-        {{- visible_text(system_message.content) }}
-    {%- else -%}
-        {%- if model_identity is not defined -%}
-            {%- set model_identity = "You are a helpful assistant." -%}
-        {%- endif -%}
-        {{- model_identity }}
-    {%- endif -%}
-    
-    {#- Handle current_date -#}
-    {%- if system_message and system_message.current_date -%}
-        {{- '\n' ~ 'Current date: ' + system_message.current_date }}
-    {%- endif -%}
-    {#- Handle current_location -#}
-    {%- if system_message and system_message.current_location -%}
-        {{- '\n' ~ 'Current location: ' + system_message.current_location }}
-    {%- endif -%}
-{%- endmacro -%}
-{#- Main Template Logic ================================================= -#}
-{#- Extract system message (only first message if it's system) -#}
-{%- set system_message = none -%}
-{%- set conversation_messages = messages -%}
-{%- if messages and messages[0].role == "system" -%}
-    {%- set system_message = messages[0] -%}
-    {%- set conversation_messages = messages[1:] -%}
-{%- endif -%}
-{#- Get the last user message turn, for interleved thinking -#}
-{%- set ns = namespace(last_user_index=-1) %}
-{% for m in conversation_messages %}
-    {%- if m.role == 'user' %}
-        {% set ns.last_user_index = loop.index0 -%}
-    {%- endif %}
-{%- endfor %}
-{#- Render system message -#}
-{{- ']~!b[' ~ ']~b]system' ~ '\n' }}
-{{- build_system_message(system_message) }}
-{#- Render tools if available -#}
-{%- if tools -%}
-    {{- '\n\n' ~ '# Tools' ~ '\n' ~ 'You may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:' ~ '\n' }}
-    {{- '\n' ~ '<tools>' ~ '\n' }}
-    {{- render_tool_namespace("functions", tools) }}
-    {{- '</tools>' ~ '\n\n' }}
-{{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\n' }}
-{{- '\n' ~ toolcall_begin_token }}
-<invoke name="tool-name-1">
-<parameter name="param-key-1">param-value-1</parameter>
-<parameter name="param-key-2">param-value-2</parameter>
-...
-</invoke>
-{{- '\n' ~ toolcall_end_token }}
-{%- endif -%}
-{{- '[e~[\n' }}
-
-{#- Render messages -#}
-{%- set last_tool_call = namespace(name=none) -%}
-{%- for message in conversation_messages -%}
-    {%- if message.role == 'assistant' -%}
-        {#- Only render reasoning_content if no user message follows -#}
-        {{- ']~b]ai' ~ '\n' }}
-
-        {%- set reasoning_content = '' %}
-        {%- set content = visible_text(message.content) %}
-        {%- if message.reasoning_content is string %}
-            {%- set reasoning_content = message.reasoning_content %}
-        {%- else %}
-            {%- if '</think>' in content %}
-                {%- set reasoning_content = content.split('</think>')[0].strip('\n').split('<think>')[-1].strip('\n') %}
-                {%- set content = content.split('</think>')[-1].strip('\n') %}
-            {%- endif %}
-        {%- endif %}
-        {%- if reasoning_content and loop.index0 > ns.last_user_index -%}
-            {{- '<think>' ~ '\n' ~ reasoning_content ~ '\n' ~ '</think>' ~ '\n\n' }}
-        {%- endif -%}
-        {%- if content -%}
-            {{- content }}
-        {%- endif -%}
-        {%- if message.tool_calls -%}
-            {{- '\n' ~ toolcall_begin_token ~ '\n' }}
-
-            {%- for tool_call in message.tool_calls -%}
-                {%- if tool_call.function %}
-                    {%- set tool_call = tool_call.function %}
-                {%- endif %}
-                {{- '<invoke name="' + tool_call.name + '">' }}
-                {% set _args = tool_call.arguments %}
-                {%- for k, v in _args.items() %}
-                {{- '<parameter name="' + k + '">' }}
-                {{- v | tojson(ensure_ascii=False) if v is not string else v }}
-                {{- '</parameter>' }}
-                {% endfor %}
-                {{- '</invoke>' ~ '\n' }}
-            {%- endfor -%}
-            
-            {{- toolcall_end_token}}
-            {%- set last_tool_call.name = message.tool_calls[-1].function.name -%}
-        {%- else -%}
-            {%- set last_tool_call.name = none -%}
-        {%- endif -%}
-        {{- '[e~[' ~ '\n' }}
-        
-    {%- elif message.role == 'tool' -%}
-    {%- if last_tool_call.name is none -%}
-        {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
-    {%- endif -%}
-    {%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}
-        {{- ']~b]tool' }}
-    {%- endif -%}
-    {%- if message.content is string -%}
-        {{- '\n<response>' }}
-        {{- message.content }}
-        {{- '</response>' }}
-    {%- else -%}
-        {%- for tr in message.content -%}
-            {{- '\n<response>' }}
-            {{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}
-            {{- '\n</response>' }}
-        {%- endfor -%}
-    {%- endif -%}
-    {%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}
-        {{- '[e~[\n' -}}
-    {%- endif -%}
-        
-    {%- elif message.role == 'user' -%}
-        {{- ']~b]user' ~ '\n' }}
-        {{- visible_text(message.content) }}
-        {{- '[e~[' ~ '\n' }}
-    {%- endif -%}
-{%- endfor -%}
-
-{#- Generation prompt -#}
-{%- if add_generation_prompt -%}
-{{- ']~b]ai' ~ '\n' ~ '<think>' ~ '\n' }}
-{%- endif -%}
@@ -1,117 +0,0 @@
-{% macro render_extra_keys(json_dict, handled_keys) %}
-    {%- if json_dict is mapping %}
-        {%- for json_key in json_dict if json_key not in handled_keys %}
-            {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
-                {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
-            {%- else %}
-                {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
-            {%- endif %}
-        {%- endfor %}
-    {%- endif %}
-{% endmacro %}
-
-{%- if messages[0]["role"] == "system" %}
-    {%- set system_message = messages[0]["content"] %}
-    {%- set loop_messages = messages[1:] %}
-{%- else %}
-    {%- set loop_messages = messages %}
-{%- endif %}
-
-{%- if not tools is defined %}
-    {%- set tools = [] %}
-{%- endif %}
-
-{%- if system_message is defined %}
-    {{- "<|im_start|>system\n" + system_message }}
-{%- else %}
-    {%- if tools is iterable and tools | length > 0 %}
-        {{- "<|im_start|>system\nYou are Qwen, a helpful AI assistant that can interact with a computer to solve tasks." }}
-    {%- endif %}
-{%- endif %}
-{%- if tools is iterable and tools | length > 0 %}
-    {{- "\n\n# Tools\n\nYou have access to the following functions:\n\n" }}
-    {{- "<tools>" }}
-    {%- for tool in tools %}
-        {%- if tool.function is defined %}
-            {%- set tool = tool.function %}
-        {%- endif %}
-        {{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
-        {%- if tool.description is defined %}
-            {{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
-        {%- endif %}
-        {{- '\n<parameters>' }}
-        {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
-            {%- for param_name, param_fields in tool.parameters.properties|items %}
-                {{- '\n<parameter>' }}
-                {{- '\n<name>' ~ param_name ~ '</name>' }}
-                {%- if param_fields.type is defined %}
-                    {{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
-                {%- endif %}
-                {%- if param_fields.description is defined %}
-                    {{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
-                {%- endif %}
-                {%- set handled_keys = ['name', 'type', 'description'] %}
-                {{- render_extra_keys(param_fields, handled_keys) }}
-                {{- '\n</parameter>' }}
-            {%- endfor %}
-        {%- endif %}
-        {% set handled_keys = ['type', 'properties'] %}
-        {{- render_extra_keys(tool.parameters, handled_keys) }}
-        {{- '\n</parameters>' }}
-        {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
-        {{- render_extra_keys(tool, handled_keys) }}
-        {{- '\n</function>' }}
-    {%- endfor %}
-    {{- "\n</tools>" }}
-    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
-{%- endif %}
-{%- if system_message is defined %}
-    {{- '<|im_end|>\n' }}
-{%- else %}
-    {%- if tools is iterable and tools | length > 0 %}
-        {{- '<|im_end|>\n' }}
-    {%- endif %}
-{%- endif %}
-{%- for message in loop_messages %}
-    {%- if message.role == "assistant" and message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
-        {{- '<|im_start|>' + message.role }}
-        {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
-            {{- '\n' + message.content | trim + '\n' }}
-        {%- endif %}
-        {%- for tool_call in message.tool_calls %}
-            {%- if tool_call.function is defined %}
-                {%- set tool_call = tool_call.function %}
-            {%- endif %}
-            {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
-            {%- if tool_call.arguments is defined %}
-                {%- for args_name, args_value in tool_call.arguments|items %}
-                    {{- '<parameter=' + args_name + '>\n' }}
-                    {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
-                    {{- args_value }}
-                    {{- '\n</parameter>\n' }}
-                {%- endfor %}
-            {%- endif %}
-            {{- '</function>\n</tool_call>' }}
-        {%- endfor %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "user" or message.role == "system" or message.role == "assistant" %}
-        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if loop.previtem and loop.previtem.role != "tool" %}
-            {{- '<|im_start|>user\n' }}
-        {%- endif %}
-        {{- '<tool_response>\n' }}
-        {{- message.content }}
-        {{- '\n</tool_response>\n' }}
-        {%- if not loop.last and loop.nextitem.role != "tool" %}
-            {{- '<|im_end|>\n' }}
-        {%- elif loop.last %}
-            {{- '<|im_end|>\n' }}
-        {%- endif %}
-    {%- else %}
-        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
-    {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-{%- endif %}
@@ -1,126 +0,0 @@
-{# Unsloth template fixes #}
-{%- set available_tools_string = '' -%}
-{%- set thought_instructions = '' -%}
-{%- set add_tool_id = true -%}
-{%- set tool_output_format = "default" -%}
-{%- if tools is not none and tools|length > 0 -%}
-    {%- set available_tools_string -%}
-You are provided with function signatures within <available_tools></available_tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about the arguments. You should infer the argument values from previous user responses and the system message. Here are the available tools:
-<available_tools>
-{% for tool in tools %}
-{{ tool|string }}
-{% endfor %}
-</available_tools>
-{%- endset -%}
-{%- endif -%}
-{%- if tool_output_format is none or tool_output_format == "default" -%}
-{%- set tool_output_instructions -%}
-Return all function calls as a list of json objects within <tool_call></tool_call> XML tags. Each json object should contain a function name and arguments as follows:
-<tool_calls>[{"name": <function-name-1>, "arguments": <args-dict-1>}, {"name": <function-name-2>, "arguments": <args-dict-2>},...]</tool_calls>
-{%- endset -%}
-{%- elif tool_output_format == "yaml" -%}
-{%- set tool_output_instructions -%}
-Return all function calls as a list of yaml objects within <tool_call></tool_call> XML tags. Each yaml object should contain a function name and arguments as follows:
-<tool_calls>
- name: <function-name-1>
-  arguments: <args-dict-1>
- name: <function-name-2>
-  arguments: <args-dict-2>
-...
-</tool_calls>
-{%- endset -%}
-{%- endif -%}
-{%- if add_thoughts -%}
-{%- set thought_instructions -%}
-Prior to generating the function calls, you should generate the reasoning for why you're calling the function. Please generate these reasoning thoughts between <thinking> and </thinking> XML tags.
-{%- endset -%}
-{%- endif -%}
-{{- bos_token -}}
-{%- set reasoning_prompt='You are a thoughtful and systematic AI assistant built by ServiceNow Language Models (SLAM) lab. Before providing an answer, analyze the problem carefully and present your reasoning step by step. After explaining your thought process, provide the final solution in the following format: [BEGIN FINAL RESPONSE] ... [END FINAL RESPONSE].' -%}
-{%- if messages[0]['role'] != 'system' and tools is not none and tools|length > 0 -%}
-    {{- '<|system|>\n' + reasoning_prompt + available_tools_string + "\n" + tool_output_instructions + '\n<|end|>\n' -}}
-{%- endif -%}
-{%- if messages|selectattr('role', 'equalto', 'system')|list|length == 0 -%}
-{{- '<|system|>\n' + reasoning_prompt + '\n<|end|>\n' -}}
-{%- endif -%}
-{%- for message in messages -%}
-    {%- if message['role'] == 'user' -%}
-        {{- '<|user|>\n' }}
-        {%- if message['content'] is not string %}
-            {%- for chunk in message['content'] %}
-                {%- if chunk['type'] == 'text' %}
-                    {{- chunk['text'] }}
-                {%- elif chunk['type'] == 'image' or chunk['type'] == 'image_url'%}
-                    {{- '[IMG]' }}
-                {%- else %}
-                    {{- raise_exception('Unrecognized content type!') }}
-                {%- endif %}
-            {%- endfor %}
-        {%- else %}
-            {{- message['content'] }}
-        {%- endif %}
-        {{- '\n<|end|>\n' }}
-    {%- elif message['role'] == 'content' -%}
-        {%- if message['content'] is not string %}
-            {{- '<|content|>\n' + message['content'][0]['text'] + '\n<|end|>\n' -}}
-        {%- else %}
-            {{- '<|content|>\n' + message['content'] + '\n<|end|>\n' -}}
-        {%- endif -%}
-    {%- elif message['role'] == 'system' -%}
-        {%- if message['content'] is not none and message['content']|length > 0 %}
-            {%- if message['content'] is string %}
-                {%- set system_message = message['content'] %}
-            {%- else %}
-                {%- set system_message = message['content'][0]['text'] %}
-            {%- endif %}
-        {%- else %}
-            {%- set system_message = '' %}
-        {%- endif %}
-        {%- if tools is not none and tools|length > 0 -%}
-            {{- '<|system|>\n' + reasoning_prompt + system_message + '\n' + available_tools_string + '\n<|end|>\n' -}}
-        {%- else -%}
-            {{- '<|system|>\n' + reasoning_prompt + system_message + '\n<|end|>\n' -}}
-        {%- endif -%}
-    {%- elif message['role'] == 'assistant' -%}
-        {%- if loop.last -%}
-            {%- set add_tool_id = false -%}
-        {%- endif -%}
-        {{- '<|assistant|>\n' -}}
-        {%- if message['content'] is not none and message['content']|length > 0 -%}
-            {%- if message['content'] is not string and message['content'][0]['text'] is not none %}
-                {{- message['content'][0]['text'] }}
-            {%- else %}
-                {{- message['content'] -}}
-            {%- endif -%}
-        {%- elif message['chosen'] is not none and message['chosen']|length > 0 -%}
-            {{- message['chosen'][0] -}}
-        {%- endif -%}
-        {%- if add_thoughts and 'thought' in message and message['thought'] is not none -%}
-            {{- '<thinking>' + message['thought'] + '</thinking>' -}}
-        {%- endif -%}
-        {%- if message['tool_calls'] is not none and message['tool_calls']|length > 0 -%}
-            {{- '\n<tool_calls>[' -}}
-            {%- for tool_call in message["tool_calls"] -%}
-                {{- '{"name": "' + tool_call['function']['name'] + '", "arguments": ' + tool_call['function']['arguments']|string -}}
-                {%- if add_tool_id == true -%}
-                    {{- ', "id": "' + tool_call['id'] + '"' -}}
-                {%- endif -%}
-                {{- '}' -}}
-                {%- if not loop.last -%}{{- ', ' -}}{%- endif -%}
-            {%- endfor -%}
-            {{- ']</tool_calls>' -}}
-        {%- endif -%}
-        {{- '\n<|end|>\n' + eos_token -}}
-    {%- elif message['role'] == 'tool' -%}
-        {%- if message['content'] is string %}
-            {%- set tool_message = message['content'] %}
-        {%- else %}
-            {%- set tool_message = message['content'][0]['text'] %}
-        {%- endif -%}
-        {{- '<|tool_result|>\n' + tool_message|string + '\n<|end|>\n' -}}
-    {%- endif -%}
-    {%- if loop.last and add_generation_prompt and message['role'] != 'assistant' -%}
-        {{- '<|assistant|>\n' -}}
-    {%- endif -%}
-{%- endfor -%}
-{# Copyright 2025-present Unsloth. Apache 2.0 License. #}
@@ -1281,7 +1281,6 @@ struct llm_tokenizer_plamo2 : llm_tokenizer {

        // Build suffix list in lexicographical order of reversed strings
        std::vector<std::string> suffixes;
-        suffixes.reserve(suffix_to_score.size() + 1);
        for (const auto & pair : suffix_to_score) {
            suffixes.push_back(pair.first);
        }
@@ -2776,34 +2776,24 @@ struct test_cpy : public test_case {
 struct test_cont : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
-    bool use_view_slice;

    std::string vars() override {
-        return VARS_TO_STR3(type, ne, use_view_slice);
+        return VARS_TO_STR2(type, ne);
    }

    test_cont(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            bool use_view_slice = false)
-        : type(type), ne(ne), use_view_slice(use_view_slice) {}
+            std::array<int64_t, 4> ne = {10, 10, 10, 1})
+        : type(type), ne(ne) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_param(src);
        ggml_set_name(src, "src");

+        src = ggml_transpose(ctx, src);
+        ggml_set_name(src, "src_transposed");

-        ggml_tensor * dst;
-        if (use_view_slice) {
-            dst = ggml_view_4d(ctx, src, src->ne[0], 1, src->ne[2], src->ne[3],
-                src->nb[1], src->nb[2], src->nb[3], src->nb[0] * (src->ne[1] - 1));
-            ggml_set_name(dst, "src_view_slice");
-        } else {
-            dst = ggml_transpose(ctx, src);
-            ggml_set_name(dst, "src_transposed");
-        }
-
-        ggml_tensor * out = ggml_cont(ctx, dst);
+        ggml_tensor * out = ggml_cont(ctx, src);
        ggml_set_name(out, "out");

        return out;
@@ -6955,17 +6945,16 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));

-    for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
-        for (bool use_view_slice : { true, false }) {
-            for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5},
-                {2, 3, 5, 7}, {1, 4, 4, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) {
-                if (use_view_slice && (type_dst == GGML_TYPE_F16 || type_dst == GGML_TYPE_BF16)) {
-                    continue; // TODO: add after WebGPU is fixed
-                }
-                test_cases.emplace_back(new test_cont(type_dst, ne, use_view_slice));
-            }
-        }
-    }
+    test_cases.emplace_back(new test_cont());
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));

    auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
        for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {
@@ -7026,7 +7015,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16));

    test_cases.emplace_back(new test_add1());
-    test_cases.emplace_back(new test_add1(GGML_TYPE_F32, {1024, 1024, 1, 1}));
    test_cases.emplace_back(new test_scale());
    test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
    test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f, true)); // inplace test
@@ -7366,13 +7354,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        test_cases.emplace_back(new test_clamp     (type, {7, 1, 5, 3}));
        test_cases.emplace_back(new test_leaky_relu(type, {7, 1, 5, 3}));
        test_cases.emplace_back(new test_floor     (type, {7, 1, 5, 3}));
-        test_cases.emplace_back(new test_floor     (type, { 1024, 1024, 1, 1 }));
        test_cases.emplace_back(new test_ceil      (type, {7, 1, 5, 3}));
-        test_cases.emplace_back(new test_ceil      (type, { 1024, 1024, 1, 1 }));
        test_cases.emplace_back(new test_round     (type, {7, 1, 5, 3}));
-        test_cases.emplace_back(new test_round     (type, { 1024, 1024, 1, 1 }));
        test_cases.emplace_back(new test_trunc     (type, {7, 1, 5, 3}));
-        test_cases.emplace_back(new test_trunc     (type, { 1024, 1024, 1, 1 }));
    }

    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
@@ -7517,15 +7501,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    }

    for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
-        for (uint32_t i = 4; i <= 1024*1024; i *= 2) {
-            test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i-1, 1, 1, 1}));
-            test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i, 1, 1, 1}));
-        }
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1023, 2, 1, 3}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 2, 1, 3}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 2, 1, 3}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16384, 1, 1, 1}, order)); // many backends only handle up to 1024
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2047, 2, 1, 3}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2048, 2, 1, 3}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2049, 2, 1, 3}, order));
@@ -7574,7 +7556,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
    test_cases.emplace_back(new test_roll());
    test_cases.emplace_back(new test_arange());
-    test_cases.emplace_back(new test_arange(GGML_TYPE_F32, 0.0f, 1048576.0f, 1.0f));
    test_cases.emplace_back(new test_timestep_embedding());
    test_cases.emplace_back(new test_leaky_relu());

@@ -7602,7 +7583,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_fill(0.0f));
    test_cases.emplace_back(new test_fill(2.0f, GGML_TYPE_F32, { 303, 207, 11, 3 }));
    test_cases.emplace_back(new test_fill(-152.0f, GGML_TYPE_F32, { 800, 600, 4, 4 }));
-    test_cases.emplace_back(new test_fill(3.5f, GGML_TYPE_F32, { 2048, 512, 2, 2 }));

    test_cases.emplace_back(new test_solve_tri());
    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 11, 11, 1, 1 }, { 5, 11, 1, 1 }));
@@ -13,9 +13,14 @@ endif()

 set(TARGET_SRCS
    server.cpp
-    utils.hpp
    server-http.cpp
    server-http.h
+    server-task.cpp
+    server-task.h
+    server-queue.cpp
+    server-queue.h
+    server-common.cpp
+    server-common.h
 )
 set(PUBLIC_ASSETS
    index.html.gz
@@ -0,0 +1,349 @@
+#pragma once
+
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "chat.h"
+#include "mtmd.h"
+
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
+#include <string>
+#include <vector>
+#include <cinttypes>
+
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
+
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
+using json = nlohmann::ordered_json;
+
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+
+#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+using raw_buffer = std::vector<uint8_t>;
+
+template <typename T>
+static T json_value(const json & body, const std::string & key, const T & default_value) {
+    // Fallback null to default value
+    if (body.contains(key) && !body.at(key).is_null()) {
+        try {
+            return body.at(key);
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
+            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
+            return default_value;
+        }
+    } else {
+        return default_value;
+    }
+}
+
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+    ERROR_TYPE_INVALID_REQUEST,
+    ERROR_TYPE_AUTHENTICATION,
+    ERROR_TYPE_SERVER,
+    ERROR_TYPE_NOT_FOUND,
+    ERROR_TYPE_PERMISSION,
+    ERROR_TYPE_UNAVAILABLE, // custom error
+    ERROR_TYPE_NOT_SUPPORTED, // custom error
+    ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error
+};
+
+// thin wrapper around common_grammar_trigger with (de)serialization functions
+struct server_grammar_trigger {
+    common_grammar_trigger value;
+
+    server_grammar_trigger() = default;
+    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
+    server_grammar_trigger(const json & in) {
+        value.type = (common_grammar_trigger_type) in.at("type").get<int>();
+        value.value = in.at("value").get<std::string>();
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            value.token = (llama_token) in.at("token").get<int>();
+        }
+    }
+
+    json to_json() const {
+        json out {
+            {"type", (int) value.type},
+            {"value", value.value},
+        };
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            out["token"] = (int) value.token;
+        }
+        return out;
+    }
+};
+
+json format_error_response(const std::string & message, const enum error_type type);
+
+//
+// random string / id
+//
+
+std::string random_string();
+std::string gen_chatcmplid();
+std::string gen_tool_call_id();
+
+//
+// lora utils
+//
+
+// check whether the given lora set has only aloras activated (empty => false)
+bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);
+
+// if the two sets of loras are different, they require a cache clear unless the
+// change is only from aloras to aloras.
+bool lora_should_clear_cache(
+        const std::vector<common_adapter_lora_info> & current,
+        const std::vector<common_adapter_lora_info> & next);
+
+std::vector<common_adapter_lora_info> parse_lora_request(
+        const std::vector<common_adapter_lora_info> & lora_base,
+        const json & data);
+
+bool are_lora_equal(
+        const std::vector<common_adapter_lora_info> & l1,
+        const std::vector<common_adapter_lora_info> & l2);
+
+// get the ids of all enabled loras
+std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);
+
+//
+// server_tokens
+//
+
+/**
+ * server_tokens is a helper to manage the input tokens and image for the server.
+ * it is made this way to simplify the logic of KV cache management.
+ */
+struct server_tokens {
+    bool has_mtmd = false;
+
+private: // disallow accessing these members directly, risking out-of-sync
+
+    // map a **start** index in tokens to the image chunk
+    // note: the order need to be in-sync with tokens
+    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
+
+    // list of tokens
+    //   if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk
+    //   otherwise, it is a normal text token
+    // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
+    // note(2): for M-RoPE, an image can occupy different number of pos; do not assume 1-to-1 mapping tokens <-> pos
+    llama_tokens tokens;
+
+    // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
+    //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
+    // idx  0   1   2   3   4   5      6      7      8      9      10
+    // pos  0   1   2   3   4   5      5      5      7      7      7
+    // map_idx_to_media will contain: {5, img0}, {8, img1}
+
+public:
+    server_tokens() = default;
+    ~server_tokens() = default;
+
+    // Prevent copying
+    // TODO: server_tokens should be copyable - remove this:
+    server_tokens(const server_tokens&) = delete;
+    server_tokens& operator=(const server_tokens&) = delete;
+
+    // Allow moving (usually implicitly generated if members are movable)
+    server_tokens(server_tokens&&) = default;
+    server_tokens& operator=(server_tokens&&) = default;
+
+    // Allow accessing elements using [] operator
+    llama_token operator[](size_t index) { return tokens[index]; }
+    const llama_token& operator[](size_t index) const { return tokens[index]; }
+
+    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
+    server_tokens(const llama_tokens & tokens, bool has_mtmd);
+
+    // for debugging
+    std::string str() const;
+
+    llama_pos pos_next() const;
+    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
+
+    void push_back(llama_token tok);
+
+    // will create a copy of the chunk if it contains non-text data
+    void push_back(const mtmd_input_chunk * chunk);
+
+    // appends server tokens, updates the media map. copies media chunks.
+    void push_back(server_tokens & tokens);
+
+    // for compatibility with context shift and prompt truncation
+    void insert(const llama_tokens & inp_tokens);
+
+    // for compatibility with speculative decoding, ctx shift, slot save/load
+    const llama_tokens & get_text_tokens() const;
+
+    // for compatibility with speculative decoding
+    void set_token(llama_pos pos, llama_token id);
+
+    size_t size() const { return tokens.size(); }
+
+    bool empty() const { return tokens.empty(); }
+
+    void clear() {
+        map_idx_to_media.clear();
+        tokens.clear();
+    }
+
+    void keep_first(size_t n);
+
+    std::string detokenize(const llama_context * ctx, bool special) const;
+
+    size_t get_common_prefix(const server_tokens & b) const;
+
+    // make sure all text tokens are within the vocab range
+    bool validate(const struct llama_context * ctx) const;
+
+    // encode and decode the image chunk
+    int32_t process_chunk(
+                llama_context * ctx,
+                mtmd_context * mctx,
+                size_t idx,
+                llama_pos pos,
+                int32_t seq_id,
+                size_t & n_tokens_out) const;
+};
+
+
+//
+// tokenizer and input processing utils
+//
+
+bool json_is_array_of_numbers(const json & data);
+
+// is array having BOTH numbers & strings?
+bool json_is_array_of_mixed_numbers_strings(const json & data);
+
+// does array have any individual integers/tokens?
+bool json_is_array_and_contains_numbers(const json & data);
+
+// get value by path(key1 / key2)
+json json_get_nested_values(const std::vector<std::string> & paths, const json & js);
+
+/**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);
+
+// return the last index of character that can form a valid string
+// if the last character is potentially cut in half, return the index before the cut
+// if validate_utf8(text) == text.size(), then the whole text is valid utf8
+size_t validate_utf8(const std::string& text);
+
+// process mtmd prompt, return the server_tokens containing both text tokens and media chunks
+server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
+
+/**
+ * break the input "prompt" object into multiple prompt if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, 56], [78, 90, 12]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
+ */
+std::vector<server_tokens> tokenize_input_prompts(
+                                        const llama_vocab * vocab,
+                                        mtmd_context * mctx,
+                                        const json & json_prompt,
+                                        bool add_special,
+                                        bool parse_special);
+
+//
+// OAI utils
+//
+
+// used by /completions endpoint
+json oaicompat_completion_params_parse(const json & body);
+
+struct oaicompat_parser_options {
+    bool use_jinja;
+    bool prefill_assistant;
+    common_reasoning_format reasoning_format;
+    std::map<std::string,std::string> chat_template_kwargs;
+    common_chat_templates * tmpls;
+    bool allow_image;
+    bool allow_audio;
+    bool enable_thinking = true;
+};
+
+// used by /chat/completions endpoint
+json oaicompat_chat_params_parse(
+    json & body, /* openai api json semantics */
+    const oaicompat_parser_options & opt,
+    std::vector<raw_buffer> & out_files);
+
+// TODO: move it to server-task.cpp
+json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
+
+// TODO: move it to server-task.cpp
+json format_response_rerank(
+        const json & request,
+        const json & ranks,
+        bool is_tei_format,
+        std::vector<std::string> & texts,
+        int top_n);
+
+//
+// other utils
+//
+
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
+
+std::string safe_json_to_str(const json & data);
+
+std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);
+
+// format incomplete utf-8 multibyte character for output
+std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);
+
+// format server-sent event (SSE), return the formatted string to send
+// note: if data is a json array, it will be sent as multiple events, one per item
+std::string format_sse(const json & data);
+
+bool is_valid_utf8(const std::string & str);
+
+//
+// formatting output responses
+// TODO: move these to server-task.cpp
+//
+
+llama_tokens format_prompt_infill(
+        const llama_vocab * vocab,
+        const json & input_prefix,
+        const json & input_suffix,
+        const json & input_extra,
+        const int n_batch,
+        const int n_predict,
+        const int n_ctx,
+        const bool spm_infill,
+        const llama_tokens & tokens_prompt);
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
+server_tokens format_prompt_rerank(
+        const struct llama_model * model,
+        const struct llama_vocab * vocab,
+        mtmd_context * mctx,
+        const std::string & query,
+        const std::string & doc);
@@ -1,6 +1,6 @@
-#include "utils.hpp"
 #include "common.h"
 #include "server-http.h"
+#include "server-common.h"

 #include <cpp-httplib/httplib.h>

@@ -0,0 +1,268 @@
+#include "server-task.h"
+#include "server-queue.h"
+
+#include "log.h"
+
+#include <chrono>
+
+#define QUE_INF(fmt, ...) LOG_INF("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_WRN(fmt, ...) LOG_WRN("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_ERR(fmt, ...) LOG_ERR("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_DBG(fmt, ...) LOG_DBG("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+#define RES_INF(fmt, ...) LOG_INF("res  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define RES_WRN(fmt, ...) LOG_WRN("res  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define RES_ERR(fmt, ...) LOG_ERR("res  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define RES_DBG(fmt, ...) LOG_DBG("res  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+//
+// server_queue
+//
+
+int server_queue::post(server_task && task, bool front) {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    GGML_ASSERT(task.id != -1);
+    // if this is cancel task make sure to clean up pending tasks
+    if (task.type == SERVER_TASK_TYPE_CANCEL) {
+        cleanup_pending_task(task.id_target);
+    }
+    const int task_id = task.id;
+    QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
+    if (front) {
+        queue_tasks.push_front(std::move(task));
+    } else {
+        queue_tasks.push_back(std::move(task));
+    }
+    condition_tasks.notify_one();
+    return task_id;
+}
+
+int server_queue::post(std::vector<server_task> && tasks, bool front) {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    for (auto & task : tasks) {
+        if (task.id == -1) {
+            task.id = id++;
+        }
+        // if this is cancel task make sure to clean up pending tasks
+        if (task.type == SERVER_TASK_TYPE_CANCEL) {
+            cleanup_pending_task(task.id_target);
+        }
+        QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
+        if (front) {
+            queue_tasks.push_front(std::move(task));
+        } else {
+            queue_tasks.push_back(std::move(task));
+        }
+    }
+    condition_tasks.notify_one();
+    return 0;
+}
+
+void server_queue::defer(server_task && task) {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    QUE_DBG("defer task, id = %d\n", task.id);
+    queue_tasks_deferred.push_back(std::move(task));
+    condition_tasks.notify_one();
+}
+
+int server_queue::get_new_id() {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    int new_id = id++;
+    return new_id;
+}
+
+void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
+    callback_new_task = std::move(callback);
+}
+
+void server_queue::on_update_slots(std::function<void(void)> callback) {
+    callback_update_slots = std::move(callback);
+}
+
+void server_queue::pop_deferred_task() {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    if (!queue_tasks_deferred.empty()) {
+        queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
+        queue_tasks_deferred.pop_front();
+    }
+    condition_tasks.notify_one();
+}
+
+void server_queue::terminate() {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    running = false;
+    condition_tasks.notify_all();
+}
+
+void server_queue::start_loop() {
+    running = true;
+
+    while (true) {
+        QUE_DBG("%s", "processing new tasks\n");
+
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            if (!running) {
+                QUE_DBG("%s", "terminate\n");
+                return;
+            }
+            if (queue_tasks.empty()) {
+                lock.unlock();
+                break;
+            }
+            server_task task = std::move(queue_tasks.front());
+            queue_tasks.pop_front();
+            lock.unlock();
+
+            QUE_DBG("processing task, id = %d\n", task.id);
+            callback_new_task(std::move(task));
+        }
+
+        // all tasks in the current loop is processed, slots data is now ready
+        QUE_DBG("%s", "update slots\n");
+
+        callback_update_slots();
+
+        QUE_DBG("%s", "waiting for new tasks\n");
+        {
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            if (!running) {
+                QUE_DBG("%s", "terminate\n");
+                return;
+            }
+            if (queue_tasks.empty()) {
+                condition_tasks.wait(lock, [&]{
+                    return (!queue_tasks.empty() || !running);
+                });
+            }
+        }
+    }
+}
+
+void server_queue::cleanup_pending_task(int id_target) {
+    // no need lock because this is called exclusively by post()
+    auto rm_func = [id_target](const server_task & task) {
+        return task.id == id_target;
+    };
+    queue_tasks.erase(
+        std::remove_if(queue_tasks.begin(),          queue_tasks.end(),          rm_func),
+        queue_tasks.end());
+    queue_tasks_deferred.erase(
+        std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
+        queue_tasks_deferred.end());
+}
+
+//
+// server_response
+//
+
+void server_response::add_waiting_task_id(int id_task) {
+    RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
+
+    std::unique_lock<std::mutex> lock(mutex_results);
+    waiting_task_ids.insert(id_task);
+}
+
+void server_response::add_waiting_tasks(const std::vector<server_task> & tasks) {
+    std::unique_lock<std::mutex> lock(mutex_results);
+
+    for (const auto & task : tasks) {
+        RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
+        waiting_task_ids.insert(task.id);
+    }
+}
+
+void server_response::remove_waiting_task_id(int id_task) {
+    RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
+
+    std::unique_lock<std::mutex> lock(mutex_results);
+    waiting_task_ids.erase(id_task);
+    // make sure to clean up all pending results
+    queue_results.erase(
+        std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
+            return res->id == id_task;
+        }),
+        queue_results.end());
+}
+
+void server_response::remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
+    std::unique_lock<std::mutex> lock(mutex_results);
+
+    for (const auto & id_task : id_tasks) {
+        RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
+        waiting_task_ids.erase(id_task);
+    }
+}
+
+server_task_result_ptr server_response::recv(const std::unordered_set<int> & id_tasks) {
+    while (true) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        condition_results.wait(lock, [&]{
+            if (!running) {
+                RES_DBG("%s : queue result stop\n", __func__);
+                std::terminate(); // we cannot return here since the caller is HTTP code
+            }
+            return !queue_results.empty();
+        });
+
+        for (size_t i = 0; i < queue_results.size(); i++) {
+            if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
+                server_task_result_ptr res = std::move(queue_results[i]);
+                queue_results.erase(queue_results.begin() + i);
+                return res;
+            }
+        }
+    }
+
+    // should never reach here
+}
+
+server_task_result_ptr server_response::recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
+    while (true) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+
+        for (int i = 0; i < (int) queue_results.size(); i++) {
+            if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
+                server_task_result_ptr res = std::move(queue_results[i]);
+                queue_results.erase(queue_results.begin() + i);
+                return res;
+            }
+        }
+
+        std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+        if (!running) {
+            RES_DBG("%s : queue result stop\n", __func__);
+            std::terminate(); // we cannot return here since the caller is HTTP code
+        }
+        if (cr_res == std::cv_status::timeout) {
+            return nullptr;
+        }
+    }
+
+    // should never reach here
+}
+
+server_task_result_ptr server_response::recv(int id_task) {
+    std::unordered_set<int> id_tasks = {id_task};
+    return recv(id_tasks);
+}
+
+void server_response::send(server_task_result_ptr && result) {
+    RES_DBG("sending result for task id = %d\n", result->id);
+
+    std::unique_lock<std::mutex> lock(mutex_results);
+    for (const auto & id_task : waiting_task_ids) {
+        if (result->id == id_task) {
+            RES_DBG("task id = %d pushed to result queue\n", result->id);
+
+            queue_results.emplace_back(std::move(result));
+            condition_results.notify_all();
+            return;
+        }
+    }
+}
+
+void server_response::terminate() {
+    running = false;
+    condition_results.notify_all();
+}
@@ -0,0 +1,110 @@
+#pragma once
+
+#include "server-task.h"
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <unordered_set>
+
+struct server_queue {
+private:
+    int id = 0;
+    bool running;
+
+    // queues
+    std::deque<server_task> queue_tasks;
+    std::deque<server_task> queue_tasks_deferred;
+
+    std::mutex mutex_tasks;
+    std::condition_variable condition_tasks;
+
+    // callback functions
+    std::function<void(server_task &&)> callback_new_task;
+    std::function<void(void)>           callback_update_slots;
+
+public:
+    // Add a new task to the end of the queue
+    int post(server_task && task, bool front = false);
+
+    // multi-task version of post()
+    int post(std::vector<server_task> && tasks, bool front = false);
+
+    // Add a new task, but defer until one slot is available
+    void defer(server_task && task);
+
+    // Get the next id for creating a new task
+    int get_new_id();
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(server_task &&)> callback);
+
+    // Register the function to be called when all slots data is ready to be processed
+    void on_update_slots(std::function<void(void)> callback);
+
+    // Call when the state of one slot is changed, it will move one task from deferred to main queue
+    void pop_deferred_task();
+
+    // end the start_loop routine
+    void terminate();
+
+    /**
+     * Main loop consists of these steps:
+     * - Wait until a new task arrives
+     * - Process the task (i.e. maybe copy data into slot)
+     * - Check if multitask is finished
+     * - Update all slots
+     */
+    void start_loop();
+
+    // for metrics
+    size_t queue_tasks_deferred_size() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return queue_tasks_deferred.size();
+    }
+
+private:
+    void cleanup_pending_task(int id_target);
+};
+
+struct server_response {
+private:
+    bool running = true;
+
+    // for keeping track of all tasks waiting for the result
+    std::unordered_set<int> waiting_task_ids;
+
+    // the main result queue (using ptr for polymorphism)
+    std::vector<server_task_result_ptr> queue_results;
+
+    std::mutex mutex_results;
+    std::condition_variable condition_results;
+
+public:
+    // add the id_task to the list of tasks waiting for response
+    void add_waiting_task_id(int id_task);
+
+    void add_waiting_tasks(const std::vector<server_task> & tasks);
+
+    // when the request is finished, we can remove task associated with it
+    void remove_waiting_task_id(int id_task);
+
+    // remove multiple tasks from waiting list
+    void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks);
+
+    // This function blocks the thread until there is a response for one of the id_tasks
+    server_task_result_ptr recv(const std::unordered_set<int> & id_tasks);
+
+    // same as recv(), but have timeout in seconds
+    // if timeout is reached, nullptr is returned
+    server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout);
+
+    // single-task version of recv()
+    server_task_result_ptr recv(int id_task);
+
+    // Send a new result to a waiting id_task
+    void send(server_task_result_ptr && result);
+
+    // terminate the waiting loop
+    void terminate();
+};
@@ -0,0 +1,453 @@
+#pragma once
+
+#include "common.h"
+#include "llama.h"
+
+#include <string>
+#include <unordered_set>
+#include <list>
+
+// TODO: prevent including the whole server-common.h as we only use server_tokens
+#include "server-common.h"
+
+using json = nlohmann::ordered_json;
+
+enum server_task_type {
+    SERVER_TASK_TYPE_COMPLETION,
+    SERVER_TASK_TYPE_EMBEDDING,
+    SERVER_TASK_TYPE_RERANK,
+    SERVER_TASK_TYPE_INFILL,
+    SERVER_TASK_TYPE_CANCEL,
+    SERVER_TASK_TYPE_NEXT_RESPONSE,
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
+};
+
+// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common
+enum oaicompat_type {
+    OAICOMPAT_TYPE_NONE,
+    OAICOMPAT_TYPE_CHAT,
+    OAICOMPAT_TYPE_COMPLETION,
+    OAICOMPAT_TYPE_EMBEDDING,
+};
+
+enum stop_type {
+    STOP_TYPE_NONE,
+    STOP_TYPE_EOS,
+    STOP_TYPE_WORD,
+    STOP_TYPE_LIMIT,
+};
+
+struct task_params {
+    bool stream          = true;
+    bool include_usage   = false;
+    bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
+    bool return_tokens   = false;
+    bool return_progress = false;
+
+    int32_t n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
+    int32_t n_predict = -1; // new tokens to predict
+    int32_t n_indent  =  0; // minimum line indentation for the generated text in number of whitespace characters
+
+    int64_t t_max_prompt_ms  = -1; // TODO: implement
+    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
+
+    std::vector<common_adapter_lora_info> lora;
+
+    std::vector<std::string> antiprompt;
+    std::vector<std::string> response_fields;
+    bool timings_per_token = false;
+    bool post_sampling_probs = false;
+
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
+
+    // OAI-compat fields
+    bool                         verbose                   = false;
+    oaicompat_type               oaicompat                 = OAICOMPAT_TYPE_NONE;
+    std::string                  oaicompat_model;
+    std::string                  oaicompat_cmpl_id;
+    common_chat_syntax           oaicompat_chat_syntax;
+
+    // Embeddings
+    int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
+
+    json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const;
+    json to_json(bool only_metrics = false) const;
+};
+
+struct server_task {
+    int id    = -1; // to be filled by server_queue
+    int index = -1; // used when there are multiple prompts (batch request)
+
+    // used by SERVER_TASK_TYPE_CANCEL
+    int id_target = -1;
+    int id_slot   = -1;
+
+    // used by SERVER_TASK_TYPE_INFERENCE
+    task_params   params;
+    server_tokens tokens;
+
+    server_task_type type;
+
+    // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
+    struct slot_action {
+        int slot_id;
+        std::string filename;
+        std::string filepath;
+    };
+    slot_action slot_action;
+
+    // used by SERVER_TASK_TYPE_METRICS
+    bool metrics_reset_bucket = false;
+
+    // used by SERVER_TASK_TYPE_SET_LORA
+    std::vector<common_adapter_lora_info> set_lora;
+
+    server_task() = default;
+
+    server_task(server_task_type type) : type(type) {}
+
+    int32_t n_tokens() const {
+        return tokens.size();
+    }
+
+    static task_params params_from_json_cmpl(
+            const llama_context * ctx,
+            const common_params & params_base,
+            const json & data);
+
+    // utility function
+    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
+        std::unordered_set<int> ids(tasks.size());
+        for (size_t i = 0; i < tasks.size(); i++) {
+            ids.insert(tasks[i].id);
+        }
+        return ids;
+    }
+};
+
+struct result_timings {
+    int32_t cache_n = -1;
+
+    int32_t prompt_n = -1;
+    double prompt_ms;
+    double prompt_per_token_ms;
+    double prompt_per_second;
+
+    int32_t predicted_n = -1;
+    double predicted_ms;
+    double predicted_per_token_ms;
+    double predicted_per_second;
+
+    // Optional speculative metrics - only included when > 0
+    int32_t draft_n = 0;
+    int32_t draft_n_accepted = 0;
+
+    json to_json() const;
+};
+
+struct result_prompt_progress {
+    int32_t total = 0;
+    int32_t cache = 0;
+    int32_t processed = 0;
+    int64_t time_ms = 0;
+
+    json to_json() const;
+};
+
+struct server_task_result {
+    int id           = -1;
+    int id_slot      = -1;
+    virtual bool is_error() {
+        // only used by server_task_result_error
+        return false;
+    }
+    virtual bool is_stop() {
+        // only used by server_task_result_cmpl_*
+        return true;
+    }
+    virtual int get_index() {
+        return -1;
+    }
+    virtual json to_json() = 0;
+    virtual ~server_task_result() = default;
+};
+
+// using shared_ptr for polymorphism of server_task_result
+using server_task_result_ptr = std::unique_ptr<server_task_result>;
+
+struct completion_token_output {
+    llama_token tok;
+    float prob;
+    std::string text_to_send;
+    struct prob_info {
+        llama_token tok;
+        std::string txt;
+        float prob;
+    };
+    std::vector<prob_info> probs;
+
+    json to_json(bool post_sampling_probs) const;
+
+    static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs);
+
+    static float logarithm(float x);
+
+    static std::vector<unsigned char> str_to_bytes(const std::string & str);
+
+};
+
+struct server_task_result_cmpl_final : server_task_result {
+    int index = 0;
+
+    std::string content;
+    llama_tokens tokens;
+
+    bool stream;
+    bool include_usage;
+    result_timings timings;
+    std::string prompt;
+
+    bool truncated;
+    int32_t n_decoded;
+    int32_t n_prompt_tokens;
+    int32_t n_tokens_cached;
+    bool has_new_line;
+    std::string stopping_word;
+    stop_type stop = STOP_TYPE_NONE;
+
+    bool post_sampling_probs;
+    std::vector<completion_token_output> probs_output;
+    std::vector<std::string>  response_fields;
+
+    task_params generation_params;
+
+    // OAI-compat fields
+    bool            verbose   = false;
+    oaicompat_type  oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string     oaicompat_model;
+    std::string     oaicompat_cmpl_id;
+    common_chat_msg oaicompat_msg;
+
+    std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual bool is_stop() override {
+        return true; // in stream mode, final responses are considered stop
+    }
+
+    virtual json to_json() override;
+
+    json to_json_non_oaicompat();
+
+    json to_json_oaicompat();
+
+    json to_json_oaicompat_chat();
+
+    json to_json_oaicompat_chat_stream();
+};
+
+struct server_task_result_cmpl_partial : server_task_result {
+    int index = 0;
+
+    std::string  content;
+    llama_tokens tokens;
+
+    int32_t n_decoded;
+    int32_t n_prompt_tokens;
+
+    bool post_sampling_probs;
+    bool is_progress = false;
+    completion_token_output prob_output;
+    result_timings timings;
+    result_prompt_progress progress;
+
+    // OAI-compat fields
+    bool            verbose   = false;
+    oaicompat_type  oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string     oaicompat_model;
+    std::string     oaicompat_cmpl_id;
+    std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual bool is_stop() override {
+        return false; // in stream mode, partial responses are not considered stop
+    }
+
+    virtual json to_json() override;
+
+    json to_json_non_oaicompat();
+
+    json to_json_oaicompat();
+
+    json to_json_oaicompat_chat();
+};
+
+struct server_task_result_embd : server_task_result {
+    int index = 0;
+    std::vector<std::vector<float>> embedding;
+
+    int32_t n_tokens;
+
+    // OAI-compat fields
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual json to_json() override;
+
+    json to_json_non_oaicompat();
+
+    json to_json_oaicompat();
+};
+
+struct server_task_result_rerank : server_task_result {
+    int index = 0;
+    float score = -1e6;
+
+    int32_t n_tokens;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual json to_json() override;
+};
+
+struct server_task_result_error : server_task_result {
+    int index = 0;
+    error_type err_type = ERROR_TYPE_SERVER;
+    std::string err_msg;
+
+    // for ERROR_TYPE_EXCEED_CONTEXT_SIZE
+    int32_t n_prompt_tokens = 0;
+    int32_t n_ctx           = 0;
+
+    virtual bool is_error() override {
+        return true;
+    }
+
+    virtual json to_json() override;
+};
+
+struct server_task_result_metrics : server_task_result {
+    int n_idle_slots;
+    int n_processing_slots;
+    int n_tasks_deferred;
+    int64_t t_start;
+
+    // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
+    uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total       = 0;
+    uint64_t n_tokens_predicted_total        = 0;
+    uint64_t t_tokens_generation_total       = 0;
+
+    uint64_t n_tokens_max = 0;
+
+    uint64_t n_prompt_tokens_processed = 0;
+    uint64_t t_prompt_processing       = 0;
+
+    uint64_t n_tokens_predicted  = 0;
+    uint64_t t_tokens_generation = 0;
+
+    uint64_t n_decode_total     = 0;
+    uint64_t n_busy_slots_total = 0;
+
+    // while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
+    // therefore, we use json to temporarily store the slot.to_json() result
+    json slots_data = json::array();
+
+    virtual json to_json() override;
+};
+
+struct server_task_result_slot_save_load : server_task_result {
+    std::string filename;
+    bool is_save; // true = save, false = load
+
+    size_t n_tokens;
+    size_t n_bytes;
+    double t_ms;
+
+    virtual json to_json() override;
+};
+
+struct server_task_result_slot_erase : server_task_result {
+    size_t n_erased;
+
+    virtual json to_json() override;
+};
+
+struct server_task_result_apply_lora : server_task_result {
+    virtual json to_json() override;
+};
+
+struct server_prompt_checkpoint {
+    llama_pos pos_min;
+    llama_pos pos_max;
+
+    std::vector<uint8_t> data;
+
+    size_t size() const {
+        return data.size();
+    }
+};
+
+struct server_prompt {
+    server_tokens tokens;
+
+    std::vector<uint8_t> data;
+
+    std::list<server_prompt_checkpoint> checkpoints;
+
+    size_t size() const {
+        size_t res = data.size();
+
+        for (const auto & checkpoint : checkpoints) {
+            res += checkpoint.size();
+        }
+
+        return res;
+    }
+
+    int n_tokens() const {
+        return tokens.size();
+    }
+};
+
+struct server_prompt_cache {
+    server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) {
+        this->limit_size   = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib);
+        this->limit_tokens = limit_tokens;
+    }
+
+    std::list<server_prompt> states;
+
+    // in bytes, 0 = no limit
+    size_t limit_size = 0;
+
+    // in tokens, 0 = no limit
+    size_t limit_tokens = 0;
+
+    size_t size() const;
+
+    size_t n_tokens() const;
+
+    server_prompt * alloc(const server_prompt & prompt, size_t state_size);
+
+    bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot);
+
+    void update();
+};
@@ -25,4 +25,3 @@ vite.config.ts.timestamp-*

 *storybook.log
 storybook-static
-*.code-workspace
@@ -2109,9 +2109,9 @@
 			}
 		},
 		"node_modules/@sveltejs/kit": {
-			"version": "2.48.5",
-			"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.48.5.tgz",
-			"integrity": "sha512-/rnwfSWS3qwUSzvHynUTORF9xSJi7PCR9yXkxUOnRrNqyKmCmh3FPHH+E9BbgqxXfTevGXBqgnlh9kMb+9T5XA==",
+			"version": "2.48.4",
+			"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.48.4.tgz",
+			"integrity": "sha512-TGFX1pZUt9qqY20Cv5NyYvy0iLWHf2jXi8s+eCGsig7jQMdwZWKUFMR6TbvFNhfDSUpc1sH/Y5EHv20g3HHA3g==",
 			"dev": true,
 			"license": "MIT",
 			"dependencies": {
@@ -5087,9 +5087,9 @@
 			"license": "MIT"
 		},
 		"node_modules/js-yaml": {
-			"version": "4.1.1",
-			"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
-			"integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
+			"version": "4.1.0",
+			"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz",
+			"integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==",
 			"dev": true,
 			"license": "MIT",
 			"dependencies": {
@@ -10,7 +10,6 @@
 		class?: string;
 		message: DatabaseMessage;
 		onCopy?: (message: DatabaseMessage) => void;
-		onContinueAssistantMessage?: (message: DatabaseMessage) => void;
 		onDelete?: (message: DatabaseMessage) => void;
 		onEditWithBranching?: (message: DatabaseMessage, newContent: string) => void;
 		onEditWithReplacement?: (
@@ -18,7 +17,6 @@
 			newContent: string,
 			shouldBranch: boolean
 		) => void;
-		onEditUserMessagePreserveResponses?: (message: DatabaseMessage, newContent: string) => void;
 		onNavigateToSibling?: (siblingId: string) => void;
 		onRegenerateWithBranching?: (message: DatabaseMessage) => void;
 		siblingInfo?: ChatMessageSiblingInfo | null;
@@ -28,11 +26,9 @@
 		class: className = '',
 		message,
 		onCopy,
-		onContinueAssistantMessage,
 		onDelete,
 		onEditWithBranching,
 		onEditWithReplacement,
-		onEditUserMessagePreserveResponses,
 		onNavigateToSibling,
 		onRegenerateWithBranching,
 		siblingInfo = null
@@ -137,33 +133,17 @@
 		onRegenerateWithBranching?.(message);
 	}

-	function handleContinue() {
-		onContinueAssistantMessage?.(message);
-	}
-
 	function handleSaveEdit() {
 		if (message.role === 'user') {
-			// For user messages, trim to avoid accidental whitespace
 			onEditWithBranching?.(message, editedContent.trim());
 		} else {
-			// For assistant messages, preserve exact content including trailing whitespace
-			// This is important for the Continue feature to work properly
-			onEditWithReplacement?.(message, editedContent, shouldBranchAfterEdit);
+			onEditWithReplacement?.(message, editedContent.trim(), shouldBranchAfterEdit);
 		}

 		isEditing = false;
 		shouldBranchAfterEdit = false;
 	}

-	function handleSaveEditOnly() {
-		if (message.role === 'user') {
-			// For user messages, trim to avoid accidental whitespace
-			onEditUserMessagePreserveResponses?.(message, editedContent.trim());
-		}
-
-		isEditing = false;
-	}
-
 	function handleShowDeleteDialogChange(show: boolean) {
 		showDeleteDialog = show;
 	}
@@ -186,7 +166,6 @@
 		onEditedContentChange={handleEditedContentChange}
 		{onNavigateToSibling}
 		onSaveEdit={handleSaveEdit}
-		onSaveEditOnly={handleSaveEditOnly}
 		onShowDeleteDialogChange={handleShowDeleteDialogChange}
 		{showDeleteDialog}
 		{siblingInfo}
@@ -202,7 +181,6 @@
 		messageContent={message.content}
 		onCancelEdit={handleCancelEdit}
 		onConfirmDelete={handleConfirmDelete}
-		onContinue={handleContinue}
 		onCopy={handleCopy}
 		onDelete={handleDelete}
 		onEdit={handleEdit}
@@ -1,5 +1,5 @@
 <script lang="ts">
-	import { Edit, Copy, RefreshCw, Trash2, ArrowRight } from '@lucide/svelte';
+	import { Edit, Copy, RefreshCw, Trash2 } from '@lucide/svelte';
 	import { ActionButton, ConfirmationDialog } from '$lib/components/app';
 	import ChatMessageBranchingControls from './ChatMessageBranchingControls.svelte';

@@ -18,7 +18,6 @@
 		onCopy: () => void;
 		onEdit?: () => void;
 		onRegenerate?: () => void;
-		onContinue?: () => void;
 		onDelete: () => void;
 		onConfirmDelete: () => void;
 		onNavigateToSibling?: (siblingId: string) => void;
@@ -32,7 +31,6 @@
 		onCopy,
 		onEdit,
 		onConfirmDelete,
-		onContinue,
 		onDelete,
 		onNavigateToSibling,
 		onShowDeleteDialogChange,
@@ -71,10 +69,6 @@
 				<ActionButton icon={RefreshCw} tooltip="Regenerate" onclick={onRegenerate} />
 			{/if}

-			{#if role === 'assistant' && onContinue}
-				<ActionButton icon={ArrowRight} tooltip="Continue" onclick={onContinue} />
-			{/if}
-
 			<ActionButton icon={Trash2} tooltip="Delete" onclick={onDelete} />
 		</div>
 	</div>
@@ -2,7 +2,6 @@
 	import { ChatMessageThinkingBlock, MarkdownContent } from '$lib/components/app';
 	import { useProcessingState } from '$lib/hooks/use-processing-state.svelte';
 	import { isLoading } from '$lib/stores/chat.svelte';
-	import autoResizeTextarea from '$lib/utils/autoresize-textarea';
 	import { fade } from 'svelte/transition';
 	import {
 		Check,
@@ -40,7 +39,6 @@
 		onCancelEdit?: () => void;
 		onCopy: () => void;
 		onConfirmDelete: () => void;
-		onContinue?: () => void;
 		onDelete: () => void;
 		onEdit?: () => void;
 		onEditKeydown?: (event: KeyboardEvent) => void;
@@ -67,7 +65,6 @@
 		messageContent,
 		onCancelEdit,
 		onConfirmDelete,
-		onContinue,
 		onCopy,
 		onDelete,
 		onEdit,
@@ -110,12 +107,6 @@
 		void copyToClipboard(model ?? '');
 	}

-	$effect(() => {
-		if (isEditing && textareaElement) {
-			autoResizeTextarea(textareaElement);
-		}
-	});
-
 	function formatToolCallBadge(toolCall: ApiChatCompletionToolCall, index: number) {
 		const callNumber = index + 1;
 		const functionName = toolCall.function?.name?.trim();
@@ -199,10 +190,7 @@
 				bind:value={editedContent}
 				class="min-h-[50vh] w-full resize-y rounded-2xl px-3 py-2 text-sm {INPUT_CLASSES}"
 				onkeydown={onEditKeydown}
-				oninput={(e) => {
-					autoResizeTextarea(e.currentTarget);
-					onEditedContentChange?.(e.currentTarget.value);
-				}}
+				oninput={(e) => onEditedContentChange?.(e.currentTarget.value)}
 				placeholder="Edit assistant message..."
 			></textarea>

@@ -347,9 +335,6 @@
 			{onCopy}
 			{onEdit}
 			{onRegenerate}
-			onContinue={currentConfig.enableContinueGeneration && !thinkingContent
-				? onContinue
-				: undefined}
 			{onDelete}
 			{onConfirmDelete}
 			{onNavigateToSibling}
@@ -1,11 +1,10 @@
 <script lang="ts">
-	import { Check, X, Send } from '@lucide/svelte';
+	import { Check, X } from '@lucide/svelte';
 	import { Card } from '$lib/components/ui/card';
 	import { Button } from '$lib/components/ui/button';
 	import { ChatAttachmentsList, MarkdownContent } from '$lib/components/app';
 	import { INPUT_CLASSES } from '$lib/constants/input-classes';
 	import { config } from '$lib/stores/settings.svelte';
-	import autoResizeTextarea from '$lib/utils/autoresize-textarea';
 	import ChatMessageActions from './ChatMessageActions.svelte';

 	interface Props {
@@ -23,7 +22,6 @@
 		} | null;
 		onCancelEdit: () => void;
 		onSaveEdit: () => void;
-		onSaveEditOnly?: () => void;
 		onEditKeydown: (event: KeyboardEvent) => void;
 		onEditedContentChange: (content: string) => void;
 		onCopy: () => void;
@@ -45,7 +43,6 @@
 		deletionInfo,
 		onCancelEdit,
 		onSaveEdit,
-		onSaveEditOnly,
 		onEditKeydown,
 		onEditedContentChange,
 		onCopy,
@@ -61,12 +58,6 @@
 	let messageElement: HTMLElement | undefined = $state();
 	const currentConfig = config();

-	$effect(() => {
-		if (isEditing && textareaElement) {
-			autoResizeTextarea(textareaElement);
-		}
-	});
-
 	$effect(() => {
 		if (!messageElement || !message.content.trim()) return;

@@ -104,34 +95,20 @@
 				bind:value={editedContent}
 				class="min-h-[60px] w-full resize-none rounded-2xl px-3 py-2 text-sm {INPUT_CLASSES}"
 				onkeydown={onEditKeydown}
-				oninput={(e) => {
-					autoResizeTextarea(e.currentTarget);
-					onEditedContentChange(e.currentTarget.value);
-				}}
+				oninput={(e) => onEditedContentChange(e.currentTarget.value)}
 				placeholder="Edit your message..."
 			></textarea>

 			<div class="mt-2 flex justify-end gap-2">
-				<Button class="h-8 px-3" onclick={onCancelEdit} size="sm" variant="ghost">
+				<Button class="h-8 px-3" onclick={onCancelEdit} size="sm" variant="outline">
 					<X class="mr-1 h-3 w-3" />
+
 					Cancel
 				</Button>

-				{#if onSaveEditOnly}
-					<Button
-						class="h-8 px-3"
-						onclick={onSaveEditOnly}
-						disabled={!editedContent.trim()}
-						size="sm"
-						variant="outline"
-					>
-						<Check class="mr-1 h-3 w-3" />
-						Save
-					</Button>
-				{/if}
-
 				<Button class="h-8 px-3" onclick={onSaveEdit} disabled={!editedContent.trim()} size="sm">
-					<Send class="mr-1 h-3 w-3" />
+					<Check class="mr-1 h-3 w-3" />
+
 					Send
 				</Button>
 			</div>
@@ -3,12 +3,10 @@
 	import { DatabaseStore } from '$lib/stores/database';
 	import {
 		activeConversation,
-		continueAssistantMessage,
 		deleteMessage,
-		editAssistantMessage,
-		editMessageWithBranching,
-		editUserMessagePreserveResponses,
 		navigateToSibling,
+		editMessageWithBranching,
+		editAssistantMessage,
 		regenerateMessageWithBranching
 	} from '$lib/stores/chat.svelte';
 	import { getMessageSiblings } from '$lib/utils/branching';
@@ -95,26 +93,6 @@

 		refreshAllMessages();
 	}
-
-	async function handleContinueAssistantMessage(message: DatabaseMessage) {
-		onUserAction?.();
-
-		await continueAssistantMessage(message.id);
-
-		refreshAllMessages();
-	}
-
-	async function handleEditUserMessagePreserveResponses(
-		message: DatabaseMessage,
-		newContent: string
-	) {
-		onUserAction?.();
-
-		await editUserMessagePreserveResponses(message.id, newContent);
-
-		refreshAllMessages();
-	}
-
 	async function handleDeleteMessage(message: DatabaseMessage) {
 		await deleteMessage(message.id);

@@ -132,9 +110,7 @@
 			onNavigateToSibling={handleNavigateToSibling}
 			onEditWithBranching={handleEditWithBranching}
 			onEditWithReplacement={handleEditWithReplacement}
-			onEditUserMessagePreserveResponses={handleEditUserMessagePreserveResponses}
 			onRegenerateWithBranching={handleRegenerateWithBranching}
-			onContinueAssistantMessage={handleContinueAssistantMessage}
 		/>
 	{/each}
 </div>
@@ -52,11 +52,6 @@
 						{ value: 'dark', label: 'Dark', icon: Moon }
 					]
 				},
-				{
-					key: 'pasteLongTextToFileLen',
-					label: 'Paste long text to file length',
-					type: 'input'
-				},
 				{
 					key: 'showMessageStats',
 					label: 'Show message generation statistics',
@@ -73,15 +68,14 @@
 					type: 'checkbox'
 				},
 				{
-					key: 'showModelInfo',
-					label: 'Show model information',
+					key: 'askForTitleConfirmation',
+					label: 'Ask for confirmation before changing conversation title',
 					type: 'checkbox'
 				},
 				{
-					key: 'enableContinueGeneration',
-					label: 'Enable "Continue" button',
-					type: 'checkbox',
-					isExperimental: true
+					key: 'pasteLongTextToFileLen',
+					label: 'Paste long text to file length',
+					type: 'input'
 				},
 				{
 					key: 'pdfAsImage',
@@ -89,13 +83,13 @@
 					type: 'checkbox'
 				},
 				{
-					key: 'renderUserContentAsMarkdown',
-					label: 'Render user content as Markdown',
+					key: 'showModelInfo',
+					label: 'Show model information',
 					type: 'checkbox'
 				},
 				{
-					key: 'askForTitleConfirmation',
-					label: 'Ask for confirmation before changing conversation title',
+					key: 'renderUserContentAsMarkdown',
+					label: 'Render user content as Markdown',
 					type: 'checkbox'
 				}
 			]
@@ -1,5 +1,5 @@
 <script lang="ts">
-	import { RotateCcw, FlaskConical } from '@lucide/svelte';
+	import { RotateCcw } from '@lucide/svelte';
 	import { Checkbox } from '$lib/components/ui/checkbox';
 	import { Input } from '$lib/components/ui/input';
 	import Label from '$lib/components/ui/label/label.svelte';
@@ -55,12 +55,8 @@
 			})()}

 			<div class="flex items-center gap-2">
-				<Label for={field.key} class="flex items-center gap-1.5 text-sm font-medium">
+				<Label for={field.key} class="text-sm font-medium">
 					{field.label}
-
-					{#if field.isExperimental}
-						<FlaskConical class="h-3.5 w-3.5 text-muted-foreground" />
-					{/if}
 				</Label>
 				{#if isCustomRealTime}
 					<ParameterSourceIndicator />
@@ -101,12 +97,8 @@
 				</p>
 			{/if}
 		{:else if field.type === 'textarea'}
-			<Label for={field.key} class="block flex items-center gap-1.5 text-sm font-medium">
+			<Label for={field.key} class="block text-sm font-medium">
 				{field.label}
-
-				{#if field.isExperimental}
-					<FlaskConical class="h-3.5 w-3.5 text-muted-foreground" />
-				{/if}
 			</Label>

 			<Textarea
@@ -137,12 +129,8 @@
 			})()}

 			<div class="flex items-center gap-2">
-				<Label for={field.key} class="flex items-center gap-1.5 text-sm font-medium">
+				<Label for={field.key} class="text-sm font-medium">
 					{field.label}
-
-					{#if field.isExperimental}
-						<FlaskConical class="h-3.5 w-3.5 text-muted-foreground" />
-					{/if}
 				</Label>
 				{#if isCustomRealTime}
 					<ParameterSourceIndicator />
@@ -226,13 +214,9 @@
 						for={field.key}
 						class="cursor-pointer text-sm leading-none font-medium {isDisabled
 							? 'text-muted-foreground'
-							: ''} flex items-center gap-1.5"
+							: ''}"
 					>
 						{field.label}
-
-						{#if field.isExperimental}
-							<FlaskConical class="h-3.5 w-3.5 text-muted-foreground" />
-						{/if}
 					</label>

 					{#if field.help || SETTING_CONFIG_INFO[field.key]}
@@ -38,8 +38,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
 	max_tokens: -1,
 	custom: '', // custom json-stringified object
 	// experimental features
-	pyInterpreterEnabled: false,
-	enableContinueGeneration: false
+	pyInterpreterEnabled: false
 };

 export const SETTING_CONFIG_INFO: Record<string, string> = {
@@ -97,7 +96,5 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 	modelSelectorEnabled:
 		'Enable the model selector in the chat input to choose the inference model. Sends the associated model field in API requests.',
 	pyInterpreterEnabled:
-		'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.',
-	enableContinueGeneration:
-		'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.'
+		'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.'
 };
@@ -312,6 +312,7 @@ export class ChatService {
 		let aggregatedContent = '';
 		let fullReasoningContent = '';
 		let aggregatedToolCalls: ApiChatCompletionToolCall[] = [];
+		let hasReceivedData = false;
 		let lastTimings: ChatMessageTimings | undefined;
 		let streamFinished = false;
 		let modelEmitted = false;
@@ -351,6 +352,8 @@ export class ChatService {
 				return;
 			}

+			hasReceivedData = true;
+
 			if (!abortSignal?.aborted) {
 				onToolCallChunk?.(serializedToolCalls);
 			}
@@ -412,6 +415,7 @@ export class ChatService {

 							if (content) {
 								finalizeOpenToolCallBatch();
+								hasReceivedData = true;
 								aggregatedContent += content;
 								if (!abortSignal?.aborted) {
 									onChunk?.(content);
@@ -420,6 +424,7 @@ export class ChatService {

 							if (reasoningContent) {
 								finalizeOpenToolCallBatch();
+								hasReceivedData = true;
 								fullReasoningContent += reasoningContent;
 								if (!abortSignal?.aborted) {
 									onReasoningChunk?.(reasoningContent);
@@ -441,6 +446,15 @@ export class ChatService {
 			if (streamFinished) {
 				finalizeOpenToolCallBatch();

+				if (
+					!hasReceivedData &&
+					aggregatedContent.length === 0 &&
+					aggregatedToolCalls.length === 0
+				) {
+					const noResponseError = new Error('No response received from server. Please try again.');
+					throw noResponseError;
+				}
+
 				const finalToolCalls =
 					aggregatedToolCalls.length > 0 ? JSON.stringify(aggregatedToolCalls) : undefined;

@@ -1486,10 +1486,6 @@ class ChatStore {
 					timestamp: Date.now()
 				});

-				// Ensure currNode points to the edited message to maintain correct path
-				await DatabaseStore.updateCurrentNode(this.activeConversation.id, messageToEdit.id);
-				this.activeConversation.currNode = messageToEdit.id;
-
 				this.updateMessageAtIndex(messageIndex, {
 					content: newContent,
 					timestamp: Date.now()
@@ -1503,69 +1499,6 @@ class ChatStore {
 		}
 	}

-	/**
-	 * Edits a user message and preserves all responses below
-	 * Updates the message content in-place without deleting or regenerating responses
-	 *
-	 * **Use Case**: When you want to fix a typo or rephrase a question without losing the assistant's response
-	 *
-	 * **Important Behavior:**
-	 * - Does NOT create a branch (unlike editMessageWithBranching)
-	 * - Does NOT regenerate assistant responses
-	 * - Only updates the user message content in the database
-	 * - Preserves the entire conversation tree below the edited message
-	 * - Updates conversation title if this is the first user message
-	 *
-	 * @param messageId - The ID of the user message to edit
-	 * @param newContent - The new content for the message
-	 */
-	async editUserMessagePreserveResponses(messageId: string, newContent: string): Promise<void> {
-		if (!this.activeConversation) return;
-
-		try {
-			const messageIndex = this.findMessageIndex(messageId);
-			if (messageIndex === -1) {
-				console.error('Message not found for editing');
-				return;
-			}
-
-			const messageToEdit = this.activeMessages[messageIndex];
-			if (messageToEdit.role !== 'user') {
-				console.error('Only user messages can be edited with this method');
-				return;
-			}
-
-			// Simply update the message content in-place
-			await DatabaseStore.updateMessage(messageId, {
-				content: newContent,
-				timestamp: Date.now()
-			});
-
-			this.updateMessageAtIndex(messageIndex, {
-				content: newContent,
-				timestamp: Date.now()
-			});
-
-			// Check if first user message for title update
-			const allMessages = await DatabaseStore.getConversationMessages(this.activeConversation.id);
-			const rootMessage = allMessages.find((m) => m.type === 'root' && m.parent === null);
-			const isFirstUserMessage =
-				rootMessage && messageToEdit.parent === rootMessage.id && messageToEdit.role === 'user';
-
-			if (isFirstUserMessage && newContent.trim()) {
-				await this.updateConversationTitleWithConfirmation(
-					this.activeConversation.id,
-					newContent.trim(),
-					this.titleUpdateConfirmationCallback
-				);
-			}
-
-			this.updateConversationTimestamp();
-		} catch (error) {
-			console.error('Failed to edit user message:', error);
-		}
-	}
-
 	/**
 	 * Edits a message by creating a new branch with the edited content
 	 * @param messageId - The ID of the message to edit
@@ -1763,200 +1696,6 @@ class ChatStore {
 		}
 	}

-	/**
-	 * Continues generation for an existing assistant message
-	 * @param messageId - The ID of the assistant message to continue
-	 */
-	async continueAssistantMessage(messageId: string): Promise<void> {
-		if (!this.activeConversation || this.isLoading) return;
-
-		try {
-			const messageIndex = this.findMessageIndex(messageId);
-			if (messageIndex === -1) {
-				console.error('Message not found for continuation');
-				return;
-			}
-
-			const messageToContinue = this.activeMessages[messageIndex];
-			if (messageToContinue.role !== 'assistant') {
-				console.error('Only assistant messages can be continued');
-				return;
-			}
-
-			// Race condition protection: Check if this specific conversation is already loading
-			// This prevents multiple rapid clicks on "Continue" from creating concurrent operations
-			if (this.isConversationLoading(this.activeConversation.id)) {
-				console.warn('Continuation already in progress for this conversation');
-				return;
-			}
-
-			this.errorDialogState = null;
-			this.setConversationLoading(this.activeConversation.id, true);
-			this.clearConversationStreaming(this.activeConversation.id);
-
-			// IMPORTANT: Fetch the latest content from the database to ensure we have
-			// the most up-to-date content, especially after a stopped generation
-			// This prevents issues where the in-memory state might be stale
-			const allMessages = await DatabaseStore.getConversationMessages(this.activeConversation.id);
-			const dbMessage = allMessages.find((m) => m.id === messageId);
-
-			if (!dbMessage) {
-				console.error('Message not found in database for continuation');
-				this.setConversationLoading(this.activeConversation.id, false);
-
-				return;
-			}
-
-			// Use content from database as the source of truth
-			const originalContent = dbMessage.content;
-			const originalThinking = dbMessage.thinking || '';
-
-			// Get conversation context up to (but not including) the message to continue
-			const conversationContext = this.activeMessages.slice(0, messageIndex);
-
-			const contextWithContinue = [
-				...conversationContext.map((msg) => {
-					if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
-						return msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] };
-					}
-					return msg as ApiChatMessageData;
-				}),
-				{
-					role: 'assistant' as const,
-					content: originalContent
-				}
-			];
-
-			let appendedContent = '';
-			let appendedThinking = '';
-			let hasReceivedContent = false;
-
-			await chatService.sendMessage(
-				contextWithContinue,
-				{
-					...this.getApiOptions(),
-
-					onChunk: (chunk: string) => {
-						hasReceivedContent = true;
-						appendedContent += chunk;
-						// Preserve originalContent exactly as-is, including any trailing whitespace
-						// The concatenation naturally preserves any whitespace at the end of originalContent
-						const fullContent = originalContent + appendedContent;
-
-						this.setConversationStreaming(
-							messageToContinue.convId,
-							fullContent,
-							messageToContinue.id
-						);
-
-						this.updateMessageAtIndex(messageIndex, {
-							content: fullContent
-						});
-					},
-
-					onReasoningChunk: (reasoningChunk: string) => {
-						hasReceivedContent = true;
-						appendedThinking += reasoningChunk;
-
-						const fullThinking = originalThinking + appendedThinking;
-
-						this.updateMessageAtIndex(messageIndex, {
-							thinking: fullThinking
-						});
-					},
-
-					onComplete: async (
-						finalContent?: string,
-						reasoningContent?: string,
-						timings?: ChatMessageTimings
-					) => {
-						const fullContent = originalContent + (finalContent || appendedContent);
-						const fullThinking = originalThinking + (reasoningContent || appendedThinking);
-
-						const updateData: {
-							content: string;
-							thinking: string;
-							timestamp: number;
-							timings?: ChatMessageTimings;
-						} = {
-							content: fullContent,
-							thinking: fullThinking,
-							timestamp: Date.now(),
-							timings: timings
-						};
-
-						await DatabaseStore.updateMessage(messageToContinue.id, updateData);
-
-						this.updateMessageAtIndex(messageIndex, updateData);
-
-						this.updateConversationTimestamp();
-
-						this.setConversationLoading(messageToContinue.convId, false);
-						this.clearConversationStreaming(messageToContinue.convId);
-						slotsService.clearConversationState(messageToContinue.convId);
-					},
-
-					onError: async (error: Error) => {
-						if (this.isAbortError(error)) {
-							// User cancelled - save partial continuation if any content was received
-							if (hasReceivedContent && appendedContent) {
-								const partialContent = originalContent + appendedContent;
-								const partialThinking = originalThinking + appendedThinking;
-
-								await DatabaseStore.updateMessage(messageToContinue.id, {
-									content: partialContent,
-									thinking: partialThinking,
-									timestamp: Date.now()
-								});
-
-								this.updateMessageAtIndex(messageIndex, {
-									content: partialContent,
-									thinking: partialThinking,
-									timestamp: Date.now()
-								});
-							}
-
-							this.setConversationLoading(messageToContinue.convId, false);
-							this.clearConversationStreaming(messageToContinue.convId);
-							slotsService.clearConversationState(messageToContinue.convId);
-
-							return;
-						}
-
-						// Non-abort error - rollback to original content
-						console.error('Continue generation error:', error);
-
-						// Rollback: Restore original content in UI
-						this.updateMessageAtIndex(messageIndex, {
-							content: originalContent,
-							thinking: originalThinking
-						});
-
-						// Ensure database has original content (in case of partial writes)
-						await DatabaseStore.updateMessage(messageToContinue.id, {
-							content: originalContent,
-							thinking: originalThinking
-						});
-
-						this.setConversationLoading(messageToContinue.convId, false);
-						this.clearConversationStreaming(messageToContinue.convId);
-						slotsService.clearConversationState(messageToContinue.convId);
-
-						const dialogType = error.name === 'TimeoutError' ? 'timeout' : 'server';
-						this.showErrorDialog(dialogType, error.message);
-					}
-				},
-				messageToContinue.convId
-			);
-		} catch (error) {
-			if (this.isAbortError(error)) return;
-			console.error('Failed to continue message:', error);
-			if (this.activeConversation) {
-				this.setConversationLoading(this.activeConversation.id, false);
-			}
-		}
-	}
-
 	/**
 	 * Public methods for accessing per-conversation states
 	 */
@@ -2004,11 +1743,8 @@ export const refreshActiveMessages = chatStore.refreshActiveMessages.bind(chatSt
 export const navigateToSibling = chatStore.navigateToSibling.bind(chatStore);
 export const editAssistantMessage = chatStore.editAssistantMessage.bind(chatStore);
 export const editMessageWithBranching = chatStore.editMessageWithBranching.bind(chatStore);
-export const editUserMessagePreserveResponses =
-	chatStore.editUserMessagePreserveResponses.bind(chatStore);
 export const regenerateMessageWithBranching =
 	chatStore.regenerateMessageWithBranching.bind(chatStore);
-export const continueAssistantMessage = chatStore.continueAssistantMessage.bind(chatStore);
 export const deleteMessage = chatStore.deleteMessage.bind(chatStore);
 export const getDeletionInfo = chatStore.getDeletionInfo.bind(chatStore);
 export const updateConversationName = chatStore.updateConversationName.bind(chatStore);
@@ -7,7 +7,6 @@ export interface SettingsFieldConfig {
 	key: string;
 	label: string;
 	type: 'input' | 'textarea' | 'checkbox' | 'select';
-	isExperimental?: boolean;
 	help?: string;
 	options?: Array<{ value: string; label: string; icon?: typeof import('@lucide/svelte').Icon }>;
 }
Author	SHA1	Message	Date
Georgi Gerganov	72f80499ee	server : headers cleanup	2025-11-24 12:50:50 +02:00
Xuan Son Nguyen	625010d42d	move enum stop_type to server-task	2025-11-18 17:28:21 +01:00
Xuan Son Nguyen	ca993bad51	rm redundant includes	2025-11-18 15:01:17 +01:00
Xuan Son Nguyen	3b7946034c	add server-queue	2025-11-18 14:24:46 +01:00
Xuan Son Nguyen	e1a756e934	add server-task, server-common	2025-11-18 14:15:14 +01:00