vulkan: fix l2_norm epsilon handling (#20350)

vulkan: fix OOB check in flash_attn_mask_opt (#20296)
vulkan: Fix ErrorOutOfHostMemory on Intel GPU when loading large models with --no-mmap (#20059)
2026-07-01 01:57:43 +02:00 · 2026-03-12 06:39:41 +01:00 · 2026-03-12 06:35:49 +01:00 · 2026-03-12 06:30:16 +01:00 · 2026-03-11 22:03:27 -07:00 · 2026-03-11 22:03:07 -07:00
25 changed files with 1433 additions and 168 deletions
@@ -1354,6 +1354,77 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    return data;
 }

+// Chat params for the GigaChatV3 template.
+//
+// The prompt is rendered directly from the template. When tools are supplied and
+// tool_choice is not NONE, the PEG parser accepts content followed by at most one
+// tool call (at least one when tool_choice is REQUIRED) and a grammar triggered by
+// the tool-call prefix is attached. Otherwise the whole output is parsed as content.
+static common_chat_params common_chat_params_init_gigachat_v3(
+        const common_chat_template & tmpl,
+        const autoparser::templates_params & inputs) {
+
+    // Text that introduces a tool call in the model output
+    const char * const call_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
+
+    const bool has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    const bool use_tools = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
+
+    common_chat_params data;
+
+    data.prompt            = common_chat_template_direct_apply(tmpl, inputs);
+    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking = false;
+    data.preserved_tokens  = {
+        "<|message_sep|>\n\n",
+        "<|role_sep|>\n",
+    };
+
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        if (!use_tools) {
+            // Content only parser
+            return p.content(p.rest());
+        }
+
+        // One alternative per declared tool: an object that opens with the
+        // "name" member and continues with the schema-constrained "arguments" member
+        auto any_tool = p.choice();
+        for (const auto & tool : inputs.tools) {
+            const auto & fn          = tool.at("function");
+            const std::string fn_name = fn.at("name");
+            const auto & fn_params   = fn.at("parameters");
+
+            auto name_member = p.json_member("name", "\"" + p.tool_name(p.literal(fn_name)) + "\"");
+            auto args_member = p.json_member("arguments", p.tool_args(p.schema(p.json(), "tool-" + fn_name + "-schema", fn_params)));
+
+            auto open = p.tool_open(p.literal("{") << name_member);
+
+            any_tool |= p.rule("tool-" + fn_name, open << "," << args_member << "}");
+        }
+
+        // parallel toolcalls are not supported, so at most one call is accepted
+        const int min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+        const int max_calls = 1;
+
+        auto one_call  = p.rule("tool-call", p.literal(call_prefix) + any_tool);
+        auto all_calls = p.trigger_rule("tool-call-root", p.repeat(one_call, /* min = */ min_calls, /* max = */ max_calls));
+
+        return p.content(p.until("<|message_sep|>\n\n")) << all_calls;
+    });
+
+    data.parser = parser.save();
+
+    if (use_tools) {
+        // has_tools is known to be true here, so laziness depends on tool_choice alone
+        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                auto schema = tool.at("function").at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, call_prefix}
+        };
+    }
+
+    return data;
+}
+
 namespace workaround {

 static void map_developer_role_to_system(json & messages) {
@@ -1525,6 +1596,15 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        return common_chat_params_init_lfm2(tmpl, params);
    }

+    // GigaChatV3 format detection
+    if (src.find("<|role_sep|>") != std::string::npos &&
+        src.find("<|message_sep|>") != std::string::npos &&
+        src.find("<|function_call|>") == std::string::npos
+    ) {
+        LOG_DBG("Using specialized template: GigaChatV3\n");
+        return common_chat_params_init_gigachat_v3(tmpl, params);
+    }
+
    try {
        LOG_DBG("Using differential autoparser\n");
        struct autoparser::autoparser autoparser;
@@ -926,7 +926,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //

-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate|gate_up)_(ch|)exps";

 inline std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
@@ -5062,7 +5062,7 @@ class Phi2Model(TextModel):
        self.gguf_writer.add_add_bos_token(False)


-@ModelBase.register("Phi3ForCausalLM")
+@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV")
 class Phi3MiniModel(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI3

@@ -5237,6 +5237,129 @@ class Phi3MiniModel(TextModel):
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith(("model.vision_tower.", "vision_tower.", "model.mm_projector.", "mm_projector.")):
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Phi4ForCausalLMV")
+class Phi4VisionMmprojModel(MmprojModel):
+    """Mmproj converter registered for the "Phi4ForCausalLMV" architecture.
+
+    Exports the vision tower with its last encoder block removed, plus projector
+    layers 0 and 2, and tags the GGUF with the PHI4 projector type.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Derive the exported layer count, patch/image size and pixel limits.
+
+        Raises ValueError when fewer than 2 vision layers are found or the position
+        embedding length is not a perfect square, and KeyError when patch_size, the
+        position embedding tensor, or min/max_num_patches cannot be found.
+        """
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+
+        self.vision_total_layers = int(self.find_vparam(self.n_block_keys))
+        if self.vision_total_layers < 2:
+            raise ValueError(
+                f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}"
+            )
+
+        # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and
+        # drop post-layernorm/head weights. This makes the GGUF runtime output match
+        # the feature map consumed by the patched siglip.cpp Phi-4 projector path.
+        self.vision_export_layers = self.vision_total_layers - 1
+        self.vision_last_layer_idx = self.vision_total_layers - 1
+
+        # overwrite the first block-count key present so the stored hparams reflect the reduced depth
+        for key in self.n_block_keys:
+            if key in self.hparams_vision:
+                self.hparams_vision[key] = self.vision_export_layers
+                break
+
+        # rebuild the tensor name map for the reduced number of blocks
+        self.block_count = self.vision_export_layers
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
+
+        patch_size = self.preprocessor_config.get("patch_size")
+        if patch_size is None:
+            raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json")
+
+        self.hparams_vision["patch_size"] = patch_size
+
+        pos_emb_name = next(
+            (
+                name for name in self.model_tensors
+                if name.endswith("vision_model.embeddings.position_embedding.weight")
+            ),
+            None,
+        )
+        if pos_emb_name is None:
+            raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight")
+
+        # the first dimension of the position embedding is treated as a square grid of tokens
+        pos_emb_shape = self.model_tensors[pos_emb_name]().shape
+        base_grid_tokens = int(pos_emb_shape[0])
+        grid_side = math.isqrt(base_grid_tokens)
+        if grid_side * grid_side != base_grid_tokens:
+            raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}")
+
+        self.hparams_vision["image_size"] = grid_side * patch_size
+
+        # preprocessor_config takes precedence; global_config is the fallback
+        min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches"))
+        max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches"))
+        if min_num_patches is None or max_num_patches is None:
+            raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches")
+
+        # pixel limits = patch count * pixels per patch
+        self.min_pixels = int(min_num_patches) * patch_size * patch_size
+        self.max_pixels = int(max_num_patches) * patch_size * patch_size
+
+    def set_gguf_parameters(self):
+        """Write the parent's parameters plus the PHI4 projector type, pixel limits, GELU flag and layernorm eps."""
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4)
+        self.gguf_writer.add_vision_min_pixels(self.min_pixels)
+        self.gguf_writer.add_vision_max_pixels(self.max_pixels)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Yield the vision-tower and projector tensors to export; all other tensors yield nothing.
+
+        Vision-tower tensors are renamed to the "vision_tower." prefix and passed to the
+        parent implementation, except head, post_layernorm and last-block weights, which
+        are dropped. Projector tensors with index 0 or 2 are emitted as V_MMPROJ.
+        """
+        if name.startswith(("model.vision_tower.vision_tower.", "vision_tower.")):
+            if ".vision_model.head." in name:
+                return
+
+            new_name = name.replace("model.vision_tower.vision_tower.", "vision_tower.")
+
+            if ".vision_model.post_layernorm." in new_name:
+                return
+
+            # the final encoder block is not exported (see __init__)
+            if bid is not None and bid == self.vision_last_layer_idx:
+                return
+
+            if new_name.endswith("vision_model.embeddings.patch_embedding.weight"):
+                assert self.hparams_vision is not None
+                if data_torch.ndim != 2:
+                    raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}")
+
+                patch_area = self.hparams_vision["patch_size"] ** 2
+                in_features = data_torch.shape[1]
+                if in_features % patch_area != 0:
+                    raise ValueError(
+                        f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}"
+                    )
+
+                # reshape the 2-D weight to (out, patch, patch, channels), then move channels to dim 1
+                num_channels = in_features // patch_area
+                patch_size = self.hparams_vision["patch_size"]
+                data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels)
+                data_torch = data_torch.permute(0, 3, 1, 2)
+
+            yield from super().modify_tensors(data_torch, new_name, bid)
+            return
+
+        if name.startswith(("model.mm_projector.", "mm_projector.")):
+            local_name = name
+            local_name = local_name.replace("model.mm_projector.", "")
+            local_name = local_name.replace("mm_projector.", "")
+
+            # only projector entries 0 and 2 are exported
+            if not (local_name.startswith("0.") or local_name.startswith("2.")):
+                return
+
+            suffix = ".bias" if local_name.endswith(".bias") else ".weight"
+            mm_idx = int(local_name.split(".", maxsplit=1)[0])
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch)
+            return
+
+        return
+

@ModelBase.register("PhiMoEForCausalLM")
 class PhiMoeModel(Phi3MiniModel):
@@ -11,6 +11,10 @@ endif()
 list(APPEND CMAKE_PREFIX_PATH  ${ROCM_PATH})
 list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")

+# Default the HIP debug flags to "-g -O2" unless CMAKE_HIP_FLAGS_DEBUG is already defined
+if (NOT DEFINED CMAKE_HIP_FLAGS_DEBUG)
+    set(CMAKE_HIP_FLAGS_DEBUG "-g -O2")
+endif()
+
 # CMake on Windows doesn't support the HIP language yet
 if (WIN32)
    set(CXX_IS_HIPCC TRUE)
@@ -132,6 +132,7 @@ set(GGML_OPENCL_KERNELS
    ssm_conv
    sub
    sum_rows
+    cumsum
    transpose
    concat
    tsembd
@@ -547,6 +547,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
    cl_kernel kernel_argsort_f32_i32;
    cl_kernel kernel_sum_rows_f32, kernel_sum_rows_f32_4;
+    cl_kernel kernel_cumsum_blk, kernel_cumsum_add;
    cl_kernel kernel_repeat_f32;
    cl_kernel kernel_pad;
    cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
@@ -1927,6 +1928,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+    // cumsum
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "cumsum.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("cumsum.cl");
+#endif
+        cl_program prog;
+        prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_cumsum_blk = clCreateKernel(prog, "kernel_cumsum_blk", &err), err));
+        CL_CHECK((backend_ctx->kernel_cumsum_add = clCreateKernel(prog, "kernel_cumsum_add", &err), err));
+        GGML_LOG_CONT(".");
+        CL_CHECK(clReleaseProgram(prog));
+    }
+
    // sigmoid
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3803,6 +3822,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
            return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
        }
        case GGML_OP_SUM_ROWS:
+        case GGML_OP_CUMSUM:
+            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
        case GGML_OP_MEAN:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_FLASH_ATTN_EXT:
@@ -5775,19 +5796,12 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

-    const int      ne00 = src0->ne[0];
-    const cl_ulong nb01 = src0->nb[1];
-    const cl_ulong nb02 = src0->nb[2];
-    const cl_ulong nb03 = src0->nb[3];
-    const int      ne10 = src1->ne[0];
-    const cl_ulong nb10 = src1->nb[0];
-    const int      ne11 = src1->ne[1];
-    const int      ne12 = src1->ne[2];
-    const cl_ulong nb11 = src1->nb[1];
-    const cl_ulong nb12 = src1->nb[2];
-    const cl_ulong nb1  = dst->nb[1];
-    const cl_ulong nb2  = dst->nb[2];
-    const cl_ulong nb3  = dst->nb[3];
+    GGML_TENSOR_LOCALS(int,      ne0, src0, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+    GGML_TENSOR_LOCALS(int,      ne1, src1, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
+    GGML_TENSOR_LOCALS(int,      ne,  dst,  ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb,  dst,  nb);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

@@ -5833,8 +5847,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));

-    size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
-    size_t local_work_size[] = {64, 1, 1};
+    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
+    int nth = 1;
+    while (nth < ne00 && 2*nth <= max_workgroup_size) {
+        nth *= 2;
+    }
+
+    size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
@@ -11949,6 +11969,118 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }

+// Cumulative sum along dim 0 of a contiguous src0 into dst (src1 is unused).
+//
+// Pass 1 (kernel_cumsum_blk): every workgroup scans one block of `nth` elements of
+// a row and writes the result to dst. When a row spans several blocks, the total
+// of each block is also written to a temporary buffer.
+// Pass 2 (kernel_cumsum_blk on the temporary buffer): scans the block totals.
+// Pass 3 (kernel_cumsum_add): adds the scanned totals to the partial sums in dst.
+//
+// Rows longer than nth*nth elements are not supported (asserted below).
+static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    GGML_TENSOR_LOCALS(int,      ne0, src0, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
+
+    cl_kernel kernel = backend_ctx->kernel_cumsum_blk;
+
+    // largest power of two that does not exceed the kernel's workgroup limit,
+    // stopping early once it covers a whole row
+    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
+    int nth = 1;
+    while (nth < ne00 && 2*nth <= max_workgroup_size) {
+        nth *= 2;
+    }
+
+    // the block totals of one row must themselves fit into a single workgroup
+    GGML_ASSERT(ne00 <= nth*nth);
+
+    // shape of the temporary tensor holding one total per block
+    const int net0 = CEIL_DIV(ne00, nth);
+    const int net1 = ne01;
+    const int net2 = ne02;
+    const int net3 = ne03;
+
+    // byte strides of the temporary tensor
+    const cl_ulong nbt0 = sizeof(float);
+    const cl_ulong nbt1 = net0*nbt0;
+    const cl_ulong nbt2 = net1*nbt1;
+    const cl_ulong nbt3 = net2*nbt2;
+
+    // widen before multiplying so the element count cannot overflow int
+    static ggml_cl_buffer tmp_buffer;
+    tmp_buffer.allocate(backend_ctx->context, (size_t)net0*ne01*ne02*ne03*sizeof(float));
+
+    CL_CHECK(clSetKernelArg(kernel,   0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,   1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,   2, sizeof(cl_mem),   &tmp_buffer.buffer));
+    CL_CHECK(clSetKernelArg(kernel,   3, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,   4, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,   5, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,   6, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,   7, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,   8, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel,   9, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel,  10, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel,  11, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel,  12, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel,  13, sizeof(int),      &net0));
+    CL_CHECK(clSetKernelArg(kernel,  14, sizeof(int),      &net1));
+    CL_CHECK(clSetKernelArg(kernel,  15, sizeof(int),      &net2));
+
+    // one workgroup per (block, row) pair in dim 0
+    size_t global_work_size[] = { (size_t)nth*net0*ne01, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = { (size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+    if(ne00 > nth) {
+        // if a single workgroup cannot handle an entire row, each workgroup
+        // computes a partial sum and stores to dst, tmp_buffer contains the sum
+        // of the each workgroup; cumsum this buffer and add to the partial sums in dst
+        cl_ulong offsett = 0;
+        kernel = backend_ctx->kernel_cumsum_blk;
+        CL_CHECK(clSetKernelArg(kernel,   0, sizeof(cl_mem),   &tmp_buffer.buffer));
+        CL_CHECK(clSetKernelArg(kernel,   1, sizeof(cl_ulong), &offsett));
+        CL_CHECK(clSetKernelArg(kernel,   2, sizeof(cl_mem),   &tmp_buffer.buffer));
+        CL_CHECK(clSetKernelArg(kernel,   3, sizeof(cl_mem),   &tmp_buffer.buffer));
+        CL_CHECK(clSetKernelArg(kernel,   4, sizeof(cl_ulong), &offsett));
+        CL_CHECK(clSetKernelArg(kernel,   5, sizeof(int),      &net0));
+        CL_CHECK(clSetKernelArg(kernel,   6, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel,   7, sizeof(int),      &ne02));
+        CL_CHECK(clSetKernelArg(kernel,   8, sizeof(int),      &ne03));
+        CL_CHECK(clSetKernelArg(kernel,   9, sizeof(cl_ulong), &nbt0));
+        CL_CHECK(clSetKernelArg(kernel,  10, sizeof(cl_ulong), &nbt1));
+        CL_CHECK(clSetKernelArg(kernel,  11, sizeof(cl_ulong), &nbt2));
+        CL_CHECK(clSetKernelArg(kernel,  12, sizeof(cl_ulong), &nbt3));
+        CL_CHECK(clSetKernelArg(kernel,  13, sizeof(int),      &net0));
+        CL_CHECK(clSetKernelArg(kernel,  14, sizeof(int),      &net1));
+        CL_CHECK(clSetKernelArg(kernel,  15, sizeof(int),      &net2));
+
+        // net0 <= nth (asserted above), so one workgroup per row covers all block totals
+        size_t global_work_size_1[] = { (size_t)net1*nth, (size_t)net2, (size_t)net3};
+        size_t local_work_size_1[] = { (size_t)nth, 1, 1};
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_1, local_work_size_1, dst);
+
+        // kernel_cumsum_add declares its tmp strides as 32-bit `uint`. Pass values of
+        // exactly that type instead of the leading 4 bytes of the 64-bit cl_ulong
+        // strides, which only happened to hold the right value on little-endian hosts.
+        const cl_uint nbt0_u32 = (cl_uint)nbt0;
+        const cl_uint nbt1_u32 = (cl_uint)nbt1;
+        const cl_uint nbt2_u32 = (cl_uint)nbt2;
+        const cl_uint nbt3_u32 = (cl_uint)nbt3;
+
+        kernel = backend_ctx->kernel_cumsum_add;
+        CL_CHECK(clSetKernelArg(kernel,   0, sizeof(cl_mem),   &tmp_buffer.buffer));
+        CL_CHECK(clSetKernelArg(kernel,   1, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,   2, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel,   3, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,   4, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel,   5, sizeof(int),      &ne02));
+        CL_CHECK(clSetKernelArg(kernel,   6, sizeof(int),      &ne03));
+        CL_CHECK(clSetKernelArg(kernel,   7, sizeof(cl_uint),  &nbt0_u32));
+        CL_CHECK(clSetKernelArg(kernel,   8, sizeof(cl_uint),  &nbt1_u32));
+        CL_CHECK(clSetKernelArg(kernel,   9, sizeof(cl_uint),  &nbt2_u32));
+        CL_CHECK(clSetKernelArg(kernel,  10, sizeof(cl_uint),  &nbt3_u32));
+
+        size_t global_work_size_2[] = { (size_t)nth*net0*ne01, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size_2[] = { (size_t)nth, 1, 1};
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_2, local_work_size_2, dst);
+    }
+}
+
 static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
@@ -12391,6 +12523,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            }
            func = ggml_cl_sum_rows;
            break;
+        case GGML_OP_CUMSUM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_cumsum;
+            break;
        case GGML_OP_FLASH_ATTN_EXT:
            if (!any_on_device) {
                return false;
@@ -0,0 +1,139 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+// max workgroup size is usually 1024, this covers various subgroups sizes
+#define MAX_SUBGROUPS 128
+
+// Inclusive scan of one block of `nth` (= workgroup size) consecutive floats of a row.
+//
+// Group id 0 packs (block index ib, row i01) as i1 = ib*ne01 + i01; group ids 1 and 2
+// select i02 and i03. The scanned block is written to dst, which is indexed as a
+// densely packed ne00 x ne01 x ne02 x ne03 float tensor. When the row is longer than
+// one workgroup (ne00 > nth), the block total is additionally stored in tmp at
+// [i03][i02][i01][ib], using net0/net1/net2 as the tmp extents.
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_32
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_cumsum_blk(
+        global char * src0,
+        ulong offset0,
+        global char * tmp,
+        global char * dst,
+        ulong offsetd,
+        int   ne00,
+        int   ne01,
+        int   ne02,
+        int   ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        uint net0,
+        uint net1,
+        uint net2
+) {
+    src0 = src0 + offset0;
+    dst  = dst + offsetd;
+
+    const int i3 = get_group_id(2);
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);
+
+    const int nth = get_local_size(0);
+    const int tid = get_local_id(0);
+
+    const uint sg_size = get_sub_group_size();
+    const uint sg_id = get_sub_group_id();
+    const uint sg_lid = get_sub_group_local_id();
+
+    // ib: block index within the row, i00: first element of that block
+    const int ib = i1 / ne01;
+    const int i00 = ib * nth;
+    const int i01 = i1 % ne01;
+    const int i02 = i2;
+    const int i03 = i3;
+
+    global const float * src0_row = (global const float *)(src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    global       float * tmp_row  = (global float *)tmp + net0 * i01 + net0 * net1 * i02 + net0 * net1 * net2 * i03;
+    global       float * dst_row  = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    // one slot per subgroup, holding that subgroup's total and later its offset
+    __local float partial[MAX_SUBGROUPS];
+
+    // threads past the end of the row contribute 0
+    float v = 0.0f;
+    if (i00 + tid < ne00) {
+        v = src0_row[i00 + tid];
+    }
+
+    // step 1: inclusive scan inside each subgroup; the last lane holds the subgroup total
+    float s = sub_group_scan_inclusive_add(v);
+    if (sg_lid == sg_size - 1) {
+        partial[sg_id] = s;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // step 2: subgroup 0 turns the subgroup totals into exclusive prefix offsets
+    // NB: subgroup size should be larger than number of subgroups
+    // assuming max workgroup size of 1024, subgroup size should be >= 32
+    if (sg_id == 0) {
+        float x = 0.0f;
+        if (sg_lid < get_num_sub_groups()) {
+            x = partial[sg_lid];
+        }
+        float ex = sub_group_scan_exclusive_add(x);
+        if (sg_lid < get_num_sub_groups()) {
+            partial[sg_lid] = ex;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // step 3: add the sum of all preceding subgroups to get the block-wide scan
+    s += partial[sg_id];
+
+    if (i00 + tid < ne00) {
+        dst_row[i00 + tid] = s;
+    }
+    // the last thread holds the block total; only stored when the row spans several blocks
+    if (ne00 > nth && tid == nth - 1) {
+        tmp_row[ib] = s;
+    }
+}
+
+// Adds tmp_row[ib - 1] to every element of block ib > 0 of a row in dst.
+// The host is expected to have filled tmp with the running totals of the blocks.
+// nbt1..nbt3 are byte strides into tmp. dst is indexed as a densely packed
+// ne00 x ne01 x ne02 x ne03 float tensor.
+kernel void kernel_cumsum_add(
+        global char * tmp,
+        global char * dst,
+        ulong offsetd,
+        int   ne00,
+        int   ne01,
+        int   ne02,
+        int   ne03,
+        uint nbt0,
+        uint nbt1,
+        uint nbt2,
+        uint nbt3
+) {
+    // group id 0 packs (block index, row): group = blk * ne01 + row
+    const int group = get_group_id(0);
+    const int blk   = group / ne01;
+    if (blk == 0) {
+        // the first block of a row has no preceding total to add
+        return;
+    }
+
+    const int row   = group % ne01;
+    const int plane = get_group_id(1);
+    const int batch = get_group_id(2);
+
+    const int lsz = get_local_size(0);
+    const int lid = get_local_id(0);
+    const int col = blk * lsz + lid;
+
+    global float * sums = (global float *)(tmp + nbt1 * row + nbt2 * plane + nbt3 * batch);
+    global float * out  = (global float *)(dst + offsetd) + batch*ne02*ne01*ne00 + plane*ne01*ne00 + row*ne00;
+
+    if (col < ne00) {
+        out[col] += sums[blk - 1];
+    }
+}
@@ -27,6 +27,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #include <iostream>
 #include <tuple>
 #include <vector>
+#include <deque>
 #include <sstream>
 #include <utility>
 #include <memory>
@@ -188,6 +189,11 @@ struct ggml_backend_vk_buffer_type_context {

 struct vk_queue;

+// A command buffer together with a flag telling whether it is currently handed out.
+struct vk_command_buffer {
+    // underlying Vulkan command buffer handle
+    vk::CommandBuffer buf;
+    // true while handed out; cleared when the owning pool is cleaned up
+    bool in_use{false};
+};
+
 // Stores command pool/buffers. There's an instance of this
 // for each (context,queue) pair and for each (device,queue) pair.
 struct vk_command_pool {
@@ -195,10 +201,16 @@ struct vk_command_pool {
    void destroy(vk::Device& device);

    vk::CommandPool pool;
-    uint32_t cmd_buffer_idx;
-    std::vector<vk::CommandBuffer> cmd_buffers;
+    // Using deque so the pointers to command buffers
+    // remain valid even if we add more
+    std::deque<vk_command_buffer> cmd_buffers;

    vk_queue *q;
+
+    size_t buffers_in_use() const {
+        return std::count_if(cmd_buffers.begin(), cmd_buffers.end(),
+            [](const auto& cb) { return cb.in_use; });
+    }
 };

 // Prevent simultaneous submissions to the same queue.
@@ -878,10 +890,12 @@ struct vk_device_struct {
 };

 void vk_command_pool::init(vk_device& device, vk_queue *q_) {
-    cmd_buffer_idx = 0;
+    cmd_buffers.clear();
    q = q_;

-    vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
+    vk::CommandPoolCreateInfo command_pool_create_info(
+        vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT),
+        q->queue_family_index);
    pool = device->device.createCommandPool(command_pool_create_info);
 }

@@ -929,6 +943,7 @@ struct vk_subbuffer {
 struct vk_event {
    vk::Event event;
    vk::Fence fence;
+    vk_command_buffer* cmd_buffer = nullptr;
 };

 struct vk_semaphore {
@@ -937,7 +952,7 @@ struct vk_semaphore {
 };

 struct vk_submission {
-    vk::CommandBuffer buffer;
+    vk_command_buffer* buffer = nullptr;
    std::vector<vk_semaphore> wait_semaphores;
    std::vector<vk_semaphore> signal_semaphores;
 };
@@ -2283,25 +2298,15 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
    }
 }

-static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
+static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
    VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
-
-    if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
-        // Reuse command buffer
-        return p.cmd_buffers[p.cmd_buffer_idx++];
-    }
-
    vk::CommandBufferAllocateInfo command_buffer_alloc_info(
        p.pool,
        vk::CommandBufferLevel::ePrimary,
        1);
    const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
-    auto buf = cmd_buffers.front();
-
-    p.cmd_buffers.push_back(buf);
-    p.cmd_buffer_idx++;
-
-    return buf;
+    p.cmd_buffers.push_back({ cmd_buffers.front(), true });
+    return &p.cmd_buffers[p.cmd_buffers.size()-1];
 }

 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
@@ -2368,7 +2373,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
                tl_wait_semaphores[idx].data(),
                stage_flags[idx].data(),
                1,
-                &submission.buffer,
+                &submission.buffer->buf,
                (uint32_t) submission.signal_semaphores.size(),
                tl_signal_semaphores[idx].data(),
            };
@@ -2492,7 +2497,11 @@ static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p)

    // Requires command buffers to be done
    device->device.resetCommandPool(p.pool);
-    p.cmd_buffer_idx = 0;
+    // Don't clear the command buffers and mark them as not in use.
+    // This allows us to reuse them
+    for (auto& cmd_buffer : p.cmd_buffers) {
+        cmd_buffer.in_use = false;
+    }
 }

 static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
@@ -2501,10 +2510,10 @@ static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
    // Arbitrary frequency to cleanup/reuse command buffers
    static constexpr uint32_t cleanup_frequency = 10;

-    if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
+    if (device->compute_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
        ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
    }
-    if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
+    if (device->transfer_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
        ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
    }
 }
@@ -2752,7 +2761,7 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
        ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
    }

-    subctx->s->buffer.pipelineBarrier(
+    subctx->s->buffer->buf.pipelineBarrier(
        subctx->p->q->stage_flags,
        subctx->p->q->stage_flags,
        {},
@@ -2768,7 +2777,7 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
 static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
    VK_LOG_DEBUG("ggml_vk_set_event()");

-    ctx->s->buffer.setEvent(
+    ctx->s->buffer->buf.setEvent(
        event,
        ctx->p->q->stage_flags
    );
@@ -2780,7 +2789,7 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
        return;
    }

-    ctx->s->buffer.waitEvents(
+    ctx->s->buffer->buf.waitEvents(
        events,
        ctx->p->q->stage_flags,
        ctx->p->q->stage_flags,
@@ -6348,13 +6357,24 @@ static vk_subbuffer ggml_vk_tensor_subbuffer(
    return vk_subbuffer{buffer, offset, size};
 }

+// Hands out a command buffer from `pool`. The first entry of pool.cmd_buffers
+// that is not marked in_use is claimed (in_use set to true) and returned; when
+// every entry is busy a new one is obtained from ggml_vk_create_cmd_buffer.
+// NOTE(review): the returned pointer refers to an element of pool.cmd_buffers,
+// so this presumably relies on that container keeping element addresses
+// stable while buffers are handed out -- confirm against its declaration.
+static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) {
+    for (auto it = pool.cmd_buffers.begin(); it != pool.cmd_buffers.end(); ++it) {
+        if (it->in_use) {
+            continue;
+        }
+        it->in_use = true;
+        return &*it;
+    }
+    // Nothing reusable was found: fall back to creating a fresh command buffer.
+    return ggml_vk_create_cmd_buffer(device, pool);
+}
+
 static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
    vk_submission s;
-    s.buffer = ggml_vk_create_cmd_buffer(device, p);
+    s.buffer = ggml_vk_get_or_create_cmd_buffer(device, p);
    if (one_time) {
-        s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+        s.buffer->buf.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
    } else {
-        s.buffer.begin({ vk::CommandBufferUsageFlags{} });
+        s.buffer->buf.begin({ vk::CommandBufferUsageFlags{} });
    }

    return s;
@@ -6407,18 +6427,18 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
    vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
    ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});

-    subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
-    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
-    subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+    subctx->s->buffer->buf.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
+    subctx->s->buffer->buf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
+    subctx->s->buffer->buf.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
                                pipeline->layout,
                                0,
                                { descriptor_set },
                                {});
-    subctx->s->buffer.dispatch(wg0, wg1, wg2);
+    subctx->s->buffer->buf.dispatch(wg0, wg1, wg2);
 }

 static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-    s.buffer.end();
+    s.buffer->buf.end();

    s.wait_semaphores = std::move(wait_semaphores);
    s.signal_semaphores = std::move(signal_semaphores);
@@ -6430,7 +6450,7 @@ static void ggml_vk_ctx_end(vk_context& ctx) {
        return;
    }

-    ctx->s->buffer.end();
+    ctx->s->buffer->buf.end();
    ctx->s = nullptr;
 }

@@ -6584,7 +6604,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
        }

        ggml_vk_sync_buffers(ctx, subctx);
-        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
+        subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
        return;
    }

@@ -6599,7 +6619,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
    VkBufferCopy buf_copy{ 0, offset, copy_size };

    ggml_vk_sync_buffers(ctx, subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);

    for (uint64_t i3 = 0; i3 < ne3; i3++) {
        for (uint64_t i2 = 0; i2 < ne2; i2++) {
@@ -6648,7 +6668,7 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
        }

        ggml_vk_sync_buffers(nullptr, subctx);
-        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
+        subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
        return true;
    }
    VK_LOG_DEBUG("STAGING");
@@ -6670,7 +6690,7 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
        copy_size};

    ggml_vk_sync_buffers(nullptr, subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);

    if (width == spitch) {
        deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
@@ -6756,7 +6776,7 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
    if (buf != nullptr) {
        // Memory is pinned, use as staging buffer
        ggml_vk_sync_buffers(nullptr, subctx);
-        subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices);
+        subctx->s->buffer->buf.copyBuffer(src->buffer, buf->buffer, slices);

        return true;
    }
@@ -6774,7 +6794,7 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
    vk_buffer& staging_buffer = src->device->sync_staging;

    ggml_vk_sync_buffers(nullptr, subctx);
-    subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices);
+    subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, slices);

    deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
    return true;
@@ -6821,7 +6841,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds

    VkBufferCopy bc{ src_offset, dst_offset, size };

-    vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
+    vkCmdCopyBuffer(ctx->s->buffer->buf, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
 }

 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
@@ -6859,7 +6879,7 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
    }

    // Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
-    ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
+    ctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
 }

 static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
@@ -6874,7 +6894,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
    std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
    vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
    ggml_vk_ctx_begin(dst->device, subctx);
-    subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
+    subctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
    ggml_vk_ctx_end(subctx);

    ggml_vk_submit(subctx, dst->device->fence);
@@ -8820,7 +8840,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    }

    // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively.
-    bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768;
+    bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768 && nem0 >= tuning_params.block_cols * 16;
    vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc,
                                                                   mask != nullptr, use_mask_opt, logit_softcap != 0);

@@ -12682,7 +12702,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr

            if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
                ctx->query_node_idx[ctx->query_idx] = node_idx;
-                compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+                compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
            }
        }
        // Add all fused nodes to the unsynchronized lists.
@@ -13521,7 +13541,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
        buffer_cpy.dstOffset = dst_offset;
        buffer_cpy.size = size;

-        cpy_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
+        cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
        deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
        ggml_vk_synchronize(ctx);
    }
@@ -13555,7 +13575,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
        buffer_cpy.dstOffset = 0;
        buffer_cpy.size = size;

-        compute_ctx->s->buffer.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
+        compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
        deferred_memcpy(data, ctx->sync_staging->ptr, size, &compute_ctx->out_memcpys);
        ggml_vk_synchronize(ctx);
    }
@@ -13633,8 +13653,12 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
    }

    vk_context compute_ctx;
+    vk_command_buffer* cmd_buf = nullptr;
    if (do_transfer) {
        compute_ctx = ctx->compute_ctx.lock();
+        if (compute_ctx->s) {
+            cmd_buf = compute_ctx->s->buffer;
+        }

        ggml_vk_ctx_end(compute_ctx);

@@ -13668,6 +13692,9 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
        }
        ggml_vk_wait_for_fence(ctx);
        ctx->submit_pending = false;
+        if (cmd_buf) {
+            cmd_buf->in_use = false;
+        }
    }

    if (do_transfer) {
@@ -14157,7 +14184,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
        GGML_ASSERT(ctx->compute_ctx.expired());
        compute_ctx = ggml_vk_get_compute_ctx(ctx);
        ctx->query_idx = 0;
-        compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+        compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
    }

    ctx->prealloc_y_last_pipeline_used = nullptr;
@@ -14393,7 +14420,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                // track a single node/fusion for the current query
                ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
                ctx->query_fusion_names[ctx->query_idx] = fusion_string;
-                compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+                compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
            } else {
                // track a fusion string and number of fused ops for the current node_idx
                ctx->query_fusion_names[i] = fusion_string;
@@ -14726,6 +14753,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev
    ggml_vk_submit_transfer_ctx(ctx);

    vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx);
+    auto* cmd_buf = compute_ctx->s->buffer; // retrieve pointer before it gets reset

    // the backend interface doesn't have an explicit reset, so reset it here
    // before we record the command to set it
@@ -14738,6 +14766,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev

    ggml_vk_submit(compute_ctx, {vkev->fence});
    ctx->submit_pending = true;
+    vkev->cmd_buffer = cmd_buf;
    ctx->compute_ctx.reset();
 }

@@ -15557,6 +15586,10 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
    vk_event *vkev = (vk_event *)event->context;

    VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
+    // Finished using current command buffer so we flag for reuse
+    if (vkev->cmd_buffer) {
+        vkev->cmd_buffer->in_use = false;
+    }
 }

 static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
@@ -16028,7 +16061,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
            tensor_clone = ggml_arange(ggml_ctx, start, stop, step);
        } else if (tensor->op == GGML_OP_FILL) {
            const float value = ggml_get_op_params_f32(tensor, 0);
-            tensor_clone = ggml_fill(ggml_ctx, tensor_clone, value);
+            tensor_clone = ggml_fill(ggml_ctx, src_clone[0], value);
        } else if (tensor->op == GGML_OP_SQR) {
            tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
        } else if (tensor->op == GGML_OP_SQRT) {
@@ -33,6 +33,61 @@ layout (push_constant) uniform parameter {
 shared float minsh[NUM_SUBGROUPS];
 shared float maxsh[NUM_SUBGROUPS];

+float FLT_MAX_OVER_2 = uintBitsToFloat(0x7EFFFFFF);
+
+// Scans the 16 Br x Bc mask tiles handled by this workgroup (tile column
+// i0 * 16 + block_x, tile row i1, batch indices i2 / i3) and ORs a two-bit code
+// per tile into `result`:
+//   bit (2*block_x)     set when the tile maximum is <= -FLT_MAX_OVER_2
+//   bit (2*block_x + 1) set when the tile minimum and maximum are both 0.0
+// Only invocation 0 of the workgroup (tid == 0) updates `result`.
+// When need_bounds_check is false every load is a 4-wide load through data_av4.
+// When it is true, a 4-wide load whose last element would fall at or beyond
+// nem0 is replaced by per-element loads through data_a, each guarded by nem0.
+// A tile with no in-bounds element keeps max_v at its initial -FLT_MAX_OVER_2
+// and is therefore reported with bit (2*block_x) set.
+void loadvec4(inout uint result, const uint i0, const uint i1, const uint i2, const uint i3, const bool need_bounds_check) {
+    const uint tid = gl_LocalInvocationIndex;
+
+    [[unroll]] for (uint block_x = 0; block_x < 16; ++block_x) {
+        float min_v = FLT_MAX_OVER_2;
+        float max_v = -FLT_MAX_OVER_2;
+        [[unroll]] for (uint i = 0; i < Br * Bc / 4; i += BLOCK_SIZE) {
+            // Each invocation covers one group of 4 consecutive columns per step.
+            uint j0 = (i + tid) % (Bc / 4);
+            uint j1 = (i + tid) / (Bc / 4);
+
+            j0 *= 4;
+            j0 += (i0 * 16 + block_x) * Bc;
+            j1 += i1 * Br;
+
+            if (!need_bounds_check || j0 + 3 < nem0) {
+                vec4 f = vec4(data_av4[(j0 + j1 * nbm1 + i2 * nbm2 + i3 * nbm3) / 4]);
+                [[unroll]] for (int c = 0; c < 4; ++c) {
+                    min_v = min(min_v, f[c]);
+                    max_v = max(max_v, f[c]);
+                }
+            } else {
+                [[unroll]] for (int c = 0; c < 4; ++c) {
+                    if (j0 + c < nem0) {
+                        // The element index must include c; without it element j0
+                        // would be read four times and j0+1..j0+3 never inspected.
+                        float f = float(data_a[j0 + c + j1 * nbm1 + i2 * nbm2 + i3 * nbm3]);
+                        min_v = min(min_v, f);
+                        max_v = max(max_v, f);
+                    }
+                }
+            }
+        }
+        // Reduce within each subgroup, then across subgroups through shared memory.
+        min_v = subgroupMin(min_v);
+        max_v = subgroupMax(max_v);
+        if (gl_SubgroupInvocationID == 0) {
+            minsh[gl_SubgroupID] = min_v;
+            maxsh[gl_SubgroupID] = max_v;
+        }
+        barrier();
+        if (tid == 0) {
+            [[unroll]] for (uint i = 0; i < NUM_SUBGROUPS; ++i) {
+                min_v = min(min_v, minsh[i]);
+                max_v = max(max_v, maxsh[i]);
+            }
+            if (max_v <= -FLT_MAX_OVER_2) {
+                result |= 1 << (2*block_x);
+            }
+            if (min_v == 0.0f && max_v == 0.0f) {
+                result |= 2 << (2*block_x);
+            }
+        }
+        // Keep minsh/maxsh from being overwritten by the next tile before tid 0 has read them.
+        barrier();
+    }
+}
+
 // For each Br x Bc block of the mask (input) buffer, read all values and check
 // if it's all -inf or all zero. Write out a two-bit code indicating which it is
 // (or zero for neither). Each workgroup processes 16 tiles and writes out a
@@ -48,50 +103,15 @@ void main() {
    const uint i2 = gl_WorkGroupID.z % nem2;
    const uint i3 = gl_WorkGroupID.z / nem2;

-    float FLT_MAX_OVER_2 = uintBitsToFloat(0x7EFFFFFF);
-
    uint result = 0;

    // Fast path for fully in-bounds blocks where we can do f16vec4 loads
    if ((nem0 % Bc) == 0 && (nem1 % Br) == 0 &&
        ((Br * Bc) % (BLOCK_SIZE * 4)) == 0) {
-        [[unroll]] for (uint block_x = 0; block_x < 16; ++block_x) {
-            float min_v = FLT_MAX_OVER_2;
-            float max_v = -FLT_MAX_OVER_2;
-            [[unroll]] for (uint i = 0; i < Br * Bc / 4; i += BLOCK_SIZE) {
-                uint j0 = (i + tid) % (Bc / 4);
-                uint j1 = (i + tid) / (Bc / 4);
-
-                j0 *= 4;
-                j0 += (i0 * 16 + block_x) * Bc;
-                j1 += i1 * Br;
-
-                vec4 f = vec4(data_av4[(j0 + j1 * nbm1 + i2 * nbm2 + i3 * nbm3) / 4]);
-                [[unroll]] for (int c = 0; c < 4; ++c) {
-                    min_v = min(min_v, f[c]);
-                    max_v = max(max_v, f[c]);
-                }
-            }
-            min_v = subgroupMin(min_v);
-            max_v = subgroupMax(max_v);
-            if (gl_SubgroupInvocationID == 0) {
-                minsh[gl_SubgroupID] = min_v;
-                maxsh[gl_SubgroupID] = max_v;
-            }
-            barrier();
-            if (tid == 0) {
-                [[unroll]] for (uint i = 0; i < NUM_SUBGROUPS; ++i) {
-                    min_v = min(min_v, minsh[i]);
-                    max_v = max(max_v, maxsh[i]);
-                }
-                if (max_v <= -FLT_MAX_OVER_2) {
-                    result |= 1 << (2*block_x);
-                }
-                if (min_v == 0.0f && max_v == 0.0f) {
-                    result |= 2 << (2*block_x);
-                }
-            }
-            barrier();
+        if ((i0 + 1) * 16 * Bc <= nem0) {
+            loadvec4(result, i0, i1, i2, i3, false);
+        } else {
+            loadvec4(result, i0, i1, i2, i3, true);
        }
    } else {
        [[unroll]] for (uint block_x = 0; block_x < 16; ++block_x) {
@@ -36,7 +36,7 @@ void main() {
        barrier();
    }

-    const FLOAT_TYPE scale = inversesqrt(max(sum[0], FLOAT_TYPE(p.param1)));
+    const FLOAT_TYPE scale = 1.0f / max(sqrt(sum[0]), FLOAT_TYPE(p.param1));

    [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
        data_d[i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0] = D_TYPE(scale * FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0]));
@@ -3881,6 +3881,7 @@ class VisionProjectorType:
    GEMMA3 = "gemma3"
    GEMMA3NV = "gemma3nv"
    GEMMA3NA = "gemma3na"
+    PHI4 = "phi4"
    IDEFICS3 = "idefics3"
    PIXTRAL = "pixtral"
    LLAMA4 = "llama4"
@@ -0,0 +1,355 @@
+{#--------TOOL RENDERING FUNCTIONS---------#}
+
+{#---------------------------------------------------------------
+  Converts JSON Schema (dict) to a TypeScript type definition
+----------------------------------------------------------------#}
+{%- macro json_schema_to_typescript(schema, indent="") -%}
+	{#-
+	  Emits a TypeScript-style type string for `schema`, chosen by schema["type"]:
+	    object          : an opening brace, one "<key>[?]: <type>," line per property
+	                      (preceded by "// " comment lines for its description and for
+	                      any of the ADDITIONAL_JSON_KEYS it carries), an optional
+	                      "[key: string]: <type>" index signature, and a closing brace
+	    string          : quoted enum members joined with " | ", "Date" when format is
+	                      date-time or date, otherwise "string"
+	    number, integer : enum members joined with " | ", otherwise "number"
+	    boolean         : "boolean"
+	    array           : "<item type>[]" when "items" is present, otherwise "Array<any>"
+	    anything else   : "any"
+	  `indent` is the prefix written before each property line; nested schemas are
+	  rendered with two extra spaces of indent.
+	-#}
+	{%- set ADDITIONAL_JSON_KEYS = ['format', 'maxItems', 'maximum', 'minItems', 'minimum', 'pattern'] -%}
+	{%- set ty = schema.get("type") -%}
+
+	{# ---------------- OBJECT ---------------- #}
+	{%- if ty == "object" -%}
+		{{- "{\n" -}}
+
+		{# Start building property list #}
+		{%- set props = schema.get("properties", {}) -%}
+		{%- set required = schema.get("required", []) -%}
+		{%- set has_additional_props = schema.get("additionalProperties") is defined -%}
+		{%- set additional_props_type = none -%}
+		{%- if has_additional_props -%}
+			{%- if schema.additionalProperties == true -%}
+				{%- set additional_props_type = {'type': 'any'} -%}
+			{%- elif schema.additionalProperties is mapping -%}
+				{%- set additional_props_type = schema.additionalProperties -%}
+			{%- endif -%}
+		{%- endif -%}
+
+		{%- for key, val in props.items() -%}
+			{# ---------- Description Comments ---------- #}
+			{%- if "description" in val -%}
+				{%- for line in val['description'].split('\n') -%}
+					{%- if line.strip() -%}
+						{{- indent + '// ' + line + '\n' -}}
+					{%- endif -%}
+				{%- endfor -%}
+			{%- endif -%}
+
+			{# ---------- Additional JSON Keys ---------- #}
+			{%- for add_key, add_val in val.items() -%}
+				{%- if add_key in ADDITIONAL_JSON_KEYS -%}
+					{%- if add_val is string -%}
+						{{- indent + '// ' + add_key + ': "' + add_val + '"' + '\n'  -}}
+					{%- else -%}
+						{{- indent + '// ' + add_key + ': ' ~ add_val ~ '\n' -}}
+					{%- endif -%}
+				{%- endif -%}
+			{%- endfor -%}
+
+			{# ---------- Property Definition ---------- #}
+            {%- set type_str = json_schema_to_typescript(
+                val, 
+                indent + "  "
+            ) -%}
+
+			{#- A key that is not listed in `required` is marked optional with a trailing "?". -#}
+			{{- indent + key + ('' if key in required else '?') + ': ' + type_str + ',' -}}
+
+			{#- NOTE(review): "defalut_value" looks like a misspelling of "default_value". It is a lookup key, so confirm against the model's reference template before changing it. -#}
+			{%- if "default" in val or "defalut_value" in val -%}
+				{%- set default = val.get("default", val.get("defalut_value")) -%}
+				{%- if default is string -%}
+					{{- ' // default: "' + default + '"' -}}
+				{%- else -%}
+					{{- ' // default: ' ~ default -}}
+				{%- endif -%}
+			{%- endif -%}
+
+			{{- "\n" -}}
+		{%- endfor -%}
+
+		{# Handle additionalProperties as index signature #}
+		{%- if has_additional_props and additional_props_type is not none -%}
+			{%- set additional_type_str = json_schema_to_typescript(
+				additional_props_type,
+				indent + "  "
+			) -%}
+			{{- indent + '[key: string]: ' + additional_type_str + '\n' -}}
+		{%- endif -%}
+
+		{#- The closing brace is written with `indent` shortened by two characters. -#}
+		{{- indent[: (indent|length - "  "|length) ] + '}' -}}
+
+	{# ---------------- STRING ---------------- #}
+	{%- elif ty == "string" -%}
+		{%- if schema.get("enum") -%}
+            {%- set ns = namespace(enum = []) -%}
+            {%- for en in schema['enum'] -%}
+                {%- set ns.enum = ns.enum + ['"' ~ en ~ '"'] -%}
+            {%- endfor -%}
+			{{- ns.enum | join(' | ') -}}
+		{%- elif schema.get("format", "none") in ['date-time', 'date'] -%}
+			{{- 'Date' -}}
+		{%- else -%}
+			{{- 'string' -}}
+		{%- endif -%}
+
+	{# ---------------- NUMBER / INTEGER ---------------- #}
+	{%- elif ty in ["number", "integer"] -%}
+		{%- if schema.get("enum") -%}
+			{{- schema.enum | join(' | ') -}}
+		{%- else -%}
+			{{- 'number' -}}
+		{%- endif -%}
+
+	{# ---------------- BOOLEAN ---------------- #}
+	{%- elif ty == "boolean" -%}
+		{{- 'boolean' -}}
+
+	{# ---------------- ARRAY ---------------- #}
+	{%- elif ty == "array" -%}
+		{%- if "items" in schema -%}
+			{{- json_schema_to_typescript(schema['items'], indent) + '[]' -}}
+		{%- else -%}
+			{{- 'Array<any>' -}}
+		{%- endif -%}
+
+	{# ---------------- FALLBACK ---------------- #}
+	{%- else -%}
+		{{- 'any' -}}
+	{%- endif -%}
+{%- endmacro -%}
+
+{#---------------------------------------------------------------
+  Renders a namespace and its tool definitions in TypeScript style
+----------------------------------------------------------------#}
+
+{%- macro render_tool_namespace(namespace_name, tools) -%}
+	{#-
+	  Emits "namespace <namespace_name> {", one section per entry of `tools`, and
+	  "} // namespace <namespace_name>", all joined with newlines.
+	  Each section holds the non-blank lines of the tool description as "// " comments
+	  followed by "type <name> = (_: <params>) => <return>;".
+	  The parameter list is "()" when the tool has no parameters with properties, and the
+	  return type is "any" when it has no return_parameters with properties.
+	  The lists are kept on namespace() objects because they are reassigned inside for loops.
+	-#}
+	{%- set ns = namespace(sections = ['namespace ' ~ namespace_name ~ ' {']) -%}
+
+	{%- for tool in tools -%}
+		{#- Accept both a bare function definition and one wrapped in a "function" field. -#}
+		{%- if tool.function -%}
+			{%- set tool = tool.function -%}
+		{%- endif -%}
+
+		{%- set ns_tool = namespace(content_lines=[]) -%}
+
+		{# ---------- TOOL DESCRIPTION ---------- #}
+		{%- if tool.get('description') -%}
+			{%- for line in tool['description'].split('\n') -%}
+				{%- if line.strip() -%}
+					{%- set ns_tool.content_lines = ns_tool.content_lines + ['// ' ~ line] -%}
+				{%- endif -%}
+			{%- endfor -%}
+		{%- endif -%}
+
+		{# ---------- TOOL SIGNATURE ---------- #}
+		{%- set main_body = "" -%}
+		{%- set params = tool.get("parameters") -%}
+		{%- if params and params.get("properties") -%}
+			{%- set param_type = json_schema_to_typescript(params, "  ") -%}
+			{%- set main_body = 'type ' ~ tool.name ~ ' = (_: ' ~ param_type ~ ') => ' -%}
+		{%- else -%}
+			{%- set main_body = 'type ' ~ tool.name ~ ' = () => ' -%}
+		{%- endif -%}
+
+		{# ---------- RETURN TYPE ---------- #}
+		{%- set return_params = tool.get("return_parameters") -%}
+		{%- if return_params and return_params.get("properties") -%}
+			{%- set return_type = json_schema_to_typescript(return_params, "  ") -%}
+			{%- set main_body = main_body ~ return_type -%}
+		{%- else -%}
+			{%- set main_body = main_body ~ 'any' -%}
+		{%- endif -%}
+
+		{#- The signature line carries its own trailing newline, in addition to the newline used to join sections. -#}
+		{%- set main_body = main_body ~ ';\n' -%}
+
+		{%- set ns_tool.content_lines = ns_tool.content_lines + [main_body] -%}
+
+		{# ---------- ADD TOOL TO SECTIONS ---------- #}
+		{%- set ns.sections = ns.sections + [ns_tool.content_lines | join('\n')] -%}
+	{%- endfor -%}
+
+	{%- set ns.sections = ns.sections + ['} // namespace ' ~ namespace_name] -%}
+
+	{{- ns.sections | join('\n') -}}
+{%- endmacro -%}
+
+
+{# ----------- MESSAGE RENDERING HELPER FUNCTIONS ------------ #}
+
+{%- macro render_role_message(message, role=None) -%}
+    {#-
+      Emits one dialog turn: <role> + add_tokens.role_sep + <content> + add_tokens.message_sep.
+      `role` overrides message["role"] when given. A falsy content becomes the empty string and
+      a non-string content is serialized with tojson. `add_tokens` is the template-level
+      namespace holding the two separator strings.
+    -#}
+    {%- if not role -%}
+        {%- set role = message["role"] -%}
+    {%- endif -%}
+
+    {%- set message_content = message['content'] or '' -%}
+    {%- if message_content is not string -%}
+        {%- set message_content = message_content | tojson(ensure_ascii=False) -%}
+    {%- endif -%}
+
+    {{- role + add_tokens.role_sep + message_content + add_tokens.message_sep -}}
+
+{%- endmacro -%}
+
+
+{%- macro render_function_call(message) -%}
+    {#-
+      Renders a tool call as a 'function call' turn whose content is the text
+      {"name": "<name>", "arguments": <arguments>}.
+      message['content'] holds the call, either bare or wrapped in a "function" field.
+      Arguments that are not already a string are serialized with tojson.
+      NOTE(review): the name is concatenated into the JSON text without escaping, so this
+      presumably assumes tool names contain no quotes or backslashes. Confirm.
+    -#}
+    {%- set call = message['content'] -%}
+    {%- if call.function -%}
+        {%- set call = call.function -%}
+    {%- endif -%}
+
+    {%- set arguments = call['arguments'] -%}
+	{%- if arguments is not string -%}
+		{%- set arguments = arguments| tojson(ensure_ascii=False) -%}
+	{%- endif -%}
+
+    {{- render_role_message(
+        {
+            'role': 'function call',
+            'content': '{"name": "' ~ call['name'] ~ '", "arguments": ' ~ arguments ~ '}'
+        }
+    ) -}}
+{%- endmacro -%}
+
+{# ----- SPECIAL TOKENS ----- #}
+
+{#- Separator strings used by render_role_message: role_sep follows the role name, message_sep ends the turn. -#}
+{%- set add_tokens = namespace(
+    role_sep="<|role_sep|>\n", 
+    message_sep="<|message_sep|>\n\n"
+) -%}
+
+{# ----- DEFAULT DEVSYSTEM ----- #}
+
+{%- set DEVSYSTEM -%}
+<role_description>
+Description of the roles available in the dialog.
+
+`developer system`
+A message added by Sber before the main dialog. It has the highest priority and sets global, non-overridable conditions (for example, conversation rules, the safety policy, the assistant's overall response style, etc.).
+
+`system`
+A system instruction added by developers or by the user, but with a lower priority than `developer system`. It usually describes the assistant's instructions, a specific response style, and other conditions for this particular dialog.
+
+`user`
+A message or request from the user. The assistant follows it if it does not conflict with higher-priority instructions (see <instruction_priority>).
+
+`user memory`
+A sequence of the most up-to-date long-term facts about the user at the time of their request, presented as a JSON list of strings. Facts are listed in chronological order, meaning newer facts are appended to the end of the sequence. When facts are changed or deleted, records of previous facts remain in the sequence. The assistant saves facts using a function and uses them in accordance with the <memory_guidelines> block below.
+
+`added files`
+Metadata about files available for use in the dialog, presented in JSON format. It contains the following keys: id (a unique file identifier), name (file name), type (file type).
+
+`assistant`
+The assistant's reply to the user's request. If the system instruction or the user does not set additional rules for `assistant`, this reply must comply with the instructions in the <assistant_guidelines> block below. The list of functions available to call is contained in `function descriptions`. The name of the required function and its arguments will be generated next by the `function call` role. In its replies, the assistant follows the instructions in accordance with <instruction_priority>.
+
+`function descriptions`
+Function descriptions in TypeScript format. A function is a special tool (or a set of instructions) that the assistant can call to perform specific actions, computations, or obtain data needed to solve the user's task. Each function description contains blocks with the name, description, and arguments. Sometimes the description contains separate blocks with return parameters and usage examples that illustrate the correct call and arguments.
+
+`function call`
+The function that `assistant` calls based on the dialog context, and its arguments. The function is invoked in strict accordance with the instructions in the <function_usage> block.
+
+`function result`
+The result of the last function call.
+</role_description>
+
+<available_modalities>
+The assistant can work with the following modalities: text, available functions.
+</available_modalities>
+
+<instruction_priority>
+If instructions from different roles conflict within the dialog context, observe the following priorities:  
+`developer system` > `system` > `user` > `function descriptions` > `function result` > `user memory`
+</instruction_priority>
+
+<function_usage>
+Basic instructions for working with functions.
+
+Only call those functions that are described in `function descriptions`.
+
+Call available functions when, according to their description, such a call will help provide a more complete and/or accurate answer to the user's request. Fill in function arguments using information from the dialog context. If a function could help answer the request but a required argument is missing from the context, ask the user for the missing data before calling the function. If a necessary function is unavailable or an error occurs, briefly inform the user and, if possible, suggest an alternative.
+</function_usage>
+
+<memory_guidelines>
+Rules for using facts in long-term memory:
+
+If there is no message under the `user memory` role in the dialog, this is equivalent to the absence of long-term facts about the user in memory. In that case, information about the user is limited to the current dialog, and no new facts should be saved.
+</memory_guidelines>
+
+<assistant_guidelines>
+You are a helpful assistant.
+
+# Instructions
+- Strictly follow the instruction priority.
+- Maintain a logical chain of reasoning when answering the user's question.
+- For complex questions (for example, STEM), try to answer in detail unless the system message or dialog context limits the response length.
+- Be helpful, truthful, and avoid unsafe or prohibited content in your responses.
+- Try to reply in the language in which the user asked their question.
+</assistant_guidelines>
+
+A dialog will follow below.
+The dialog may include various roles described in the <role_description> block.
+Each turn begins with the role name and a special token that marks the end of the role's full name, and ends with a special end-of-turn token.
+Your task is to continue the dialog from the last specified role in accordance with the dialog context.
+{%- endset -%}
+
+
+{#- ---------------------- RENDERING STARTS HERE ---------------------- -#}
+
+
+{# ----- RENDER BOS TOKEN ----- #}
+{{- bos_token -}}
+
+
+{# ----- RENDER DEVSYSTEM ----- #}
+{#- The built-in DEVSYSTEM text is always emitted first, as a 'developer system' turn. -#}
+{{- render_role_message({"role": "developer system", "content": DEVSYSTEM}) -}}
+
+{# ----- RENDER SYSTEM IF PRESENT ----- #}
+{#- A leading system message is rendered here and dropped from `messages`, so the main loop does not render it again. -#}
+{%- if messages and messages[0]['role'] == 'system' -%}
+    {{- render_role_message(messages[0]) -}}
+    {%- set messages = messages[1:] -%}
+{%- endif -%}
+
+{# ----- RENDER TOOLS ----- #}
+{#- Tool definitions are rendered as a single 'function descriptions' turn placed before the dialog messages. -#}
+{%- if tools -%}
+    {%- set tools_content = (
+        render_tool_namespace('functions', tools) 
+        + "\n\n"
+    ) -%}
+    {{- render_role_message({'role': 'function descriptions', 'content': tools_content}) -}}
+{%- endif -%}
+
+{# ----- MAIN MESSAGE LOOP ----- #}
+{%- for message in messages -%}
+
+    {# ----- TOOL MESSAGE -------#}
+    {#- Messages with role 'tool' are rendered under the role name 'function result'. -#}
+    {%- if message['role'] == 'tool' -%}
+        {{- render_role_message(message, 'function result') -}}
+
+
+    {# ----- ASSISTANT MESSAGE ----- #}
+    {%- elif message['role'] == 'assistant' -%}
+
+        {# ----- FUNCTION CALL PART CHECKING: SINGLE CALL SETUP ----- #}
+        {#- Only the first entry of tool_calls is rendered; any further entries are ignored. -#}
+        {%- if message.tool_calls is defined and message.tool_calls -%}
+            {%- set function_call = message.tool_calls[0] -%}
+        {%- else -%}
+            {%- set function_call = None -%}
+        {%- endif -%}
+
+        {# ----- MAIN ASSISTANT RENDERING ----- #}
+
+        {#- The assistant turn is always emitted, even when its content is empty, and the 'function call' turn follows it. -#}
+        {{- render_role_message({'role': 'assistant', 'content': message.content}) -}}
+        {%- if function_call -%}
+            {{- render_function_call({'role': 'function call', 'content': function_call}) -}}
+        {%- endif -%}
+
+        
+    {# ----- OTHER MESSAGES ----- #}
+    {%- else -%}
+        {{- render_role_message(message) -}}
+    {%- endif -%}
+
+    {# ----- ADDING GENERATION PROMPT ----- #}
+    {#- The prompt is emitted from inside the loop, after the last message, and is skipped when that message is an assistant turn.
+        NOTE(review): when `messages` is empty here (for example, only a system message was supplied) the loop body never runs, so no generation prompt is emitted. Confirm this is intended. -#}
+
+    {%- if loop.last and add_generation_prompt and message['role'] != 'assistant' -%}
+        {{- 'assistant' + add_tokens.role_sep -}}
+    {%- endif -%}
+
+{%- endfor -%}
@@ -0,0 +1,339 @@
+{#--------TOOL RENDERING FUNCTIONS---------#}
+
+{#---------------------------------------------------------------
+  Converts JSON Schema (dict) to a TypeScript type definition
+
+  Arguments:
+    schema - mapping describing the type. Its "type" value selects the
+             branch below: object, string, number / integer, boolean or
+             array. Any other value (including a missing type) renders
+             as `any`.
+    indent - string written in front of every property line of an object.
+             Nested property types are rendered with two more spaces, and
+             the closing brace of an object is written with the last two
+             characters of indent removed.
+
+  The macro output is the rendered type text.
+----------------------------------------------------------------#}
+{%- macro json_schema_to_typescript(schema, indent="") -%}
+	{#- Schema keywords that are echoed as `// key: value` comment lines above a property. -#}
+	{%- set ADDITIONAL_JSON_KEYS = ['format', 'maxItems', 'maximum', 'minItems', 'minimum', 'pattern'] -%}
+	{%- set ty = schema.get("type") -%}
+
+	{# ---------------- OBJECT ---------------- #}
+	{%- if ty == "object" -%}
+		{{- "{\n" -}}
+
+		{# Start building property list #}
+		{%- set props = schema.get("properties", {}) -%}
+		{%- set required = schema.get("required", []) -%}
+		{#- NOTE(review): in standard Jinja2 a .get() on a missing key yields none,
+		    and none still counts as defined, so this flag may be true for every
+		    schema. The `is not none` check on additional_props_type further down
+		    is what actually decides whether an index signature is written -
+		    confirm against the template engine in use. -#}
+		{%- set has_additional_props = schema.get("additionalProperties") is defined -%}
+		{%- set additional_props_type = none -%}
+		{%- if has_additional_props -%}
+			{%- if schema.additionalProperties == true -%}
+				{%- set additional_props_type = {'type': 'any'} -%}
+			{%- elif schema.additionalProperties is mapping -%}
+				{%- set additional_props_type = schema.additionalProperties -%}
+			{%- endif -%}
+		{%- endif -%}
+
+		{%- for key, val in props.items() -%}
+			{# ---------- Description Comments ----------
+			   One `// ` line per non-blank line of the description. #}
+			{%- if "description" in val -%}
+				{%- for line in val['description'].split('\n') -%}
+					{%- if line.strip() -%}
+						{{- indent + '// ' + line + '\n' -}}
+					{%- endif -%}
+				{%- endfor -%}
+			{%- endif -%}
+
+			{# ---------- Additional JSON Keys ----------
+			   String values are wrapped in double quotes; other values are
+			   written through the `~` string conversion. #}
+			{%- for add_key, add_val in val.items() -%}
+				{%- if add_key in ADDITIONAL_JSON_KEYS -%}
+					{%- if add_val is string -%}
+						{{- indent + '// ' + add_key + ': "' + add_val + '"' + '\n'  -}}
+					{%- else -%}
+						{{- indent + '// ' + add_key + ': ' ~ add_val ~ '\n' -}}
+					{%- endif -%}
+				{%- endif -%}
+			{%- endfor -%}
+
+			{# ---------- Property Definition ----------
+			   The property type is rendered recursively one level deeper;
+			   properties not listed in "required" get a `?` suffix. #}
+            {%- set type_str = json_schema_to_typescript(
+                val, 
+                indent + "  "
+            ) -%}
+
+			{{- indent + key + ('' if key in required else '?') + ': ' + type_str + ',' -}}
+
+			{#- The default is appended as a trailing comment on the property line.
+			    The misspelled key "defalut_value" is read as a fallback when
+			    "default" is absent. NOTE(review): presumably this matches tool
+			    schemas seen upstream - confirm before changing the spelling. -#}
+			{%- if "default" in val or "defalut_value" in val -%}
+				{%- set default = val.get("default", val.get("defalut_value")) -%}
+				{%- if default is string -%}
+					{{- ' // default: "' + default + '"' -}}
+				{%- else -%}
+					{{- ' // default: ' ~ default -}}
+				{%- endif -%}
+			{%- endif -%}
+
+			{{- "\n" -}}
+		{%- endfor -%}
+
+		{# Handle additionalProperties as index signature #}
+		{%- if has_additional_props and additional_props_type is not none -%}
+			{%- set additional_type_str = json_schema_to_typescript(
+				additional_props_type,
+				indent + "  "
+			) -%}
+			{{- indent + '[key: string]: ' + additional_type_str + '\n' -}}
+		{%- endif -%}
+
+		{#- Closing brace sits one level (two characters) to the left of the properties. -#}
+		{{- indent[: (indent|length - "  "|length) ] + '}' -}}
+
+	{# ---------------- STRING ----------------
+	   Enum values become a union of double-quoted literals; the formats
+	   'date-time' and 'date' render as Date; everything else as string. #}
+	{%- elif ty == "string" -%}
+		{%- if schema.get("enum") -%}
+            {%- set ns = namespace(enum = []) -%}
+            {%- for en in schema['enum'] -%}
+                {%- set ns.enum = ns.enum + ['"' ~ en ~ '"'] -%}
+            {%- endfor -%}
+			{{- ns.enum | join(' | ') -}}
+		{%- elif schema.get("format", "none") in ['date-time', 'date'] -%}
+			{{- 'Date' -}}
+		{%- else -%}
+			{{- 'string' -}}
+		{%- endif -%}
+
+	{# ---------------- NUMBER / INTEGER ----------------
+	   Enum values are joined unquoted with ' | '; otherwise number. #}
+	{%- elif ty in ["number", "integer"] -%}
+		{%- if schema.get("enum") -%}
+			{{- schema.enum | join(' | ') -}}
+		{%- else -%}
+			{{- 'number' -}}
+		{%- endif -%}
+
+	{# ---------------- BOOLEAN ---------------- #}
+	{%- elif ty == "boolean" -%}
+		{{- 'boolean' -}}
+
+	{# ---------------- ARRAY ----------------
+	   The item type is rendered with the same indent (not one level deeper)
+	   and suffixed with []; arrays without "items" render as Array<any>. #}
+	{%- elif ty == "array" -%}
+		{%- if "items" in schema -%}
+			{{- json_schema_to_typescript(schema['items'], indent) + '[]' -}}
+		{%- else -%}
+			{{- 'Array<any>' -}}
+		{%- endif -%}
+
+	{# ---------------- FALLBACK ---------------- #}
+	{%- else -%}
+		{{- 'any' -}}
+	{%- endif -%}
+{%- endmacro -%}
+
+{#---------------------------------------------------------------
+  Renders a namespace and its tool definitions in TypeScript style
+
+  Arguments:
+    namespace_name - name written in the opening `namespace NAME {` line
+                     and in the closing `} // namespace NAME` line.
+    tools          - iterable of tool entries; an entry with a truthy
+                     `function` member is replaced by that member.
+
+  Each tool contributes its description as `// ` lines followed by one
+  `type NAME = (...) => RETURN;` declaration. All sections are joined
+  with a newline.
+----------------------------------------------------------------#}
+
+{%- macro render_tool_namespace(namespace_name, tools) -%}
+	{%- set ns = namespace(sections = ['namespace ' ~ namespace_name ~ ' {']) -%}
+
+	{%- for tool in tools -%}
+		{#- Accept both the wrapped form (entry with a `function` member) and a bare function definition. -#}
+		{%- if tool.function -%}
+			{%- set tool = tool.function -%}
+		{%- endif -%}
+
+		{%- set ns_tool = namespace(content_lines=[]) -%}
+
+		{# ---------- TOOL DESCRIPTION ----------
+		   Blank description lines are dropped. #}
+		{%- if tool.get('description') -%}
+			{%- for line in tool['description'].split('\n') -%}
+				{%- if line.strip() -%}
+					{%- set ns_tool.content_lines = ns_tool.content_lines + ['// ' ~ line] -%}
+				{%- endif -%}
+			{%- endfor -%}
+		{%- endif -%}
+
+		{# ---------- TOOL SIGNATURE ----------
+		   Tools whose parameters have no properties are rendered as
+		   taking no argument. #}
+		{%- set main_body = "" -%}
+		{%- set params = tool.get("parameters") -%}
+		{%- if params and params.get("properties") -%}
+			{%- set param_type = json_schema_to_typescript(params, "  ") -%}
+			{%- set main_body = 'type ' ~ tool.name ~ ' = (_: ' ~ param_type ~ ') => ' -%}
+		{%- else -%}
+			{%- set main_body = 'type ' ~ tool.name ~ ' = () => ' -%}
+		{%- endif -%}
+
+		{# ---------- RETURN TYPE ----------
+		   Taken from the "return_parameters" schema when it has properties,
+		   otherwise `any`. #}
+		{%- set return_params = tool.get("return_parameters") -%}
+		{%- if return_params and return_params.get("properties") -%}
+			{%- set return_type = json_schema_to_typescript(return_params, "  ") -%}
+			{%- set main_body = main_body ~ return_type -%}
+		{%- else -%}
+			{%- set main_body = main_body ~ 'any' -%}
+		{%- endif -%}
+
+		{#- The declaration itself ends in a newline, so the join below leaves an empty line after each tool. -#}
+		{%- set main_body = main_body ~ ';\n' -%}
+
+		{%- set ns_tool.content_lines = ns_tool.content_lines + [main_body] -%}
+
+		{# ---------- ADD TOOL TO SECTIONS ---------- #}
+		{%- set ns.sections = ns.sections + [ns_tool.content_lines | join('\n')] -%}
+	{%- endfor -%}
+
+	{%- set ns.sections = ns.sections + ['} // namespace ' ~ namespace_name] -%}
+
+	{{- ns.sections | join('\n') -}}
+{%- endmacro -%}
+
+
+{# ----------- MESSAGE RENDERING HELPER FUNCTIONS ------------ #}
+
+{%- macro render_function_call(call) -%}
+    {#- Renders one tool call as the text
+        {"name": "NAME", "arguments": ARGUMENTS}.
+        `call` may be the wrapped form (with a `function` member) or the bare
+        function object. The name is concatenated as-is, without JSON
+        escaping. -#}
+    {%- if call.function -%}
+        {%- set call = call.function -%}
+    {%- endif -%}
+
+    {#- Arguments that are already a string are emitted verbatim; anything else is serialized with tojson. -#}
+    {%- set arguments = call['arguments'] -%}
+	{%- if arguments is not string -%}
+		{%- set arguments = arguments| tojson(ensure_ascii=False) -%}
+	{%- endif -%}
+
+    {{- '{"name": "' ~ call['name'] ~ '", "arguments": ' ~ arguments ~ '}' -}}
+{%- endmacro -%}
+
+
+{%- macro render_role_message(message, role=None) -%}
+    {#- Renders one dialog turn as
+        ROLE + role_sep + CONTENT [+ function_call token + CALL] + message_sep.
+        message - mapping with 'role' and 'content', optionally tool_calls.
+        role    - optional override for the role name; when empty the role of
+                  the message itself is used.
+        The separator strings come from the template-level add_tokens
+        namespace. -#}
+    {%- if not role -%}
+        {%- set role = message["role"] -%}
+    {%- endif -%}
+
+    {#- Missing or empty content renders as an empty string; non-string content is serialized with tojson. -#}
+    {%- set message_content = message['content'] or '' -%}
+    {%- if message_content is not string -%}
+        {%- set message_content = message_content | tojson(ensure_ascii=False) -%}
+    {%- endif -%}
+
+    {{- role + add_tokens.role_sep + message_content -}}
+
+    {#- Only the first tool call of a message is rendered; further entries are ignored. -#}
+    {%- if message.tool_calls is defined and message.tool_calls -%}
+        {{- add_tokens.function_call + render_function_call(message.tool_calls[0]) -}}
+    {%- endif -%}
+
+    {{- add_tokens.message_sep -}}
+
+{%- endmacro -%}
+
+
+
+{# ----- SPECIAL TOKENS -----
+   Separator strings used by the rendering macros:
+     role_sep      - written after a role name, includes one trailing newline
+     message_sep   - written at the end of a turn, includes two trailing newlines
+     function_call - written between the message content and a rendered tool call
+   The trailing newlines are part of the separators. #}
+
+{%- set add_tokens = namespace(
+    role_sep="<|role_sep|>\n", 
+    message_sep="<|message_sep|>\n\n",
+    function_call="<|function_call|>"
+) -%}
+
+{# ----- DEFAULT DEVSYSTEM ----- #}
+
+{%- set DEVSYSTEM -%}
+<role_description>
+Description of the roles available in the dialog.
+
+`developer system`
+A message added by Sber before the main dialog. It has the highest priority and sets global, non-overridable conditions (for example, conversation rules, the safety policy, the assistant's overall response style, etc.).
+
+`system`
+A system instruction added by developers or by the user, but with a lower priority than `developer system`. It usually describes the assistant's instructions, a specific response style, and other conditions for this particular dialog.
+
+`user`
+A message or request from the user. The assistant follows it if it does not conflict with higher-priority instructions (see <instruction_priority>).
+
+`user memory`
+A sequence of the most up-to-date long-term facts about the user at the time of their request, presented as a JSON list of strings. Facts are listed in chronological order, meaning newer facts are appended to the end of the sequence. When facts are changed or deleted, records of previous facts remain in the sequence. The assistant saves facts using a function and uses them in accordance with the <memory_guidelines> block below.
+
+`added files`
+Metadata about files available for use in the dialog, presented in JSON format. It contains the following keys: id (a unique file identifier), name (file name), type (file type).
+
+`assistant`
+The assistant's reply to the user's request. If the system instruction or the user does not set additional rules for `assistant`, this reply must comply with the instructions in the <assistant_guidelines> block below. The list of functions available to call is contained in `function descriptions`. The name of the required function and its arguments will be generated next by the `function call` role. In its replies, the assistant follows the instructions in accordance with <instruction_priority>.
+
+`function descriptions`
+Function descriptions in TypeScript format. A function is a special tool (or a set of instructions) that the assistant can call to perform specific actions, computations, or obtain data needed to solve the user's task. Each function description contains blocks with the name, description, and arguments. Sometimes the description contains separate blocks with return parameters and usage examples that illustrate the correct call and arguments.
+
+`function call`
+The function that `assistant` calls based on the dialog context, and its arguments. The function is invoked in strict accordance with the instructions in the <function_usage> block.
+
+`function result`
+The result of the last function call.
+</role_description>
+
+<available_modalities>
+The assistant can work with the following modalities: text, available functions.
+</available_modalities>
+
+<instruction_priority>
+If instructions from different roles conflict within the dialog context, observe the following priorities:  
+`developer system` > `system` > `user` > `function descriptions` > `function result` > `user memory`
+</instruction_priority>
+
+<function_usage>
+Basic instructions for working with functions.
+
+Only call those functions that are described in `function descriptions`.
+
+Call available functions when, according to their description, such a call will help provide a more complete and/or accurate answer to the user's request. Fill in function arguments using information from the dialog context. If a function could help answer the request but a required argument is missing from the context, ask the user for the missing data before calling the function. If a necessary function is unavailable or an error occurs, briefly inform the user and, if possible, suggest an alternative.
+</function_usage>
+
+<memory_guidelines>
+Rules for using facts in long-term memory:
+
+If there is no message under the `user memory` role in the dialog, this is equivalent to the absence of long-term facts about the user in memory. In that case, information about the user is limited to the current dialog, and no new facts should be saved.
+</memory_guidelines>
+
+<assistant_guidelines>
+You are a helpful assistant.
+
+# Instructions
+- Strictly follow the instruction priority.
+- Maintain a logical chain of reasoning when answering the user's question.
+- For complex questions (for example, STEM), try to answer in detail unless the system message or dialog context limits the response length.
+- Be helpful, truthful, and avoid unsafe or prohibited content in your responses.
+- Try to reply in the language in which the user asked their question.
+</assistant_guidelines>
+
+A dialog will follow below.
+The dialog may include various roles described in the <role_description> block.
+Each turn begins with the role name and a special token that marks the end of the role's full name, and ends with a special end-of-turn token.
+Your task is to continue the dialog from the last specified role in accordance with the dialog context.
+{%- endset -%}
+
+
+{#- ---------------------- RENDERING STARTS HERE ---------------------- -#}
+
+
+{# ----- RENDER BOS TOKEN ----- #}
+{{- bos_token -}}
+
+
+{# ----- RENDER DEVSYSTEM ----- #}
+{{- render_role_message({"role": "developer system", "content": DEVSYSTEM}) -}}
+
+{# ----- RENDER SYSTEM IF PRESENT ----- #}
+{%- if messages and messages[0]['role'] == 'system' -%}
+    {{- render_role_message(messages[0]) -}}
+    {%- set messages = messages[1:] -%}
+{%- endif -%}
+
+{# ----- RENDER TOOLS ----- #}
+{%- if tools -%}
+    {%- set tools_content = (
+        render_tool_namespace('functions', tools) 
+        + "\n\n"
+    ) -%}
+    {{- render_role_message({'role': 'function descriptions', 'content': tools_content}) -}}
+{%- endif -%}
+
+{# ----- MAIN MESSAGE LOOP -----
+   Renders every remaining message in order (a leading system message has
+   already been rendered and removed from the list above). #}
+{%- for message in messages -%}
+
+    {# ----- TOOL MESSAGE: rendered under the role name 'function result' -------#}
+    {%- if message['role'] == 'tool' -%}
+        {{- render_role_message(message, 'function result') -}}
+        
+    {# ----- OTHER MESSAGES -----
+       Includes assistant messages; render_role_message itself appends the
+       first tool call when the message carries tool_calls. #}
+    {%- else -%}
+        {{- render_role_message(message) -}}
+    {%- endif -%}
+
+    {# ----- ADDING GENERATION PROMPT -----
+       Emitted inside the loop, on the last message only, and only when
+       add_generation_prompt is set and that message is not from the assistant.
+       Because the check lives in the loop body, an empty message list
+       produces no generation prompt at all. #}
+
+    {%- if loop.last and add_generation_prompt and message['role'] != 'assistant' -%}
+        {{- 'assistant' + add_tokens.role_sep -}}
+    {%- endif -%}
+
+{%- endfor -%}
@@ -900,7 +900,8 @@ ggml_tensor * llm_graph_context::build_cvec(

 ggml_tensor * llm_graph_context::build_lora_mm(
          ggml_tensor * w,
-          ggml_tensor * cur) const {
+          ggml_tensor * cur,
+          ggml_tensor * w_s) const {
    ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);

    for (const auto & lora : *loras) {
@@ -921,6 +922,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
        res = ggml_add(ctx0, res, ab_cur);
    }

+    if (w_s) {
+        res = ggml_mul(ctx0, res, w_s);
+    }
+
    return res;
 }

@@ -764,10 +764,11 @@ struct llm_graph_context {
             ggml_tensor * cur,
                     int   il) const;

-    // do mat_mul, while optionally apply lora
+    // do mat_mul, while optionally apply lora and per-tensor scale
    ggml_tensor * build_lora_mm(
              ggml_tensor * w,
-              ggml_tensor * cur) const;
+              ggml_tensor * cur,
+              ggml_tensor * w_s = nullptr) const;

    // do mat_mul_id, while optionally apply lora
    ggml_tensor * build_lora_mm_id(
@@ -29,10 +29,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
        // self-attention
        {
            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
            cb(Qcur, "Qcur", il);
            if (model.layers[il].bq) {
                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -40,10 +37,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
            }

            // B1.K
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
            cb(Kcur, "Kcur", il);
            if (model.layers[il].bk) {
                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -51,10 +45,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
            }

            // B1.V
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
            cb(Vcur, "Vcur", il);
            if (model.layers[il].bv) {
                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -90,10 +81,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
                    LLM_NORM_RMS, il);
            cb(cur, "attn_sub_norm", il);

-            cur = build_lora_mm(model.layers[il].wo, cur);
-            if (model.layers[il].wo_s) {
-                cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
-            }
+            cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
            if (model.layers[il].bo) {
                cur = ggml_add(ctx0, cur, model.layers[il].bo);
            }
@@ -127,10 +115,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
                LLM_NORM_RMS, il);
        cb(cur, "ffn_sub_norm", il);

-        cur = build_lora_mm(model.layers[il].ffn_down, cur);
-        if (model.layers[il].ffn_down_s) {
-            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_s);
-        }
+        cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
        cb(cur, "ffn_down", il);

        cur = ggml_add(ctx0, cur, ffn_inp);
@@ -43,28 +43,19 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
            cb(Qcur, "Qcur", il);
            if (model.layers[il].bq) {
                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                cb(Qcur, "Qcur", il);
            }
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
            cb(Kcur, "Kcur", il);
            if (model.layers[il].bk) {
                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                cb(Kcur, "Kcur", il);
            }
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
            cb(Vcur, "Vcur", il);
            if (model.layers[il].bv) {
                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -30,22 +30,13 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
        // self-attention
        {
            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
            cb(Qcur, "Qcur", il);

-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
            cb(Kcur, "Kcur", il);

-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
@@ -30,22 +30,13 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
        // self_attention
        {
            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
            cb(Qcur, "Qcur", il);

-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
            cb(Kcur, "Kcur", il);

-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
@@ -7656,7 +7656,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_softcap(GGML_TYPE_F32, {10, 10, 10, 10}, 50.0f));
    test_cases.emplace_back(new test_silu_back());

-    for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f }) {
+    for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f, 10.f }) {
        for (uint32_t n : { 64, 1025 }) {
            for (bool v : { false, true }) {
                test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
@@ -2765,6 +2765,42 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();

    }
+
+    // GigaChat V3
+    {
+        auto tst = peg_tester("models/templates/GigaChat3-10B-A1.8B.jinja", detailed_debug);
+        tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
+        tst.test("<|message_sep|>\n\nfunction call<|role_sep|>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}")
+            .tools({ special_function_tool })
+            .expect(message_assist_call)
+            .run();
+
+        tst.test(
+            "Hello, world!\nWhat's up?"
+            "<|message_sep|>\n\nfunction call<|role_sep|>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}"
+        )
+            .tools({ special_function_tool })
+            .expect(message_assist_call_content)
+            .run();
+    }
+
+    // GigaChat V3.1
+    {
+        auto tst = peg_tester("models/templates/GigaChat3.1-10B-A1.8B.jinja", detailed_debug);
+        tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
+        tst.test("<|function_call|>{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}")
+            .tools({ special_function_tool })
+            .expect(message_assist_call)
+            .run();
+
+        tst.test(
+            "Hello, world!\nWhat's up?"
+            "<|function_call|>{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}"
+        )
+            .tools({ special_function_tool })
+            .expect(message_assist_call_content)
+            .run();
+    }
 }

 // Test the developer role to system workaround with a simple mock template
@@ -216,6 +216,7 @@ enum projector_type {
    PROJECTOR_TYPE_GEMMA3,
    PROJECTOR_TYPE_GEMMA3NV,
    PROJECTOR_TYPE_GEMMA3NA,
+    PROJECTOR_TYPE_PHI4,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
@@ -253,6 +254,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
    { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
    { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
+    { PROJECTOR_TYPE_PHI4,      "phi4"},
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"},
@@ -792,6 +792,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_JANUS_PRO:
+        case PROJECTOR_TYPE_PHI4:
            {
                builder = std::make_unique<clip_graph_siglip>(ctx, img);
            } break;
@@ -1144,6 +1145,13 @@ struct clip_model_loader {
                        // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
                        hparams.set_limit_image_tokens(64, 256);
                    } break;
+                case PROJECTOR_TYPE_PHI4:
+                    {
+                        hparams.n_merge = 1;
+                        get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
+                        get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
+                        hparams.set_warmup_n_tokens(16*16);
+                    } break;
                case PROJECTOR_TYPE_PIXTRAL:
                case PROJECTOR_TYPE_LIGHTONOCR:
                    {
@@ -1841,6 +1849,13 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                } break;
+            case PROJECTOR_TYPE_PHI4:
+                {
+                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                } break;
            case PROJECTOR_TYPE_LFM2A:
                {
                    for (int i : {0, 2, 3, 5, 6}) {
@@ -3157,6 +3172,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                res_imgs->entries.push_back(std::move(img_f32));
            } break;

+        case PROJECTOR_TYPE_PHI4:
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
@@ -3383,6 +3399,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
        case PROJECTOR_TYPE_JANUS_PRO:
+        case PROJECTOR_TYPE_PHI4:
            {
                // do nothing
            } break;
@@ -3884,6 +3901,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
        case PROJECTOR_TYPE_JANUS_PRO:
+        case PROJECTOR_TYPE_PHI4:
        case PROJECTOR_TYPE_COGVLM:
            {
                // do nothing
@@ -4013,6 +4031,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_LDPV2:
            return ctx->model.mm_model_peg_0_b->ne[0];
        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_PHI4:
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            return ctx->model.mm_2_w->ne[1];
@@ -4,7 +4,7 @@ ggml_cgraph * clip_graph_siglip::build() {
    ggml_tensor * inp = build_inp();

    ggml_tensor * learned_pos_embd = model.position_embeddings;
-    if (proj_type == PROJECTOR_TYPE_LFM2) {
+    if (proj_type == PROJECTOR_TYPE_LFM2 || proj_type == PROJECTOR_TYPE_PHI4) {
        learned_pos_embd = resize_position_embeddings();
    }

@@ -75,6 +75,14 @@ ggml_cgraph * clip_graph_siglip::build() {
            hparams.ffn_op,
            -1);

+    } else if (proj_type == PROJECTOR_TYPE_PHI4) {
+        cur = build_ffn(cur,
+            model.mm_0_w, model.mm_0_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
+
    } else {
        GGML_ABORT("SigLIP: Unsupported projector type");
    }
@@ -290,6 +290,9 @@ struct mtmd_context {
            img_beg = "<|vision_start|>";
            img_end = "<|vision_end|>";

+        } else if (proj == PROJECTOR_TYPE_PHI4) {
+            // Phi-4 uses media marker insertion only. Keep image boundary text empty.
+
        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
            // (more details in mtmd_context constructor)
            img_beg = "<|image_start|>";
Author	SHA1	Message	Date
Jeff Bolz	246ffc4b05	vulkan: fix l2_norm epsilon handling (#20350 )	2026-03-12 06:39:41 +01:00
Jeff Bolz	aa429cf507	vulkan: fix OOB check in flash_attn_mask_opt (#20296 )	2026-03-12 06:35:49 +01:00
Masato Nakasaka	5866e3bbc8	vulkan: Fix ErrorOutOfHostMemory on Intel GPU when loading large models with --no-mmap (#20059 ) * Changed to reuse command buffers to fix crashing on Intel GPU * Removed unused parameter * Fixed compile error and minor mistake * Fix logging * Changing to use usage flag per command buffer * fixed style * added buffer reset * Removed cmd_buffer_idx for reuse consistency * Fixed style	2026-03-12 06:30:16 +01:00
lhez	0516e04bf9	opencl: use larger workgroup size for get_rows (#20316 )	2026-03-11 22:03:27 -07:00
shaofeiqi	3d9ab225e7	opencl: add cumsum op (#18981 ) * OpenCL: add CUMSUM op support * remove unused argument * opencl: refactor cumsum * opencl: refactor * opencl: refactor tmp buffer * opencl: adjust max number of subgroups * opencl: fix whitespace * opencl: fix global size when cumsum the tmp buffer --------- Co-authored-by: Li He <lih@qti.qualcomm.com>	2026-03-11 22:03:07 -07:00
uvos	d63aa398de	hip: compile debug builds with -O2 on hip to avoid a compiler bug (#20392 )	2026-03-12 10:37:10 +08:00
Mishusha	a8304b4d27	common/parser: add GigaChatV3/3.1 models support (#19931 ) Co-authored-by: Mishusha <pmv26021975@gmail.com>	2026-03-12 01:22:25 +01:00
DAN™	fdb17643d3	model : add support for Phi4ForCausalLMV (#20168 ) * Add support for Phi4ForCausalLMV. * Fix Phi-4 vision parity (correcting SigLIP2 patch-kernel export layout) and matching HF NaFlex resize behavior in mtmd. * Rename contants + fix tokenizer label * Clean-ups. * Fix GGUF export. * Set tokenizer.ggml.pre explicitly. * Default vocab name rather than forcing it. * Clean-ups. * Fix indent. * Fix subscriptable error. * remov overcomplicated code path * Clean-ups. --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2026-03-12 00:25:54 +01:00
Richard Davison	1eea6a2968	graph : add optional scale parameter to build_lora_mm [no ci] (#20427 )	2026-03-12 00:22:49 +01:00
ddh0	4a748b8f15	common : fix --n-cpu-moe, --cpu-moe for models with fused gate + up (#20416 )	2026-03-12 00:13:28 +01:00