model: fix multimodal padding token for gemma3n/gemma4 (#21625)

* model: fix multimodal padding token for gemma3n/gemma4
* nits
mtmd: support dots.ocr (#17575)
2026-06-30 17:47:40 +02:00 · 2026-04-09 12:18:23 +02:00 · 2026-04-09 12:16:38 +02:00 · 2026-04-09 11:41:14 +02:00 · 2026-04-09 11:28:33 +02:00 · 2026-04-09 10:54:00 +03:00
26 changed files with 570 additions and 68 deletions
@@ -251,6 +251,23 @@ value binary_expression::execute_impl(context & ctx) {
        return res;
    }

+    // Python-style string repetition
+    // TODO: support array/tuple repetition (e.g., [1, 2] * 3 → [1, 2, 1, 2, 1, 2])
+    if (op.value == "*" &&
+            ((is_val<value_string>(left_val) && is_val<value_int>(right_val)) ||
+             (is_val<value_int>(left_val) && is_val<value_string>(right_val)))) {
+        const auto & str = is_val<value_string>(left_val) ? left_val->as_string() : right_val->as_string();
+        const int64_t repeat = is_val<value_int>(right_val) ? right_val->as_int() : left_val->as_int();
+        auto res = mk_val<value_string>();
+        if (repeat <= 0) {
+            return res;
+        }
+        for (int64_t i = 0; i < repeat; ++i) {
+            res->val_str = res->val_str.append(str);
+        }
+        return res;
+    }
+
    // String membership
    if (is_val<value_string>(left_val) && is_val<value_string>(right_val)) {
        // case: "a" in "abc"
@@ -1,4 +1,5 @@
 #include "runtime.h"
+#include "unicode.h"
 #include "value.h"

 // for converting from JSON to jinja values
@@ -154,6 +155,83 @@ static value test_compare_fn(const func_args & args) {
    return mk_val<value_bool>(value_compare(args.get_pos(0), args.get_pos(1), op));
 }

+static void append_codepoint_as_ascii_json_escape(std::string & out, uint32_t codepoint) { // appends `codepoint` to `out` as one backslash-u escape, or as two (a UTF-16 surrogate pair) when it is above 0xFFFF
+    auto append_u16 = [&out](uint32_t value) { // formats one 16-bit unit as backslash, 'u', then at least 4 lowercase hex digits
+        char buf[8]; // 6 characters plus the terminating NUL fit as long as value <= 0xFFFF
+        snprintf(buf, sizeof(buf), "\\u%04x", static_cast<unsigned int>(value));
+        out += buf;
+    };
+
+    if (codepoint <= 0xFFFF) { // representable with a single 16-bit escape
+        append_u16(codepoint);
+        return;
+    }
+
+    codepoint -= 0x10000; // remove the offset so the remainder can be split into two 10-bit halves
+    append_u16(0xD800 + ((codepoint >> 10) & 0x3FF)); // high surrogate carries the upper 10 bits
+    append_u16(0xDC00 + (codepoint & 0x3FF)); // low surrogate carries the lower 10 bits
+}
+
+static std::string json_ensure_ascii_preserving_format(const std::string & json_str) { // returns a copy of `json_str` in which bytes >= 0x80 inside string literals are replaced by backslash-u escapes; everything else is copied through unchanged
+    std::string output;
+    output.reserve(json_str.size()); // pre-size to the input length; escaping can only make the result longer
+
+    bool in_string = false; // true while scanning between an opening and a closing double quote
+    bool escaped = false; // true only for the single byte that follows a backslash inside a string
+
+    for (size_t pos = 0; pos < json_str.size();) { // pos is advanced inside the body, by 1 or by the consumed sequence length
+        const char ch = json_str[pos];
+        if (!in_string) { // outside string literals: copy verbatim and watch for an opening quote
+            output.push_back(ch);
+            if (ch == '"') {
+                in_string = true;
+            }
+            ++pos;
+            continue;
+        }
+
+        if (escaped) { // byte right after a backslash: copy as-is so an escaped quote does not end the string
+            output.push_back(ch);
+            escaped = false;
+            ++pos;
+            continue;
+        }
+
+        if (ch == '\\') { // start of an existing escape sequence, which is preserved
+            output.push_back(ch);
+            escaped = true;
+            ++pos;
+            continue;
+        }
+
+        if (ch == '"') { // unescaped quote closes the string literal
+            output.push_back(ch);
+            in_string = false;
+            ++pos;
+            continue;
+        }
+
+        const unsigned char uch = static_cast<unsigned char>(ch); // compare as unsigned so bytes >= 0x80 are not seen as negative
+        if (uch < 0x80) { // ASCII byte inside a string: nothing to escape
+            output.push_back(ch);
+            ++pos;
+            continue;
+        }
+
+        auto parsed = common_parse_utf8_codepoint(json_str, pos); // helper defined elsewhere; presumably decodes the UTF-8 sequence starting at pos - TODO confirm
+        if (parsed.status != utf8_parse_result::SUCCESS) { // any non-success status is handled the same way
+            output += "\\ufffd"; // emit the escape for U+FFFD in place of the offending byte
+            ++pos; // step over a single byte and try again from the next one
+            continue;
+        }
+
+        append_codepoint_as_ascii_json_escape(output, parsed.codepoint);
+        pos += parsed.bytes_consumed; // skip the whole sequence that was just escaped
+    }
+
+    return output;
+}
+
 static value tojson(const func_args & args) {
    args.ensure_count(1, 5);
    value val_ascii      = args.get_kwarg_or_pos("ensure_ascii", 1);
@@ -169,16 +247,17 @@ static value tojson(const func_args & args) {
    if (is_val<value_int>(val_indent)) {
        indent = static_cast<int>(val_indent->as_int());
    }
-    if (val_ascii->as_bool()) { // undefined == false
-        throw not_implemented_exception("tojson ensure_ascii=true not implemented");
-    }
    if (val_sort->as_bool()) { // undefined == false
        throw not_implemented_exception("tojson sort_keys=true not implemented");
    }
+    const bool ensure_ascii = val_ascii->as_bool(); // undefined == false
    auto separators = (is_val<value_array>(val_separators) ? val_separators : mk_val<value_array>())->as_array();
    std::string item_sep = separators.size() > 0 ? separators[0]->as_string().str() : (indent < 0 ? ", " : ",");
    std::string key_sep = separators.size() > 1 ? separators[1]->as_string().str() : ": ";
    std::string json_str = value_to_json(args.get_pos(0), indent, item_sep, key_sep);
+    if (ensure_ascii) {
+        json_str = json_ensure_ascii_preserving_format(json_str);
+    }
    return mk_val<value_string>(json_str);
 }

@@ -460,6 +539,10 @@ const func_builtins & value_int_t::get_builtins() const {
            int64_t val = args.get_pos(0)->as_int();
            return mk_val<value_int>(val < 0 ? -val : val);
        }},
+        {"int", [](const func_args & args) -> value {
+            args.ensure_vals<value_int>();
+            return mk_val<value_int>(args.get_pos(0)->as_int());
+        }},
        {"float", [](const func_args & args) -> value {
            args.ensure_vals<value_int>();
            double val = static_cast<double>(args.get_pos(0)->as_int());
@@ -486,6 +569,10 @@ const func_builtins & value_float_t::get_builtins() const {
            int64_t val = static_cast<int64_t>(args.get_pos(0)->as_float());
            return mk_val<value_int>(val);
        }},
+        {"float", [](const func_args & args) -> value {
+            args.ensure_vals<value_float>();
+            return mk_val<value_float>(args.get_pos(0)->as_float());
+        }},
        {"safe", tojson},
        {"string", tojson},
        {"tojson", tojson},
@@ -3777,7 +3777,14 @@ class QwenModel(TextModel):
        self._set_vocab_qwen()


-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
+@ModelBase.register(
+    "Qwen2Model",
+    "Qwen2ForCausalLM",
+    "Qwen2AudioForConditionalGeneration",
+    "KORMoForCausalLM",
+    "AudioFlamingo3ForConditionalGeneration",
+    "DotsOCRForCausalLM",
+)
 class Qwen2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.QWEN2

@@ -3798,7 +3805,8 @@ class Qwen2Model(TextModel):
            name = name.replace("language_model.", "") # for InternVL
        if name.startswith("mlp") or name.startswith("multi_modal_projector") \
                or name.startswith("vision_model") or name.startswith("audio_tower") \
-                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
+                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") \
+                or name.startswith("vision_tower."):
            # skip vision and audio tensors
            return
        yield from super().modify_tensors(data_torch, name, bid)
@@ -12819,6 +12827,37 @@ class SolarOpenModel(Glm4MoeModel):
        special_vocab.add_to_gguf(self.gguf_writer)


+@ModelBase.register("DotsOCRForCausalLM")
+class DotsOCRVisionModel(MmprojModel): # mmproj-side converter for dots.ocr: writes the vision metadata and forwards only vision_tower.* tensors
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None # the line below writes into hparams_vision, so it must exist
+        self.hparams_vision["image_size"] = 0 # dynamic resolution
+
+    def set_gguf_parameters(self): # adds the dots.ocr-specific keys on top of what the parent class writes
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR)
+        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) # pixel bounds are read from the preprocessor config
+        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"])) # rms_norm_eps is stored under the attention layernorm eps key
+        self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"])) # spatial_merge_size is stored as the projector scale factor
+        self.gguf_writer.add_vision_use_silu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # yields mapped tensors for vision_tower.* names and nothing for any other name
+        if name.startswith("vision_tower."): # there is no else branch: tensors with any other prefix are dropped here
+            if "vision_tower.blocks." in name and ".mlp." in name: # only the per-block MLP weights are renamed
+                # note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here
+                # x = F.silu(self.fc1(x)) * self.fc3(x)
+                # x = self.fc2(x)
+                # fc1 -> gate, fc2 -> down, fc3 -> up
+                # mapping original names to Qwen2.5 naming scheme
+                name = name.replace("vision_tower.blocks.", "visual.blocks.")
+                name = name.replace(".fc1", ".gate_proj")
+                name = name.replace(".fc2", ".down_proj")
+                name = name.replace(".fc3", ".up_proj")
+            yield from super().modify_tensors(data_torch, name, bid) # every vision_tower.* tensor, renamed or not, goes through the parent implementation
+
+
 ###### CONVERSION LOGIC ######


@@ -37,6 +37,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
 > - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
 > - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
 > - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
+> - Dots.OCR: https://github.com/ggml-org/llama.cpp/pull/17575
 > - HunyuanOCR: https://github.com/ggml-org/llama.cpp/pull/21395

 ## Pre-quantized models
@@ -10079,6 +10079,7 @@ template [[host_name("kernel_mul_mm_id_f16_f32")]]     kernel mul_mm_id kernel_m
 #if defined(GGML_METAL_HAS_BF16)
 template [[host_name("kernel_mul_mm_id_bf16_f32")]]    kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat, bfloat2x4, simdgroup_bfloat8x8, bfloat4x4,     1,     dequantize_bf16,    bfloat, bfloat4x4, float, float2x4>;
 #endif
+template [[host_name("kernel_mul_mm_id_q1_0_f32")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   block_q1_0,    8,     dequantize_q1_0,    float,  float4x4,  float, float2x4>;
 template [[host_name("kernel_mul_mm_id_q4_0_f32")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   block_q4_0,    2,     dequantize_q4_0,    float,  float4x4,  float, float2x4>;
 template [[host_name("kernel_mul_mm_id_q4_1_f32")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   block_q4_1,    2,     dequantize_q4_1,    float,  float4x4,  float, float2x4>;
 template [[host_name("kernel_mul_mm_id_q5_0_f32")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   block_q5_0,    2,     dequantize_q5_0,    float,  float4x4,  float, float2x4>;
@@ -10102,6 +10103,7 @@ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mul_mm_id kernel_m

 template [[host_name("kernel_mul_mm_id_f32_f16")]]     kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   float4x4,      1,     dequantize_f32,     float,  float4x4,  half, half2x4>;
 template [[host_name("kernel_mul_mm_id_f16_f16")]]     kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   half4x4,       1,     dequantize_f16,     half,   half4x4,   half, half2x4>;
+template [[host_name("kernel_mul_mm_id_q1_0_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   block_q1_0,    8,     dequantize_q1_0,    float,  float4x4,  half, half2x4>;
 template [[host_name("kernel_mul_mm_id_q4_0_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   block_q4_0,    2,     dequantize_q4_0,    float,  float4x4,  half, half2x4>;
 template [[host_name("kernel_mul_mm_id_q4_1_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   block_q4_1,    2,     dequantize_q4_1,    float,  float4x4,  half, half2x4>;
 template [[host_name("kernel_mul_mm_id_q5_0_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half,   half2x4,   simdgroup_half8x8,   block_q5_0,    2,     dequantize_q5_0,    float,  float4x4,  half, half2x4>;
@@ -4122,6 +4122,7 @@ class VisionProjectorType:
    LIGHTONOCR = "lightonocr"
    COGVLM = "cogvlm"
    JANUS_PRO = "janus_pro"
+    DOTSOCR = "dots_ocr"
    DEEPSEEKOCR = "deepseekocr"
    LFM2A = "lfm2a" # audio
    MUSIC_FLAMINGO = "musicflamingo" # audio
@@ -1359,6 +1359,7 @@ class TensorNameMap:
            "visual.merger.mlp.{bid}", # qwen2vl
            "mlp_AR.linear_{bid}", # PaddleOCR-VL
            "merger.mlp.{bid}",
+            "vision_tower.merger.mlp.{bid}", # dots.ocr
            "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
        ),

@@ -1406,11 +1407,13 @@ class TensorNameMap:
            "siglip2.vision_model.embeddings.patch_embedding",
            "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
            "model.vision_tower.patch_embedder.input_proj", # gemma4
+            "vision_tower.patch_embed.patchifier.proj", # dots.ocr
            "vision_model.conv1", # Step3-VL
        ),

        MODEL_TENSOR.V_ENC_EMBD_NORM: (
            "visual.post_conv_layernorm", # glm4v
+            "vision_tower.patch_embed.patchifier.norm", # dots.ocr
        ),

        MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -1441,6 +1444,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_ATTN_QKV: (
            "visual.blocks.{bid}.attn.qkv", # qwen3vl
+            "vision_tower.blocks.{bid}.attn.qkv", # dots.ocr
            "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
            "model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP
            "vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
@@ -1526,6 +1530,7 @@ class TensorNameMap:
            "model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP
            "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
            "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
+            "vision_tower.blocks.{bid}.norm1", # dots.ocr
            "vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
        ),

@@ -1547,6 +1552,7 @@ class TensorNameMap:
            "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
            "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
            "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
+            "vision_tower.blocks.{bid}.attn.proj", # dots.ocr
            "vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
        ),

@@ -1567,6 +1573,7 @@ class TensorNameMap:
            "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
            "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
            "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
+            "vision_tower.blocks.{bid}.norm2", # dots.ocr
            "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
        ),

@@ -1649,6 +1656,7 @@ class TensorNameMap:
            "vision_encoder.ln_pre", # pixtral
            "vision_model.layernorm_pre", # llama4
            "model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP
+            "vision_tower.patch_embed.patchifier.norm", # dots.ocr
            "vision_model.ln_pre", # Step3-VL
        ),

@@ -1664,6 +1672,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_MM_POST_NORM: (
            "visual.merger.post_projection_norm", # glm4v
+            "vision_tower.post_trunk_norm", # dots.ocr
            "vit.perceive.after_rms", # HunyuanOCR
        ),

@@ -1680,6 +1689,7 @@ class TensorNameMap:
            "model.vision.linear_proj.norm1", # cogvlm
            "mlp_AR.pre_norm", # PaddleOCR-VL
            "merger.ln_q",
+            "vision_tower.merger.ln_q", # dots.ocr
        ),

        MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
@@ -0,0 +1,111 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Äpfel
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
@@ -0,0 +1,46 @@
+1178 236743 236812 47041 3794
+239122 22744 535
+
+236743
+138
+139
+255968
+107
+108
+109
+255968 107
+9259 1902
+26352 1902
+9259 4109
+26352 4109
+26352 4109 236888
+9259 236764 1902 236888
+26352 236764 1902 236888
+672 563 236743 478 397 404 391 236761 12362
+236765 236771 236812 236828 236743 236832 11372 12065 31806 3405 9360
+1337 12515 1333 4632 165543 3830
+234889 63031 219876 66212 239077 237907 144494
+242015 568 7382 236768 236743 247717 237243 248989 238178 568 43819 111730 150567 236768 113452 568 8960 64334 600 815 1061 1852 8369 236768
+9259
+26352
+138 9259
+139 9259
+140 9259
+140 9259 107 140 9259
+568
+107 578
+236789 6933
+9259 236764 570 236789 712 236888 2088 659 611 170124 2360 62133 237075 17641 11700 236770 236800 236770 236812 236770 236810 236770 237471 238352
+123947
+236800
+236800 236800
+236800 236800 236800
+236800 236800 236800 236800
+236800 236800 236800 236800 236800
+236800 236800 236800 236800 236800 236800
+236800 236800 236800 236800 236800 236800 236800
+236800 236800 236800 236800 236800 236800 236800 236800
+236800 236800 236800 236800 236800 236800 236800 236800 236800
+236780 29719 33154
+2243 2206
+107 236743 108 236743 109 236743 255968 236743 255969 236743 255968 107 138 107 139 107 140 107 141 107 242015 568 7382 236768 236743 247717 237243 248989 238178 568 43819 111730 150567 236768 113452 236743 478 397 404 391 478 397 404 391 236743 236800 236743 236800 236800 236743 236800 236800 236800 236743 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236800 236800 236743 236800 236761 236800 236743 236800 856 236800 236743 236800 1390 236800 90986 92814 63031 219876 66212 241702 2360 62133 237075 17641 11700 236770 236800 236770 236812 236770 236810 236770 237471 238352 80448 120697 210119 1333 4632 165543 3830 9451 159561 2629 2629 2717 84491 19938 123947 38950 10371 564 236789 560 1010 756 151812 668 236789 236751 993 236764 756 1357 611 2889 236881 756 236792 711 2889 564 236789 859 1386 625 236764 756 236796 611 1133 1070 11115 236881 1191 236789 32541 496 236789 95635
@@ -659,8 +659,17 @@ struct llm_tokenizer_bpe_session {

                if (token == LLAMA_TOKEN_NULL) {
                    for (auto j = str.begin(); j != str.end(); ++j) {
-                        std::string byte_str(1, *j);
-                        auto token_multibyte = vocab.text_to_token(byte_str);
+                        llama_token token_multibyte = LLAMA_TOKEN_NULL;
+                        if (tokenizer.byte_encode) {
+                            std::string byte_str(1, *j);
+                            token_multibyte = vocab.text_to_token(byte_str);
+                        } else {
+                            // For non-byte-encoded BPE (e.g. gemma-4), byte tokens use <0xXX> format
+                            static const char * hex = "0123456789ABCDEF";
+                            const uint8_t ch = (uint8_t)*j;
+                            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+                            token_multibyte = vocab.text_to_token(buf);
+                        }
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }
@@ -250,27 +250,29 @@ ggml_tensor * llm_build_gemma3n_iswa::calc_magnitude(ggml_tensor * x) {
 ggml_tensor * llm_build_gemma3n_iswa::build_inp_per_layer() {
    auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
    ggml_tensor * inp_per_layer;
+    float tok_embd_scale = sqrtf((float) n_embd_altup);
    if (ubatch.token) {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
        ggml_set_input(inp->tokens);
        res->t_inp_tokens = inp->tokens;
-        inp_per_layer = ggml_get_rows(ctx0, model.per_layer_tok_embd, inp->tokens);
+        inp_per_layer = ggml_get_rows  (ctx0, model.per_layer_tok_embd, inp->tokens);
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
-        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
+        inp_per_layer = ggml_scale     (ctx0, inp_per_layer, tok_embd_scale);
        cb(inp_per_layer, "inp_per_layer_selected", -1);
        res->add_input(std::move(inp));
    } else {
-        // Vision embedding path: use padding token (ID=0) embedding
+        // Multimodal embedding path: use padding token (ID=0) embedding
        // TODO: verify if this is the correct behavior in transformers implementation
        const int64_t embd_size = model.per_layer_tok_embd->ne[0];  // n_embd_altup * n_layer

        // Extract and dequantize padding token embedding (row 0)
        ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0);
-        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_cast (ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, tok_embd_scale);

        // Reshape to [n_embd_altup, n_layer, 1]
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
-        cb(inp_per_layer, "inp_per_layer_vision", -1);
+        cb(inp_per_layer, "inp_per_layer_multimodal", -1);
    }
    return inp_per_layer;
 }
@@ -265,6 +265,7 @@ ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() {
    auto inp = std::make_unique<llm_graph_input_embd>(n_embd);

    ggml_tensor * inp_per_layer;
+    float tok_embd_scale = sqrtf((float) n_embd_per_layer);
    if (ubatch.token) {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
        ggml_set_input(inp->tokens);
@@ -272,22 +273,23 @@ ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() {

        inp_per_layer = ggml_get_rows  (ctx0, model.per_layer_tok_embd, inp->tokens);
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, n_tokens);
-        inp_per_layer = ggml_scale     (ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer));
+        inp_per_layer = ggml_scale     (ctx0, inp_per_layer, tok_embd_scale);
        cb(inp_per_layer, "inp_per_layer_selected", -1);

        res->add_input(std::move(inp));
    } else {
-        // Vision embedding path: use padding token (ID=0) embedding
+        // Multimodal embedding path: use padding token (ID=0) embedding
        // TODO: verify if this is the correct behavior in transformers implementation
        const int64_t embd_size = model.per_layer_tok_embd->ne[0];  // n_embd_per_layer * n_layer

        // Extract and dequantize padding token embedding (row 0)
        ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0);
-        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_cast (ctx0, padding, GGML_TYPE_F32);
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, tok_embd_scale);

        // Reshape to [n_embd_per_layer, n_layer, 1]
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1);
-        cb(inp_per_layer, "inp_per_layer_vision", -1);
+        cb(inp_per_layer, "inp_per_layer_multimodal", -1);
    }
    return inp_per_layer;
 }
@@ -124,6 +124,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r         ARGS ${PROJE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder    ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm      ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon            ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gemma-4           ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gemma-4.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2             ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe         ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm         ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
@@ -447,6 +447,18 @@ static void test_expressions(testing & t) {
        "hello world"
    );

+    test_template(t, "string repetition",
+        "{{ 'ab' * 3 }}",
+        json::object(),
+        "ababab"
+    );
+
+    test_template(t, "reversed string repetition",
+        "{{ 3 * 'ab' }}",
+        json::object(),
+        "ababab"
+    );
+
    test_template(t, "ternary",
        "{{ 'yes' if cond else 'no' }}",
        {{"cond", true}},
@@ -693,6 +705,33 @@ static void test_filters(testing & t) {
        "\"\\u2713\""
    );

+    test_template(t, "tojson ensure_ascii=true nested object",
+        "{{ data|tojson(ensure_ascii=true) }}",
+        {{"data", {
+            {"text", "\u2713"},
+            {"items", json::array({"é", {{"snowman", "☃"}}})}
+        }}},
+        "{\"text\": \"\\u2713\", \"items\": [\"\\u00e9\", {\"snowman\": \"\\u2603\"}]}"
+    );
+
+    test_template(t, "tojson ensure_ascii=true indent=2",
+        "{{ data|tojson(ensure_ascii=true, indent=2) }}",
+        {{"data", {
+            {"text", "\u2713"},
+            {"nested", {{"accent", "é"}}}
+        }}},
+        "{\n  \"text\": \"\\u2713\",\n  \"nested\": {\n    \"accent\": \"\\u00e9\"\n  }\n}"
+    );
+
+    test_template(t, "tojson ensure_ascii=true preserves existing escapes",
+        "{{ data|tojson(ensure_ascii=true) }}",
+        {{"data", {
+            {"emoji", "😀"},
+            {"line", "a\nb"}
+        }}},
+        "{\"emoji\": \"\\ud83d\\ude00\", \"line\": \"a\\nb\"}"
+    );
+
    test_template(t, "tojson sort_keys=true",
        "{{ data|tojson(sort_keys=true) }}",
        {{"data", {{"b", 2}, {"a", 1}}}},
@@ -771,6 +810,12 @@ static void test_filters(testing & t) {
        "hello"
    );

+    test_template(t, "int filter on integer is identity",
+        "{{ value|int }}",
+        {{"value", 7}},
+        "7"
+    );
+
    test_template(t, "none to string",
        "{{ x|string }}",
        {{"x", nullptr}},
@@ -2458,4 +2503,12 @@ static void test_fuzzing(testing & t) {
            t.assert_true("builtin " + type_name + "." + fn_name + " #" + std::to_string(i), fuzz_test_template(tmpl, vars));
        }
    });
+
+    t.test("tojson ensure_ascii=true with invalid utf-8", [&](testing & t) {
+        t.assert_true("invalid utf-8 does not crash",
+            fuzz_test_template(
+                "{{ data|tojson(ensure_ascii=true) }}",
+                {{"data", std::string("hello\xfe\xffworld")}}
+            ));
+    });
 }
@@ -17,6 +17,7 @@ add_library(mtmd
            models/models.h
            models/cogvlm.cpp
            models/conformer.cpp
+            models/dotsocr.cpp
            models/gemma4v.cpp
            models/glm4v.cpp
            models/hunyuanocr.cpp
@@ -266,6 +266,7 @@ enum projector_type {
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_COGVLM,
    PROJECTOR_TYPE_JANUS_PRO,
+    PROJECTOR_TYPE_DOTS_OCR,
    PROJECTOR_TYPE_DEEPSEEKOCR,
    PROJECTOR_TYPE_LFM2A,
    PROJECTOR_TYPE_GLM4V,
@@ -308,6 +309,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
    { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
+    { PROJECTOR_TYPE_DOTS_OCR,  "dots_ocr"},
    { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
@@ -853,6 +853,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_pixtral>(ctx, img);
            } break;
+        case PROJECTOR_TYPE_DOTS_OCR:
+            {
+                builder = std::make_unique<clip_graph_dotsocr>(ctx, img);
+            } break;
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
            {
@@ -1269,6 +1273,14 @@ struct clip_model_loader {
                        get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
                        hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                    } break;
+                case PROJECTOR_TYPE_DOTS_OCR:
+                    {
+                        hparams.rope_theta = 10000.0f;
+                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge);
+                        get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
+                        get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
+                        hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
+                    } break;
                case PROJECTOR_TYPE_KIMIVL:
                    {
                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
@@ -1983,6 +1995,17 @@ struct clip_model_loader {
                    model.mm_input_norm_w   = get_tensor(TN_MM_INP_NORM, false);
                    model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
                } break;
+            case PROJECTOR_TYPE_DOTS_OCR:
+                {
+                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                    model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
+                    model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
+                    // post_trunk_norm: applied after all ViT blocks, before the merger
+                    model.post_ln_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
+                } break;
            case PROJECTOR_TYPE_ULTRAVOX:
                {
                    model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@@ -2763,6 +2786,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                n_patches = x_patch * y_patch;
            } break;
        case PROJECTOR_TYPE_PADDLEOCR:
+        case PROJECTOR_TYPE_DOTS_OCR:
            {
                // dynamic size
                int n_merge = ctx->model.hparams.n_merge;
@@ -3071,6 +3095,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                    }
                }

+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_DOTS_OCR:
+            {
+                const int pw = image_size_width / patch_size;
+                const int ph = image_size_height / patch_size;
+                const int n_pos = ph * pw;
+                std::vector<int> positions(n_pos * 4);
+                int ptr = 0;
+
+                // flat layout: [h, w, h, w] for each patch
+                // patches are in raster order (matching conv2d output)
+                for (int y = 0; y < ph; y++) {
+                    for (int x = 0; x < pw; x++) {
+                        positions[          ptr] = y;
+                        positions[  n_pos + ptr] = x;
+                        positions[2*n_pos + ptr] = y;
+                        positions[3*n_pos + ptr] = x;
+                        ptr++;
+                    }
+                }
+
                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_QWEN25VL:
@@ -3388,6 +3434,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_PHI4:
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
+        case PROJECTOR_TYPE_DOTS_OCR:
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_MLP_NORM:
            return ctx->model.mm_3_b->ne[0];
@@ -0,0 +1,49 @@
+#include "models.h"
+
+// Builds the compute graph for the dots.ocr vision tower:
+//   1. ViT trunk (RMS norm) with multi-section rotary position embedding
+//   2. patch merger: input norm + merge of n_merge x n_merge patches
+//   3. two-layer MLP projector (mm_0 -> GELU(erf) -> mm_2)
+// Returns the graph `gf`, expanded up to the projector output.
+ggml_cgraph * clip_graph_dotsocr::build() {
+    const int n_pos            = n_patches;
+    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
+
+    // note: similar to PaddleOCR
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+    // graph input named "positions"; the encode path fills it as four planes
+    // of n_pos entries each, laid out as [h, w, h, w]
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // rope base is taken from hparams instead of being hard-coded, so it stays
+    // in sync with the value the model loader sets for this projector (10000)
+    const float rope_freq_base = hparams.rope_theta;
+
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        return ggml_rope_multi(
+                    ctx0, cur, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
+                    32768, rope_freq_base, 1, 0, 1, 32, 1);
+    };
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * cur = build_vit(
+                            inp, n_patches,
+                            NORM_TYPE_RMS,
+                            hparams.ffn_op,
+                            nullptr, // no learned position embedding; positions are applied via add_pos
+                            add_pos);
+
+    cb(cur, "vit_out", -1);
+
+    // dots.ocr patch merger + projector
+    {
+        // n_merge is read from the GGUF by the loader; it must be set before merging
+        GGML_ASSERT(hparams.n_merge > 0);
+        cur = build_norm(cur, model.mm_input_norm_w, model.mm_input_norm_b, NORM_TYPE_NORMAL, 1e-6, -1);
+        cur = build_patch_merge_permute(cur, hparams.n_merge);
+        cb(cur, "after_patch_merger", -1);
+        cur = build_ffn(cur,
+            model.mm_0_w, model.mm_0_b,
+            nullptr, nullptr, // no gate
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU_ERF, -1); // nn.GELU() defaults to exact erf-based GELU
+        cb(cur, "after_projector", -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
@@ -73,6 +73,11 @@ struct clip_graph_paddleocr : clip_graph {
    ggml_cgraph * build() override;
 };

+struct clip_graph_dotsocr : clip_graph { // graph builder for the dots.ocr vision encoder
+    clip_graph_dotsocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} // forwards ctx and img to the clip_graph base
+    ggml_cgraph * build() override; // defined out of line as clip_graph_dotsocr::build()
+};
+
 struct clip_graph_cogvlm : clip_graph {
    clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
@@ -375,6 +375,13 @@ struct mtmd_context {
                    img_end = "<|im_end|>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_longest_edge>(ctx_v);
                } break;
+            case PROJECTOR_TYPE_DOTS_OCR:
+                {
+                    // <|img|> ... (image embeddings) ... <|endofimg|>
+                    img_beg = "<|img|>";
+                    img_end = "<|endofimg|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
            case PROJECTOR_TYPE_NEMOTRON_V2_VL:
                {
                    image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
@@ -89,6 +89,7 @@ add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
 add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
 add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
 add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
+add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
 add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"

 add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
@@ -18,7 +18,7 @@
 		<div style="display: contents">
 			<script>
 				{
-					__sveltekit_nl4lme = {
+					__sveltekit_6n4hpv = {
 						base: new URL('.', location).pathname.slice(0, -1)
 					};

@@ -62,10 +62,14 @@
 		chatStore.getConversationModel(activeMessages() as DatabaseMessage[])
 	);

+	let lastSyncedConversationModel: string | null = null;
+
 	$effect(() => {
-		if (conversationModel) {
+		if (conversationModel && conversationModel !== lastSyncedConversationModel) {
+			lastSyncedConversationModel = conversationModel;
 			modelsStore.selectModelByName(conversationModel);
 		} else if (isRouter && !modelsStore.selectedModelId && modelsStore.loadedModelIds.length > 0) {
+			lastSyncedConversationModel = null;
 			// auto-select the first loaded model only when nothing is selected yet
 			const first = modelOptions().find((m) => modelsStore.loadedModelIds.includes(m.model));
 			if (first) modelsStore.selectModelById(first.id);
@@ -77,6 +77,11 @@
 			!modelsStore.isModelLoaded(modelsStore.selectedModelName)
 		) {
 			modelsStore.clearSelection();
+
+			const first = modelOptions().find((m) => modelsStore.loadedModelIds.includes(m.model));
+			if (first) {
+				await modelsStore.selectModelById(first.id);
+			}
 		}

 		// Handle URL params only if we have ?q= or ?model= or ?new_chat=true
Author	SHA1	Message	Date
Xuan-Son Nguyen	057dba336e	model: fix multimodal padding token for gemma3n/gemma4 (#21625 ) * model: fix multimodal padding token for gemma3n/gemma4 * nits	2026-04-09 12:18:23 +02:00
Xuan-Son Nguyen	501aeed18f	mtmd: support dots.ocr (#17575 ) * convert gguf * clip impl * fix conversion * wip * corrections * update docs * add gguf to test script	2026-04-09 12:16:38 +02:00
Piotr Wilkin (ilintar)	0ec191e1d7	vocab: add gemma4 tokenizer tests, fix edge case (#21534 ) * YATF (Yet Another Tokenizer Fix) for Gemma 4. With tests! * Remove unnecessary hash from update script. * minor: move constant	2026-04-09 11:41:14 +02:00
Kwa Jie Hao	243532e556	jinja : support ensure_ascii=true, string repetition and int/float self-filtering (#21623 ) * feat: jinja engine improvements for reka-edge Port three Jinja engine improvements needed for the reka-edge model: 1. Python-style string repetition ("ab" * 3 → "ababab") 2. ensure_ascii=true support for tojson filter (escapes non-ASCII to \uXXXX) 3. int() builtin on value_int_t (identity, needed for Reka Edge template) * fix: escape invalid utf8 bytes when ensure_ascii=true The json_ensure_ascii_preserving_format function does not correctly handle an edge case where if UTF-8 parsing fails, it adds the non-ascii character back to the output as a raw byte. This commit fixes that by adding the unicode standard replacement character \\ufffd to the output instead. This is the standard behavior for various programming languages like Python, Rust, Go, etc. * chore: address PR comments 1. Add todo comment for supporting string repetition for array/tuples 2. Add support for float identity operation 3. Move invalid ascii test case to test_fuzzing * chore: accept suggestion for common/jinja/value.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>	2026-04-09 11:28:33 +02:00
Georgi Gerganov	5e9c635463	metal : add missing mm-id specializations for q1_0 (#21662 )	2026-04-09 10:54:00 +03:00
Aleksander Grygier	9949ad08f6	fix: Model Selector choice sync (#21628 )	2026-04-09 09:46:27 +02:00