Compare commits

...

10 Commits

Author SHA1 Message Date
Samanvya Tripathi af5c13841f common : fix tool call type detection for nullable and enum schemas (#21327)
* common : fix tool call type detection for nullable and enum schemas

* common, tests : fix grammar delegation for nullable/enum schemas and add tests

Fix enum type inference to scan all enum values (not just index 0) so
schemas like {"enum": [0, "celsius"]} correctly detect string type.

Fix schema_delegates in peg-parser to handle nullable type arrays
(["string", "null"]) and typeless enum schemas in raw mode, allowing
the tagged parser to use raw text instead of JSON-formatted strings.

Add test cases for Qwen3-Coder (TAG_WITH_TAGGED format):
- nullable string ["string", "null"]
- nullable string with null first ["null", "string"]
- nullable integer ["integer", "null"]
- enum without explicit type key
2026-04-03 17:51:23 +02:00
M1DNYT3 277ff5fff7 docker : bump cuda12 to 12.9.1 (#20920)
Co-authored-by: M1DNYT3 <m1dnyt3@MacBookPro.lan>
Co-authored-by: CISC <CISC@users.noreply.github.com>
2026-04-03 15:06:45 +02:00
jeromew 384c0076bc docs: Update build.md: HSA_OVERRIDE_GFX_VERSION clarification (#21331)
The `HSA_OVERRIDE_GFX_VERSION` variable can be used in ROCm to override an unsupported target architecture with a similar but supported target architecture.

This does not and has never worked on Windows. I think the clarification could avoid driving Windows people towards this solution that does not work.
2026-04-03 21:05:14 +08:00
Sigbjørn Skjæret 1f34806c44 jinja: coerce input for string-specific filters (#21370) 2026-04-03 15:03:33 +02:00
Aaron Teo 887535c33f ci: add more binary checks (#21349) 2026-04-03 20:50:00 +08:00
Piotr Wilkin (ilintar) d3416a4aa9 fix: remove stale assert (#21369) 2026-04-03 13:40:41 +02:00
uvos 43a4ee4a2c HIP: build eatch ci build test for a different architecture (#21337)
This helps improve our chances of finding build failures before the release workflow
builds for all architectures.
2026-04-03 11:38:22 +02:00
Tillerino f851fa5ab0 fix: add openssl to nix dependencies (#21353) (#21355) 2026-04-03 12:21:07 +03:00
Vishal Singh f1ac84119c ggml-zendnn : add MUL_MAT_ID op support for MoE models (#21315)
* ggml-zendnn : add MUL_MAT_ID op support for MoE models
- Add MUL_MAT_ID op acceleration for Mixture-of-Experts models
- MUL_MAT_ID op fallback to CPU backend if total experts > 32
- Point ZenDNN lib to latest bits ZenDNN-2026-WW13

* ggml-zendnn : add braces to sgemm failure condition for consistency

Co-authored-by: Aaron Teo <taronaeo@gmail.com>

---------

Co-authored-by: Aaron Teo <taronaeo@gmail.com>
2026-04-03 12:19:08 +03:00
Piotr Wilkin (ilintar) b069b10ab4 vocab: fix Gemma4 tokenizer (#21343)
* seems to work

* fix case with new line

Co-authored-by: sayap <sokann@gmail.com>

* gemma 4: fix pre tok regex

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: sayap <sokann@gmail.com>
2026-04-03 10:33:03 +02:00
23 changed files with 3295 additions and 7363 deletions
-97
View File
@@ -1,97 +0,0 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=13.1.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
WORKDIR /app
COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so*" -exec cp -P {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]
+3 -2
View File
@@ -16,7 +16,7 @@
rocmPackages,
vulkan-headers,
vulkan-loader,
curl,
openssl,
shaderc,
useBlas ?
builtins.all (x: !x) [
@@ -160,7 +160,8 @@ effectiveStdenv.mkDerivation (finalAttrs: {
++ optionals useMpi [ mpi ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ [ openssl ];
cmakeFlags =
[
+2
View File
@@ -472,6 +472,7 @@ jobs:
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGPU_TARGETS="gfx1030" \
-DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
@@ -990,6 +991,7 @@ jobs:
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGPU_TARGETS="gfx1100" `
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+4 -4
View File
@@ -73,10 +73,10 @@ jobs:
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.9.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.9.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
+1 -1
View File
@@ -59,7 +59,7 @@ jobs:
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGPU_TARGETS=gfx908 \
-DGPU_TARGETS=gfx942 \
-DGGML_HIP=ON \
-DGGML_HIP_EXPORT_METRICS=Off \
-DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \
+30 -2
View File
@@ -221,7 +221,7 @@ function gg_run_ctest_debug {
set -e
# Check cmake and ctest are installed
# Check required binaries are installed
gg_check_build_requirements
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
@@ -252,7 +252,7 @@ function gg_run_ctest_release {
set -e
# Check cmake and ctest are installed
# Check required binaries are installed
gg_check_build_requirements
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
@@ -627,10 +627,38 @@ function gg_sum_rerank_tiny {
}
function gg_check_build_requirements {
if ! command -v git &> /dev/null; then
gg_printf 'git not found, please install'
fi
if ! command -v git-lfs &> /dev/null; then
gg_printf 'git-lfs not found, please install'
fi
if ! command -v wget &> /dev/null; then
gg_printf 'wget not found, please install'
fi
if ! command -v python3 &> /dev/null; then
gg_printf 'python3 not found, please install'
fi
if ! command -v pip3 &> /dev/null; then
gg_printf 'pip3 not found, please install'
fi
if ! python3 -m ensurepip --help &> /dev/null; then
gg_printf 'ensurepip not found, please install python3-venv package'
fi
if ! command -v cmake &> /dev/null; then
gg_printf 'cmake not found, please install'
fi
if ! command -v ccache &> /dev/null; then
gg_printf 'ccache not found, please consider installing for faster builds'
fi
if ! command -v ctest &> /dev/null; then
gg_printf 'ctest not found, please install'
fi
+55 -9
View File
@@ -400,12 +400,34 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
for (const auto & [param_name, param_schema] : properties.items()) {
bool is_required = required.find(param_name) != required.end();
std::string type = "object";
auto type_obj = param_schema.contains("type") ? param_schema.at("type") : json::object();
if (type_obj.is_string()) {
type_obj.get_to(type);
} else if (type_obj.is_object()) {
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
type_obj.at("type").get_to(type);
if (param_schema.contains("type")) {
const auto & type_obj = param_schema.at("type");
if (type_obj.is_string()) {
type_obj.get_to(type);
} else if (type_obj.is_array()) {
// Handle nullable types like ["string", "null"]
for (const auto & t : type_obj) {
if (t.is_string() && t.get<std::string>() != "null") {
type = t.get<std::string>();
break;
}
}
} else if (type_obj.is_object()) {
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
type_obj.at("type").get_to(type);
}
}
}
// Infer string type from enum values when type is unspecified
if (type == "object" && param_schema.contains("enum")) {
const auto & enum_vals = param_schema.at("enum");
if (enum_vals.is_array()) {
for (const auto & v : enum_vals) {
if (v.is_string()) {
type = "string";
break;
}
}
}
}
@@ -574,9 +596,33 @@ common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_
std::vector<arg_entry> arg_entries;
for (const auto & [param_name, param_schema] : properties.items()) {
std::string type = "object";
auto type_v = param_schema.contains("type") ? param_schema.at("type") : json::object();
if (type_v.is_string()) type_v.get_to(type);
std::string type = "object";
if (param_schema.contains("type")) {
const auto & type_v = param_schema.at("type");
if (type_v.is_string()) {
type_v.get_to(type);
} else if (type_v.is_array()) {
// Handle nullable types like ["string", "null"]
for (const auto & t : type_v) {
if (t.is_string() && t.get<std::string>() != "null") {
type = t.get<std::string>();
break;
}
}
}
}
// Infer string type from enum values when type is unspecified
if (type == "object" && param_schema.contains("enum")) {
const auto & enum_vals = param_schema.at("enum");
if (enum_vals.is_array()) {
for (const auto & v : enum_vals) {
if (v.is_string()) {
type = "string";
break;
}
}
}
}
common_peg_parser value_parser = p.eps();
if (type == "string") {
+13
View File
@@ -306,6 +306,19 @@ value filter_expression::execute_impl(context & ctx) {
filter_id = "strip"; // alias
}
JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
// TODO: Refactor filters so this coercion can be done automatically
if (!input->is_undefined() && !is_val<value_string>(input) && (
filter_id == "capitalize" ||
filter_id == "lower" ||
filter_id == "replace" ||
filter_id == "strip" ||
filter_id == "title" ||
filter_id == "upper" ||
filter_id == "wordcount"
)) {
JJ_DEBUG("Coercing %s to String for '%s' filter", input->type().c_str(), filter_id.c_str());
input = mk_val<value_string>(input->as_string());
}
return try_builtin_func(ctx, filter_id, input)->invoke(func_args(ctx));
} else if (is_stmt<call_expression>(filter)) {
+16 -16
View File
@@ -465,8 +465,9 @@ const func_builtins & value_int_t::get_builtins() const {
double val = static_cast<double>(args.get_pos(0)->as_int());
return mk_val<value_float>(val);
}},
{"tojson", tojson},
{"safe", tojson},
{"string", tojson},
{"tojson", tojson},
};
return builtins;
}
@@ -485,8 +486,9 @@ const func_builtins & value_float_t::get_builtins() const {
int64_t val = static_cast<int64_t>(args.get_pos(0)->as_float());
return mk_val<value_int>(val);
}},
{"tojson", tojson},
{"safe", tojson},
{"string", tojson},
{"tojson", tojson},
};
return builtins;
}
@@ -771,6 +773,11 @@ const func_builtins & value_string_t::get_builtins() const {
const func_builtins & value_bool_t::get_builtins() const {
static const func_handler tostring = [](const func_args & args) -> value {
args.ensure_vals<value_bool>();
bool val = args.get_pos(0)->as_bool();
return mk_val<value_string>(val ? "True" : "False");
};
static const func_builtins builtins = {
{"default", default_value},
{"int", [](const func_args & args) -> value {
@@ -783,11 +790,8 @@ const func_builtins & value_bool_t::get_builtins() const {
bool val = args.get_pos(0)->as_bool();
return mk_val<value_float>(val ? 1.0 : 0.0);
}},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_bool>();
bool val = args.get_pos(0)->as_bool();
return mk_val<value_string>(val ? "True" : "False");
}},
{"safe", tostring},
{"string", tostring},
{"tojson", tojson},
};
return builtins;
@@ -1100,18 +1104,14 @@ const func_builtins & value_object_t::get_builtins() const {
}
const func_builtins & value_none_t::get_builtins() const {
static const func_handler tostring = [](const func_args &) -> value {
return mk_val<value_string>("None");
};
static const func_builtins builtins = {
{"default", default_value},
{"tojson", tojson},
{"string", [](const func_args &) -> value {
return mk_val<value_string>("None");
}},
{"safe", [](const func_args &) -> value {
return mk_val<value_string>("None");
}},
{"strip", [](const func_args &) -> value {
return mk_val<value_string>("None");
}},
{"string", tostring},
{"safe", tostring},
{"items", empty_value_fn<value_array>},
{"map", empty_value_fn<value_array>},
{"reject", empty_value_fn<value_array>},
+17 -1
View File
@@ -1561,7 +1561,23 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
if (!s.schema) {
return true;
}
if (s.raw && s.schema->contains("type") && s.schema->at("type").is_string() && s.schema->at("type") == "string") {
if (s.raw && s.schema->contains("type")) {
const auto & type_val = s.schema->at("type");
if (type_val.is_string() && type_val == "string") {
return true;
}
// Handle nullable types like ["string", "null"] - delegate when the
// non-null type is string, since the tagged format uses raw text
if (type_val.is_array()) {
for (const auto & t : type_val) {
if (t.is_string() && t.get<std::string>() != "null") {
return t.get<std::string>() == "string";
}
}
}
}
// Delegate for enum schemas in raw mode - enum values are literal strings
if (s.raw && !s.schema->contains("type") && s.schema->contains("enum")) {
return true;
}
return false;
-3
View File
@@ -7464,9 +7464,6 @@ class Gemma4Model(Gemma3Model):
assert len(tokens) == vocab.vocab_size
# TODO @ngxson : there are some known (rare) issues with the tokenizer during development
# but I don't have time to dive into them right now;
# using a dedicated tokenizer name so that we can fix later without re-converting GGUF
self.gguf_writer.add_tokenizer_model("gemma4")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
+5 -4
View File
@@ -57,13 +57,14 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based
## Supported Operations
The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** operations only. Other operations are handled by the standard CPU backend.
The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert-based matrix multiplication (MUL_MAT_ID)** operations. Other operations are handled by the standard CPU backend.
| Operation | Status | Notes |
|:-------------|:-------:|:----------------------------------------------:|
| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul |
| MUL_MAT_ID | Support | Accelerated via ZenDNN LowOHA MatMul (MoE) |
*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
*Note:* Since MUL_MAT and MUL_MAT_ID are accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs and Mixture-of-Experts models).
## DataType Supports
@@ -181,7 +182,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen
## Known Issues
- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) is accelerated via ZenDNN. Other operations fall back to the standard CPU backend.
- **Limited operation support**: Currently matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations.
- **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
@@ -216,4 +217,4 @@ Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-t
## TODO
- Expand operation support beyond MUL_MAT (attention operations, activations, etc.)
- Expand operation support beyond MUL_MAT and MUL_MAT_ID (attention operations, activations, etc.)
+1 -1
View File
@@ -389,7 +389,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. Note that [`HSA_OVERRIDE_GFX_VERSION`] is [not supported on Windows](https://github.com/ROCm/ROCm/issues/2654)
### Unified Memory
+1 -1
View File
@@ -68,7 +68,7 @@ Legend:
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | | ❌ |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | 🟡 | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+2773 -7213
View File
File diff suppressed because it is too large Load Diff
+1 -1
View File
@@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
ExternalProject_Add(
zendnn
GIT_REPOSITORY https://github.com/amd/ZenDNN.git
GIT_TAG a18adf8c605fb5f5e52cefd7eda08a7b18febbaf # ZenDNN-2026-WW08
GIT_TAG f79f7321a1add65ced6397a6bfab7edba6e3e14e # ZenDNN-2026-WW13
PREFIX ${ZENDNN_PREFIX}
SOURCE_DIR ${ZENDNN_SOURCE_DIR}
BINARY_DIR ${ZENDNN_BUILD_DIR}
+179
View File
@@ -190,6 +190,170 @@ static void ggml_zendnn_compute_forward_mul_mat(
}
}
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
static void ggml_zendnn_compute_forward_mul_mat_id(
ggml_backend_zendnn_context * ctx,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // expert weights
const ggml_tensor * src1 = dst->src[1]; // inputs
const ggml_tensor * ids = dst->src[2]; // expert ids
GGML_TENSOR_BINARY_OP_LOCALS
// exit for no tokens to process
if (ne2 == 0 || ne11 == 0) {
return;
}
ggml_type const vec_dot_type = src0->type;
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float_ref;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ne03 == 1);
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(ne3 == 1);
// row groups
const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = ne02; // n_experts
std::vector<int64_t> matrix_row_counts(n_as, 0);
std::vector<std::vector<mmid_row_mapping>> matrix_rows(n_as);
int64_t max_rows = 0;
// group rows by expert (preprocessing step)
for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
for (int id = 0; id < n_ids; ++id) {
const int32_t i02 = *(const int32_t *)((const char *)ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
matrix_rows[i02].push_back({id, iid1});
matrix_row_counts[i02]++;
if (matrix_row_counts[i02] > max_rows) {
max_rows = matrix_row_counts[i02];
}
}
}
if (max_rows == 0) {
return; // no rows to process
}
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
// size for converting src1 rows to vec_dot_type if needed
const size_t nbw1 = row_size;
const size_t nbw2 = nbw1 * ne11;
const size_t nbw3 = nbw2 * ne12;
const size_t src1_conv_size = (src1->type != vec_dot_type) ? ne13 * nbw3 : 0;
// size for MoE gather/scatter buffers
const size_t wdata_cur_size = max_rows * row_size;
const size_t dst_cur_size = max_rows * ggml_row_size(dst->type, ne01);
// allocate single buffer for all needs
const size_t total_size = src1_conv_size + wdata_cur_size + dst_cur_size;
if (ctx->work_size < total_size) {
ctx->work_data.reset(new char[total_size]);
ctx->work_size = total_size;
}
// partition the buffer
char * work_data = ctx->work_data.get();
char * wdata_cur = work_data + src1_conv_size;
char * dst_cur = wdata_cur + wdata_cur_size;
if (src1->type != vec_dot_type) {
GGML_ASSERT(src1->type == GGML_TYPE_F32);
#pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static)
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
const float * src1_f32 = (float *)((char *)src1->data + i11*nb11 + i12*nb12 + i13*nb13);
void * src1_conv = (char *)work_data + i11*nbw1 + i12*nbw2 + i13*nbw3;
from_float(src1_f32, src1_conv, ne10);
}
}
}
}
const void * wdata = src1->type == vec_dot_type ? src1->data : work_data;
// process each expert with gather -> gemm -> scatter pattern
for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
if (cne1 == 0) {
continue;
}
const char * src0_cur = (const char *) src0->data + cur_a*nb02;
// gather input rows for this expert
#pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
const int64_t id = row_mapping.i1;
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2;
std::memcpy(
wdata_cur + ir1 * row_size,
(const char *) wdata + (i11 + i12*ne11) * row_size,
row_size
);
}
// batched gemm for all tokens in this expert
if (!ggml_zendnn_sgemm(ctx,
ne01, // m
cne1, // n
ne10, // k
src0_cur,
ne00, // lda
wdata_cur,
ne10, // ldb
dst_cur,
ne01, // ldc
src0->type,
vec_dot_type,
dst->type)) {
GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
}
// scatter output rows to destination
#pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
const int64_t id = row_mapping.i1;
const int64_t i1 = id;
const int64_t i2 = row_mapping.i2;
std::memcpy(
(char *) dst->data + i1*nb1 + i2*nb2,
dst_cur + ir1 * ggml_row_size(dst->type, ne01),
ggml_row_size(dst->type, ne01)
);
}
}
}
// backend interface
static const char * ggml_backend_zendnn_get_name(ggml_backend_t backend) {
@@ -218,6 +382,9 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm
case GGML_OP_MUL_MAT:
ggml_zendnn_compute_forward_mul_mat(ctx, node);
break;
case GGML_OP_MUL_MAT_ID:
ggml_zendnn_compute_forward_mul_mat_id(ctx, node);
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
@@ -361,6 +528,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
return true;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
const ggml_tensor * weights = op->src[0];
const ggml_tensor * inputs = op->src[1];
@@ -374,6 +542,17 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
ne0 < min_batch || ne1 < min_batch || ne10 < min_batch) {
return false;
}
// MUL_MAT_ID performs best with a moderate number of experts due to its
// gather + batched matmul + scatter approach. Future versions will leverage
// ZenDNN's grouped_gemm for better scalability with larger expert counts:
// https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_group_gemm_operator.md
if (op->op == GGML_OP_MUL_MAT_ID) {
const int64_t n_experts = weights->ne[2];
const int64_t max_experts = 32;
if (n_experts > max_experts) {
return false;
}
}
switch (weights->type) {
case GGML_TYPE_F32:
case GGML_TYPE_BF16:
+61 -5
View File
@@ -493,6 +493,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
// Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
// normalizer, then BPE merges run on the whole text without
// word-level pre-splitting. We only need to split on newlines
// since BPE merge lookup asserts no newlines in tokens.
regex_exprs = {
"[^\\n]+|[\\n]+",
};
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@@ -506,6 +516,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
}
std::vector<std::string> regex_exprs;
bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
};
struct llm_tokenizer_bpe_session {
@@ -550,9 +561,10 @@ struct llm_tokenizer_bpe_session {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
symbols_final.clear();
auto tok_pre = vocab.get_pre_type();
for (const auto & word : word_collection) {
work_queue = llm_bigram_bpe::queue();
@@ -565,6 +577,13 @@ struct llm_tokenizer_bpe_session {
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
} else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
// fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
auto tok = vocab.text_to_token(word);
if (tok != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
}
}
while (offset < word.size()) {
@@ -1864,7 +1883,31 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_pad_id = 3; // <|plamo:pad|>
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "gemma4") {
type = LLAMA_VOCAB_TYPE_SPM;
type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}
{
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
std::string first;
std::string second;
const size_t pos = word.find(' ', 1);
if (pos != std::string::npos) {
first = word.substr(0, pos);
second = word.substr(pos + 1);
}
bpe_ranks.emplace(std::make_pair(first, second), i);
}
}
// default special tokens (to be read from GGUF)
special_bos_id = LLAMA_TOKEN_NULL;
@@ -1874,7 +1917,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
tokenizer_pre = "gemma4";
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
@@ -1882,6 +1925,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// for now, only BPE models have pre-tokenizers
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
escape_whitespaces = false;
clean_spaces = true;
if (tokenizer_pre.empty()) {
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
@@ -1948,6 +1992,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if (
tokenizer_pre == "jais-2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
} else if (
tokenizer_pre == "gemma4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
escape_whitespaces = true;
} else if (
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-code" ||
@@ -3045,6 +3093,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
if (escape_whitespaces) {
llama_escape_whitespace(text);
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
@@ -3224,6 +3276,12 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
return _try_copy(token_text.data(), token_text.size());
}
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
if (escape_whitespaces) {
// SPM-style BPE: tokens contain ▁ for spaces
std::string result = token_text;
llama_unescape_whitespace(result);
return _try_copy(result.data(), result.size());
}
std::string result = llama_decode_text(token_text);
return _try_copy(result.data(), result.size());
}
@@ -3654,9 +3712,7 @@ int llama_vocab::max_token_len() const {
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
GGML_ASSERT(token_left.find(' ') == std::string::npos);
GGML_ASSERT(token_left.find('\n') == std::string::npos);
GGML_ASSERT(token_right.find(' ') == std::string::npos);
GGML_ASSERT(token_right.find('\n') == std::string::npos);
auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == pimpl->bpe_ranks.end()) {
+1
View File
@@ -58,6 +58,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
};
struct LLM_KV;
+6 -2
View File
@@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
return false;
}
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
{ "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
start += offset;
}
return unicode_byte_encoding_process(bpe_words);
if (byte_encode) {
return unicode_byte_encoding_process(bpe_words);
}
return bpe_words;
}
+1 -1
View File
@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
bool unicode_cpt_is_han(uint32_t cpt);
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode = true);
+113
View File
@@ -657,6 +657,66 @@ static common_chat_tool imaginary_number_tool{
})",
};
static common_chat_tool nullable_string_tool{
/* .name = */ "set_nullable_str",
/* .description = */ "Set a nullable string value",
/* .parameters = */ R"({
"type": "object",
"properties": {
"name": {
"type": ["string", "null"],
"description": "A nullable string"
}
},
"required": ["name"]
})",
};
static common_chat_tool nullable_string_null_first_tool{
/* .name = */ "set_nullable_str_nf",
/* .description = */ "Set a nullable string value with null first in type array",
/* .parameters = */ R"({
"type": "object",
"properties": {
"name": {
"type": ["null", "string"],
"description": "A nullable string with null first"
}
},
"required": ["name"]
})",
};
static common_chat_tool nullable_int_tool{
/* .name = */ "set_nullable_int",
/* .description = */ "Set a nullable integer value",
/* .parameters = */ R"({
"type": "object",
"properties": {
"count": {
"type": ["integer", "null"],
"description": "A nullable integer"
}
},
"required": ["count"]
})",
};
static common_chat_tool enum_no_type_tool{
/* .name = */ "set_unit",
/* .description = */ "Set a temperature unit",
/* .parameters = */ R"({
"type": "object",
"properties": {
"unit": {
"enum": ["celsius", "fahrenheit"],
"description": "Temperature unit"
}
},
"required": ["unit"]
})",
};
static common_chat_tool string_param_tool{
/* .name = */ "string_param",
/* .description = */ "Tool with string parameter for testing",
@@ -2200,6 +2260,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
}
})
.run();
}
{
@@ -2383,6 +2444,58 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
})
.expect_reconstruction()
.run();
// nullable string type ["string", "null"]
tst.test(
"<tool_call>\n"
"<function=set_nullable_str>\n"
"<parameter=name>\nhello world\n</parameter>\n"
"</function>\n"
"</tool_call>")
.tools({ nullable_string_tool })
.expect_tool_calls({
{ "set_nullable_str", R"({"name": "hello world"})", {} },
})
.run();
// nullable string with null first in type array ["null", "string"]
tst.test(
"<tool_call>\n"
"<function=set_nullable_str_nf>\n"
"<parameter=name>\nhello world\n</parameter>\n"
"</function>\n"
"</tool_call>")
.tools({ nullable_string_null_first_tool })
.expect_tool_calls({
{ "set_nullable_str_nf", R"({"name": "hello world"})", {} },
})
.run();
// nullable integer type ["integer", "null"] - should use JSON value path, not string
tst.test(
"<tool_call>\n"
"<function=set_nullable_int>\n"
"<parameter=count>\n42\n</parameter>\n"
"</function>\n"
"</tool_call>")
.tools({ nullable_int_tool })
.expect_tool_calls({
{ "set_nullable_int", R"({"count": 42})", {} },
})
.run();
// enum without explicit type key - should infer string from enum values
tst.test(
"<tool_call>\n"
"<function=set_unit>\n"
"<parameter=unit>\ncelsius\n</parameter>\n"
"</function>\n"
"</tool_call>")
.tools({ enum_no_type_tool })
.expect_tool_calls({
{ "set_unit", R"({"unit": "celsius"})", {} },
})
.run();
}
{
auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-V3.1.jinja", detailed_debug);
+12
View File
@@ -523,6 +523,18 @@ static void test_filters(testing & t) {
"hello"
);
test_template(t, "upper array",
"{{ items|upper }}",
{{"items", json::array({"hello", "world"})}},
"['HELLO', 'WORLD']"
);
test_template(t, "upper dict",
"{{ items|upper }}",
{{"items", {{"hello", "world"}}}},
"{'HELLO': 'WORLD'}"
);
test_template(t, "capitalize",
"{{ 'heLlo World'|capitalize }}",
json::object(),