Compare commits

...

6 Commits

Author SHA1 Message Date
Acly 10640e31aa ggml : fix interpolate with align-corners and ne=1 (#16700)
* ggml : fix interpolate with align-corners and ne=1

* avoid division by zero if one of the spatial dimensions is 1
* cpu, cuda, opencl returned correct result anyway due to clamp
* vulkan didn't clamp for align-corners so results were broken

* fix clang warning
2025-10-27 21:50:22 +01:00
Johannes Gäßler 80d28f104c HIP: fix AMDGPU_TARGETS, update documentation (#16803) 2025-10-27 21:39:49 +01:00
Xuan-Son Nguyen c55d53acec model : add LightOnOCR-1B model (#16764)
* model : add LightOnOCR-1B model

* add test
2025-10-27 16:02:58 +01:00
Johannes Gäßler 945501f5ea llama: fix leaked buffers for mmap + split files (#16765) 2025-10-27 09:17:31 +01:00
Aman Gupta 75cbdd3fce test-backend-ops: print failed tests at the end (#16785) 2025-10-27 09:25:10 +08:00
tamarPal 2b9bd9bf4e sycl: add ROLL operation support (#16665)
* sycl: add ROLL operation support

- Implement ggml_sycl_roll function for F32 tensors
- Add multi-axis roll operation with SYCL kernel
- Support all 4 tensor dimensions with proper shift normalization
- Add roll.cpp and roll.hpp to SYCL backend
- Update backend dispatch and supports_op for GGML_OP_ROLL
- Tests: 17662/17662 pass with identical CPU reference results

* fix: remove trailing whitespace from roll.cpp

- Fix EditorConfig violations in ggml/src/ggml-sycl/roll.cpp
- Remove trailing spaces from lines 6, 11, 28, 47, 58, 60

* ci: retrigger

* sycl: remove wait() calls from ROLL operation

* fix: editorconfig — LF endings + final newline for roll.hpp

---------

Co-authored-by: tamarPal <tamarPal@example.com>
2025-10-27 09:20:24 +08:00
19 changed files with 301 additions and 70 deletions
+24 -2
View File
@@ -2460,18 +2460,21 @@ class ArceeModel(LlamaModel):
)
class LlavaVisionModel(MmprojModel):
img_break_tok_id = -1
use_break_tok = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams.get("model_type") == "pixtral":
# layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
if self.use_break_tok:
self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
elif self.is_mistral_format:
# hparams is already vision config here so norm_eps is only defined in global_config.
self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
if self.use_break_tok:
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
else:
raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
logger.info(f"Image break token id: {self.img_break_tok_id}")
@@ -3962,6 +3965,10 @@ class Qwen3Model(Qwen2Model):
return torch.stack([true_row, false_row], dim=0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "model.vision_" in name:
# skip multimodal tensors
return []
if self.is_rerank:
is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
is_real_head = not self.is_tied_embeddings and "lm_head" in name
@@ -9435,6 +9442,21 @@ class PixtralModel(LlavaVisionModel):
return super().map_tensor_name(name, try_suffixes)
@ModelBase.register("LightOnOCRForConditionalGeneration")
class LightOnOCRVisionModel(LlavaVisionModel):
is_mistral_format = False
use_break_tok = False
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
name = name.replace("model.vision_encoder.", "vision_tower.")
name = name.replace("model.vision_projection.", "multi_modal_projector.")
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("KimiVLForConditionalGeneration")
class KimiVLModel(MmprojModel):
def __init__(self, *args, **kwargs):
+6 -4
View File
@@ -261,10 +261,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system.
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
@@ -282,17 +284,17 @@ You can download it from your Linux distro's package manager or from here: [ROCm
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake -S . -B build -G Ninja -DGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
If necessary, adapt `GPU_TARGETS` to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
+2 -2
View File
@@ -7519,8 +7519,8 @@ static void ggml_compute_forward_upscale_f32(
float pixel_offset = 0.5f;
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
pixel_offset = 0.0f;
sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
}
for (int64_t i3 = 0; i3 < ne3; i3++) {
+2 -2
View File
@@ -126,8 +126,8 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
float pixel_offset = 0.5f;
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
sf0 = dst->ne[0] > 1 && src0->ne[0] > 1 ? (float)(dst->ne[0] - 1) / (src0->ne[0] - 1) : sf0;
sf1 = dst->ne[1] > 1 && src0->ne[1] > 1 ? (float)(dst->ne[1] - 1) / (src0->ne[1] - 1) : sf1;
pixel_offset = 0.0f;
}
upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+3 -2
View File
@@ -29,10 +29,11 @@ if (CXX_IS_HIPCC)
endif()
else()
# Forward (AMD)GPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if(AMDGPU_TARGETS AND NOT GPU_TARGETS)
set(GPU_TARGETS ${AMDGPU_TARGETS})
endif()
if(GPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${GPU_TARGETS})
elseif(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
cmake_minimum_required(VERSION 3.21)
enable_language(HIP)
+2 -2
View File
@@ -6156,8 +6156,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
sf0 = (float)(ne0 - 1) / (ne00 - 1);
sf1 = (float)(ne1 - 1) / (ne01 - 1);
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
pixel_offset = 0.0f;
}
+1
View File
@@ -32,6 +32,7 @@
#include "pad.hpp"
#include "quantize.hpp"
#include "quants.hpp"
#include "roll.hpp"
#include "rope.hpp"
#include "set_rows.hpp"
#include "softmax.hpp"
+5
View File
@@ -3921,6 +3921,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_GATED_LINEAR_ATTN:
ggml_sycl_op_gated_linear_attn(ctx, dst);
break;
case GGML_OP_ROLL:
ggml_sycl_roll(ctx, dst);
break;
case GGML_OP_ARANGE:
ggml_sycl_arange(ctx, dst);
break;
@@ -4599,6 +4602,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_RWKV_WKV7:
case GGML_OP_GATED_LINEAR_ATTN:
return true;
case GGML_OP_ROLL:
return op->type == GGML_TYPE_F32;
case GGML_OP_ARANGE:
return op->type == GGML_TYPE_F32;
default:
+122
View File
@@ -0,0 +1,122 @@
#include "roll.hpp"
#include "common.hpp"
using namespace sycl;
static inline int wrap_add(int i, int shift, int n) {
int s = i + shift;
return (s >= n) ? (s - n) : s;
}
static void kernel_roll_fused_i0_i1(
queue &q,
const float *src_d,
float *dst_d,
int ne0, int ne1, int ne2, int ne3,
int sh0, int sh1, int sh2, int sh3)
{
if (ne0 == 0 || ne1 == 0 || ne2 == 0 || ne3 == 0) return;
const int stride1 = ne0;
const int stride2 = ne0 * ne1;
const int stride3 = ne0 * ne1 * ne2;
const int shNe0 = (ne0 - sh0) % ne0;
const int shNe1 = (ne1 - sh1) % ne1;
const int shNe2 = (ne2 - sh2) % ne2;
const int shNe3 = (ne3 - sh3) % ne3;
const size_t g0 = (size_t) ne3;
const size_t g1 = (size_t) ne2;
const size_t g2 = (size_t) (ne1 * ne0);
const range<3> global{ g0, g1, g2 };
q.submit([&](handler &h) {
h.parallel_for(global, [=](id<3> idx) {
const int i3 = (int) idx[0];
const int i2 = (int) idx[1];
const int fused = (int) idx[2];
const int i1 = fused / ne0;
const int i0 = fused - i1 * ne0; // fused % ne0
const int idx_dst = i0
+ i1 * stride1
+ i2 * stride2
+ i3 * stride3;
const int s0 = wrap_add(i0, shNe0, ne0);
const int s1 = wrap_add(i1, shNe1, ne1);
const int s2 = wrap_add(i2, shNe2, ne2);
const int s3 = wrap_add(i3, shNe3, ne3);
const int idx_src = s0
+ s1 * stride1
+ s2 * stride2
+ s3 * stride3;
dst_d[idx_dst] = src_d[idx_src];
});
});
}
void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const ggml_tensor *src = dst->src[0];
GGML_ASSERT(src && src->type == GGML_TYPE_F32);
const int ne0 = (int) dst->ne[0];
const int ne1 = (int) dst->ne[1];
const int ne2 = (int) dst->ne[2];
const int ne3 = (int) dst->ne[3];
const int32_t *params = (const int32_t *) dst->op_params;
int shift0 = params[0];
int shift1 = params[1];
int shift2 = params[2];
int shift3 = params[3];
if ((shift0 | shift1 | shift2 | shift3) == 0) {
const size_t nb = ggml_nbytes(src);
queue *q = ctx.stream();
SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb)));
return;
}
auto norm = [](int sh, int n) -> int {
if (n <= 0) return 0;
sh %= n;
if (sh < 0) sh += n;
return sh;
};
shift0 = norm(shift0, ne0);
shift1 = norm(shift1, ne1);
shift2 = norm(shift2, ne2);
shift3 = norm(shift3, ne3);
try {
queue *q = ctx.stream();
const float *src_d = (const float *) src->data;
float *dst_d = (float *) dst->data;
GGML_ASSERT(src_d && dst_d);
kernel_roll_fused_i0_i1(
*q, src_d, dst_d,
ne0, ne1, ne2, ne3,
shift0, shift1, shift2, shift3
);
} catch (const std::exception &e) {
std::fprintf(stderr, "[SYCL-ROLL] ERROR: %s\n", e.what());
throw;
}
}
+20
View File
@@ -0,0 +1,20 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#ifndef GGML_SYCL_ROLL_HPP
#define GGML_SYCL_ROLL_HPP
#include "common.hpp"
void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
#endif // GGML_SYCL_ROLL_HPP
+19 -15
View File
@@ -523,7 +523,7 @@ struct vk_device_struct {
vk_pipeline pipeline_add_id_f32;
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32;
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32;
vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32;
vk_pipeline pipeline_sqrt_f32;
@@ -1238,6 +1238,7 @@ struct vk_op_upscale_push_constants {
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
float sf0; float sf1; float sf2; float sf3;
float pixel_offset;
};
struct vk_op_sum_rows_push_constants
@@ -3493,7 +3494,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1);
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@@ -7798,14 +7798,14 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return nullptr;
case GGML_OP_UPSCALE:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
int mode = ggml_get_op_params_i32(dst, 0);
ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(dst, 0) & 0xFF);
switch (mode) {
case GGML_SCALE_MODE_NEAREST:
return ctx->device->pipeline_upscale_nearest_f32;
case GGML_SCALE_MODE_BILINEAR:
return ctx->device->pipeline_upscale_bilinear_f32;
case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS:
return ctx->device->pipeline_upscale_bilinear_ac_f32;
default:
return nullptr;
}
}
return nullptr;
@@ -9294,22 +9294,26 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);
float sf0 = (float)dst->ne[0] / src0->ne[0];
float sf1 = (float)dst->ne[1] / src0->ne[1];
float sf2 = (float)dst->ne[2] / src0->ne[2];
float sf3 = (float)dst->ne[3] / src0->ne[3];
GGML_TENSOR_UNARY_OP_LOCALS
float sf0 = (float)ne0 / ne00;
float sf1 = (float)ne1 / ne01;
float sf2 = (float)ne2 / ne02;
float sf3 = (float)ne3 / ne03;
float pixel_offset = 0.5f;
if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) {
sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
pixel_offset = 0.0f;
}
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
(uint32_t)ggml_nelements(dst), 0, 0,
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1],
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
sf0, sf1, sf2, sf3,
(uint32_t)ne00, (uint32_t)ne01,
(uint32_t)nb00 / src0_type_size, (uint32_t)nb01 / src0_type_size, (uint32_t)nb02 / src0_type_size, (uint32_t)nb03 / src0_type_size,
(uint32_t)ne0, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
sf0, sf1, sf2, sf3, pixel_offset
}, dryrun);
}
@@ -7,6 +7,7 @@ layout (push_constant) uniform parameter
uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13;
float sf0; float sf1; float sf2; float sf3;
float pixel_offset;
} p;
#include "types.glsl"
@@ -19,7 +20,6 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag
#define NEAREST 0
#define BILINEAR 1
#define ALIGN_CORNERS (1 << 8)
layout (constant_id = 0) const uint scale_mode = 0;
@@ -52,7 +52,7 @@ float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) {
float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
const ivec2 ne0 = ivec2(p.ne00, p.ne01);
const vec2 c = (vec2(i10, i11) + 0.5) / vec2(p.sf0, p.sf1) - 0.5;
const vec2 c = (vec2(i10, i11) + p.pixel_offset) / vec2(p.sf0, p.sf1) - p.pixel_offset;
const vec2 c0f = floor(c);
const vec2 d = c - c0f;
const ivec2 c0 = max(ivec2(c0f), 0);
@@ -61,16 +61,6 @@ float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
return fetch_bilinear(c0, c1, d, i12, i13);
}
float interpolate_bilinear_align_corners(uint i10, uint i11, uint i12, uint i13) {
const vec2 c = vec2(i10, i11) / vec2(p.sf0, p.sf1);
const vec2 c0f = floor(c);
const vec2 d = c - c0f;
const ivec2 c0 = ivec2(c0f);
const ivec2 c1 = c0 + 1;
return fetch_bilinear(c0, c1, d, i12, i13);
}
void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
@@ -91,9 +81,6 @@ void main() {
case BILINEAR:
result = interpolate_bilinear(i10, i11, i12, i13);
break;
case BILINEAR | ALIGN_CORNERS:
result = interpolate_bilinear_align_corners(i10, i11, i12, i13);
break;
}
data_d[p.d_offset + idx] = D_TYPE(result);
+1
View File
@@ -3062,6 +3062,7 @@ class VisionProjectorType:
VOXTRAL = "voxtral"
LFM2 = "lfm2"
KIMIVL = "kimivl"
LIGHTONOCR = "lightonocr"
# Items here are (block size, type size)
+16 -10
View File
@@ -15,7 +15,6 @@
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cfloat>
#include <cstring>
#include <cmath>
@@ -438,7 +437,7 @@ struct llama_model::impl {
llama_mlocks mlock_mmaps;
// contexts where the model tensors metadata is stored as well ass the corresponding buffers:
std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
buft_list_t cpu_buft_list;
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -6186,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
ggml_backend_buffer_t buf = nullptr;
std::vector<ggml_backend_buffer_ptr> bufs;
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
// only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6199,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
continue;
}
const size_t max_size = ggml_get_max_tensor_size(ctx);
buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
bufs.emplace_back(buf);
buf_map.emplace(idx, buf);
}
}
else {
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
@@ -6217,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
mlock_buf->init (ggml_backend_buffer_get_base(buf));
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
}
bufs.emplace_back(buf);
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
buf_map.emplace(idx, buf);
}
}
pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
for (auto & buf : buf_map) {
// indicate that this buffer contains weights
@@ -6247,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
// print memory requirements per buffer type
for (auto & [_, buf] : pimpl->ctxs_bufs) {
LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
for (auto & [_, bufs] : pimpl->ctxs_bufs) {
for (auto & buf: bufs) {
LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
__func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
}
}
// populate tensors_by_name
@@ -6300,8 +6304,10 @@ size_t llama_model::n_devices() const {
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [_, buf] : pimpl->ctxs_bufs) {
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
for (const auto & buf : bufs) {
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
}
}
return ret;
}
+45 -13
View File
@@ -511,7 +511,7 @@ struct test_result {
};
// Printer classes for different output formats
enum class test_status_t { NOT_SUPPORTED, OK, FAIL };
enum class test_status_t { NOT_SUPPORTED, OK, FAIL, SKIPPED };
struct test_operation_info {
std::string op_name;
@@ -687,6 +687,8 @@ struct printer {
virtual void print_backend_status(const backend_status_info & info) { (void) info; }
virtual void print_overall_summary(const overall_summary_info & info) { (void) info; }
virtual void print_failed_tests(const std::vector<std::string> & failed_tests) { (void) failed_tests; }
};
struct console_printer : public printer {
@@ -804,6 +806,17 @@ struct console_printer : public printer {
}
}
void print_failed_tests(const std::vector<std::string> & failed_tests) override {
if (failed_tests.empty()) {
return;
}
printf("\nFailing tests:\n");
for (const auto & test_name : failed_tests) {
printf(" %s\n", test_name.c_str());
}
}
private:
void print_test_console(const test_result & result) {
printf(" %s(%s): ", result.op_name.c_str(), result.op_params.c_str());
@@ -1056,6 +1069,8 @@ struct test_case {
std::vector<ggml_tensor *> sentinels;
std::string current_op_name;
void add_sentinel(ggml_context * ctx) {
if (mode == MODE_PERF || mode == MODE_GRAD || mode == MODE_SUPPORT) {
return;
@@ -1127,7 +1142,10 @@ struct test_case {
}
}
bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_names_filter, printer * output_printer) {
test_status_t eval(ggml_backend_t backend1,
ggml_backend_t backend2,
const char * op_names_filter,
printer * output_printer) {
mode = MODE_TEST;
ggml_init_params params = {
@@ -1144,11 +1162,12 @@ struct test_case {
add_sentinel(ctx);
ggml_tensor * out = build_graph(ctx);
std::string current_op_name = op_desc(out);
current_op_name = op_desc(out);
if (!matches_filter(out, op_names_filter)) {
//printf(" %s: skipping\n", op_desc(out).c_str());
ggml_free(ctx);
return true;
return test_status_t::SKIPPED;
}
// check if the backends support the ops
@@ -1172,7 +1191,7 @@ struct test_case {
}
ggml_free(ctx);
return true;
return test_status_t::NOT_SUPPORTED;
}
// post-graph sentinel
@@ -1184,7 +1203,7 @@ struct test_case {
if (buf == NULL) {
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
ggml_free(ctx);
return false;
return test_status_t::FAIL;
}
// build graph
@@ -1289,7 +1308,7 @@ struct test_case {
output_printer->print_test_result(result);
}
return test_passed;
return test_passed ? test_status_t::OK : test_status_t::FAIL;
}
bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
@@ -1306,7 +1325,7 @@ struct test_case {
GGML_ASSERT(ctx);
ggml_tensor * out = build_graph(ctx.get());
std::string current_op_name = op_desc(out);
current_op_name = op_desc(out);
if (!matches_filter(out, op_names_filter)) {
//printf(" %s: skipping\n", op_desc(out).c_str());
return true;
@@ -1435,8 +1454,9 @@ struct test_case {
ggml_context_ptr ctx(ggml_init(params)); // smart ptr
GGML_ASSERT(ctx);
ggml_tensor * out = build_graph(ctx.get());
std::string current_op_name = op_desc(out);
ggml_tensor * out = build_graph(ctx.get());
current_op_name = op_desc(out);
if (!matches_filter(out, op_names_filter)) {
return true;
}
@@ -7029,6 +7049,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {5, 7, 11, 13}, {2, 5, 7, 11}, mode));
}
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS));
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {1, 4, 3, 2}, {2, 8, 3, 2}, GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS));
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {4, 1, 3, 2}, {1, 1, 3, 2}, GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS));
test_cases.emplace_back(new test_sum());
test_cases.emplace_back(new test_sum_rows());
@@ -7360,16 +7382,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
size_t n_ok = 0;
size_t tests_run = 0;
std::vector<std::string> failed_tests;
for (auto & test : test_cases) {
if (test->eval(backend, backend_cpu, op_names_filter, output_printer)) {
test_status_t status = test->eval(backend, backend_cpu, op_names_filter, output_printer);
if (status == test_status_t::SKIPPED || status == test_status_t::NOT_SUPPORTED) {
continue;
}
tests_run++;
if (status == test_status_t::OK) {
n_ok++;
} else if (status == test_status_t::FAIL) {
failed_tests.push_back(test->current_op_name + "(" + test->vars() + ")");
}
}
output_printer->print_summary(test_summary_info(n_ok, test_cases.size(), false));
output_printer->print_summary(test_summary_info(n_ok, tests_run, false));
output_printer->print_failed_tests(failed_tests);
ggml_backend_free(backend_cpu);
return n_ok == test_cases.size();
return n_ok == tests_run;
}
if (mode == MODE_GRAD) {
+2
View File
@@ -139,6 +139,7 @@ enum projector_type {
PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -161,6 +162,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
+23 -3
View File
@@ -621,7 +621,7 @@ struct clip_graph {
}
// arrangement of the [IMG_BREAK] token
{
if (model.token_embd_img_break) {
// not efficient, but works
// the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
@@ -2095,6 +2095,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
res = graph.build_siglip();
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
res = graph.build_pixtral();
} break;
@@ -2380,6 +2381,7 @@ struct clip_model_loader {
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
hparams.rope_theta = 10000.0f;
hparams.warmup_image_size = hparams.patch_size * 8;
@@ -2722,6 +2724,15 @@ struct clip_model_loader {
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
} break;
case PROJECTOR_TYPE_LIGHTONOCR:
{
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
} break;
case PROJECTOR_TYPE_ULTRAVOX:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@@ -3622,7 +3633,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->entries.push_back(std::move(img_f32));
return true;
} else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
} else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL
|| ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR
) {
clip_image_u8 resized_image;
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
@@ -3865,12 +3878,17 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
// dynamic size
int n_merge = params.spatial_merge_size;
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
if (ctx->model.token_embd_img_break) {
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
} else {
n_patches = n_patches_y * n_patches_x;
}
} break;
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX:
@@ -4247,6 +4265,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_KIMIVL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
// set the 2D positions
int n_patches_per_col = image_size_width / patch_size;
@@ -4377,6 +4396,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_model_peg_0_b->ne[0];
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0];
+5
View File
@@ -275,6 +275,11 @@ struct mtmd_context {
img_beg = "<img>";
img_end = "</img>";
} else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
// <|im_start|> ... (image embeddings) ... <|im_end|>
img_beg = "<|im_start|>";
img_end = "<|im_end|>";
}
}
+1
View File
@@ -70,6 +70,7 @@ add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"