Compare commits

...

10 Commits

Author SHA1 Message Date
Xuan-Son Nguyen df8491922f ggml : add mrope kernel for metal (#13457) 2025-05-12 10:29:13 +02:00
Atharva Dubey 14492144c2 enable dpcpp nightly builds with libraries (#13406) 2025-05-12 13:15:32 +08:00
City c104023994 mtmd : Use RMS norm for InternVL 3 38B and 78B mmproj (#13459) 2025-05-12 00:39:06 +02:00
Anthony Umfer 9a390c4829 tools : fix uninitialized llama_batch in server (#13436)
* add constructor to initialize server_context::batch, preventing destructor's call to llama_batch_free from causing an invalid free()

* Update tools/server/server.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* use C++11 initializer syntax

* switch from Copy-list-initialization to Direct-list-initialization

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2025-05-11 17:08:26 +02:00
Sigbjørn Skjæret 09232370fc scripts : exit compare-llama-bench.py gracefully when there's nothing to compare (#13451) 2025-05-11 16:20:39 +02:00
Johannes Gäßler 7474e00b34 CUDA: fix crash with partial offloading of MoE (#13439) 2025-05-11 16:09:33 +02:00
David Huang 7f323a589f Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (#13386) 2025-05-11 14:18:39 +02:00
City 3eac209319 mtmd : support InternVL 3 38B and 78B mmproj (#13443)
* Support InternVL 3 38B and 78B mmproj

* Swap norms in clip.cpp

* Group variables together
2025-05-11 11:35:52 +02:00
Xuan-Son Nguyen a634d75d1b mtmd : move helpers to dedicated file (#13442)
* mtmd : move helpers to dedicated file

* fix windows build

* rm redundant include
2025-05-11 11:34:23 +02:00
Thomas Germer 62d4250e52 docs : Fix typo in InternVL3 model name (#13440) 2025-05-10 22:26:46 +02:00
28 changed files with 637 additions and 350 deletions
+7
View File
@@ -2437,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
));
add_opt(common_arg(
{"--no-op-offload"},
string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
[](common_params & params) {
params.no_op_offload = true;
}
));
add_opt(common_arg(
{"--lora"}, "FNAME",
"path to LoRA adapter (can be repeated to use multiple adapters)",
+1
View File
@@ -1113,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
if (params.reranking) {
cparams.embeddings = true;
+1
View File
@@ -332,6 +332,7 @@ struct common_params {
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
bool no_op_offload = false; // globally disable offload host tensor operations to device
bool single_turn = false; // single turn chat conversation
+2 -2
View File
@@ -69,9 +69,9 @@ NOTE: some models may require large context window, for example: `-c 8192`
# InternVL 2.5 and 3
(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
(tool_name) -hf ggml-org/InternVL2_5-2B-GGUF
(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-4B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
```
+2 -2
View File
@@ -248,7 +248,7 @@ extern "C" {
// preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
// initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
+6 -2
View File
@@ -674,6 +674,8 @@ struct ggml_backend_sched {
char * context_buffer;
size_t context_buffer_size;
bool op_offload;
int debug;
};
@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
ggml_backend_buffer_type_t * bufts,
int n_backends,
size_t graph_size,
bool parallel) {
bool parallel,
bool op_offload) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
}
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
ggml_backend_sched_reset(sched);
+8 -2
View File
@@ -1909,13 +1909,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
// Therefore, in such cases use cuBLAS.
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type)
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool any_gpus_with_slow_fp16 = false;
+2 -2
View File
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
}
}
+2 -2
View File
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
}
}
+4
View File
@@ -207,6 +207,10 @@ typedef struct {
float attn_factor;
float beta_fast;
float beta_slow;
int32_t sect_0;
int32_t sect_1;
int32_t sect_2;
int32_t sect_3;
} ggml_metal_kargs_rope;
typedef struct {
+42 -16
View File
@@ -332,6 +332,10 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
GGML_METAL_KERNEL_TYPE_IM2COL_F16,
@@ -1275,6 +1279,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16, mul_mm_id_iq4_xs_f16, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, rope_multi_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16, rope_multi_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32, rope_vision_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16, rope_vision_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, rope_neox_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
@@ -1637,16 +1645,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_NORM:
return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return true;
}
return true;
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_1D:
@@ -3826,6 +3825,7 @@ static bool ggml_metal_encode_node(
} break;
case GGML_OP_ROPE:
{
// make sure we have one or more position id(ne10) per token(ne02)
GGML_ASSERT(ne10 % ne02 == 0);
GGML_ASSERT(ne10 >= ne02);
@@ -3852,20 +3852,42 @@ static bool ggml_metal_encode_node(
memcpy(&beta_fast, (const int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (const int32_t *) dst->op_params + 10, sizeof(float));
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
// mrope
const int sect_0 = ((const int32_t *) dst->op_params)[11];
const int sect_1 = ((const int32_t *) dst->op_params)[12];
const int sect_2 = ((const int32_t *) dst->op_params)[13];
const int sect_3 = ((const int32_t *) dst->op_params)[14];
id<MTLComputePipelineState> pipeline = nil;
if (!is_neox) {
if (is_neox) {
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
default: GGML_ABORT("fatal error");
};
} else if (is_mrope && !is_vision) {
GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
default: GGML_ABORT("fatal error");
};
} else if (is_vision) {
GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
default: GGML_ABORT("fatal error");
};
} else {
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
default: GGML_ABORT("fatal error");
};
}
@@ -3896,6 +3918,10 @@ static bool ggml_metal_encode_node(
/*.attn_factor =*/ attn_factor,
/*.beta_fast =*/ beta_fast,
/*.beta_slow =*/ beta_slow,
/* sect_0 =*/ sect_0,
/* sect_1 =*/ sect_1,
/* sect_2 =*/ sect_2,
/* sect_3 =*/ sect_3,
};
[encoder setComputePipelineState:pipeline];
+146
View File
@@ -2713,8 +2713,148 @@ kernel void kernel_rope_neox(
}
}
template<typename T>
kernel void kernel_rope_multi(
constant ggml_metal_kargs_rope & args,
device const char * src0,
device const char * src1,
device const char * src2,
device char * dst,
ushort tiitg[[thread_index_in_threadgroup]],
ushort3 tptg [[threads_per_threadgroup]],
uint3 tgpig[[threadgroup_position_in_grid]]) {
const int i3 = tgpig[2];
const int i2 = tgpig[1];
const int i1 = tgpig[0];
float corr_dims[2];
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
device const int32_t * pos = (device const int32_t *) src1;
const float inv_ndims = -1.f/args.n_dims;
float cos_theta;
float sin_theta;
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
if (i0 < args.n_dims) {
const int ic = i0/2;
// mrope theta calculations
// note: the rest is the same as kernel_rope_neox
const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3;
const int sec_w01 = args.sect_0 + args.sect_1; // end of section 1
const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
const int sector = ic % sect_dims;
float theta_base;
if (sector < args.sect_0) {
theta_base = (float) pos[i2];
} else if (sector < sec_w01) {
theta_base = (float) pos[i2 + args.ne02];
} else if (sector < sec_w012) {
theta_base = (float) pos[i2 + args.ne02 * 2];
} else {
theta_base = (float) pos[i2 + args.ne02 * 3];
}
// end of mrope
const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
const float x0 = src[0];
const float x1 = src[args.n_dims/2];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
} else {
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
template<typename T>
kernel void kernel_rope_vision(
constant ggml_metal_kargs_rope & args,
device const char * src0,
device const char * src1,
device const char * src2,
device char * dst,
ushort tiitg[[thread_index_in_threadgroup]],
ushort3 tptg [[threads_per_threadgroup]],
uint3 tgpig[[threadgroup_position_in_grid]]) {
const int i3 = tgpig[2];
const int i2 = tgpig[1];
const int i1 = tgpig[0];
float corr_dims[2];
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
device const int32_t * pos = (device const int32_t *) src1;
const float inv_ndims = -1.f/args.n_dims;
float cos_theta;
float sin_theta;
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
if (i0 < 2*args.n_dims) { // different from kernel_rope_multi
const int ic = i0/2;
// mrope theta calculations (only support 2 dimensions)
const int sect_dims = args.sect_0 + args.sect_1;
const int sector = ic % sect_dims;
float p;
float theta_base;
if (sector < args.sect_1) {
p = (float) sector;
theta_base = (float) pos[i2];
} else {
p = (float) sector - args.sect_0;
theta_base = (float) pos[i2 + args.ne02];
}
const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p);
// end of mrope
const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
const float x0 = src[0];
const float x1 = src[args.n_dims]; // different from kernel_rope_multi
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi
} else {
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
@@ -2722,6 +2862,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
typedef void (im2col_t)(
device const float * x,
device char * dst,
+5 -3
View File
@@ -52,9 +52,8 @@ target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
find_package(DNNL)
set(GGML_SYCL_DNNL 0)
if(DNNL_FOUND)
if (DEFINED ENV{ONEAPI_ROOT} AND NOT DEFINED DNNL_GPU_VENDOR)
# Assuming oneDNN packaged with oneapi release is used which
# supports only intel target
if (NOT DEFINED DNNL_GPU_VENDOR)
# default to intel target
set(DNNL_GPU_VENDOR "INTEL")
if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
@@ -108,6 +107,9 @@ endif()
if (GGML_SYCL_TARGET STREQUAL "INTEL")
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
# See https://github.com/uxlfoundation/oneMath/issues/654
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(SYCL_COMPILER ON)
endif()
find_package(MKL REQUIRED)
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
+6
View File
@@ -483,7 +483,9 @@ class MODEL_TENSOR(IntEnum):
V_ENC_EMBD_PATCH = auto()
V_ENC_EMBD_POS = auto()
V_ENC_ATTN_Q = auto()
V_ENC_ATTN_Q_NORM = auto()
V_ENC_ATTN_K = auto()
V_ENC_ATTN_K_NORM = auto()
V_ENC_ATTN_V = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_OUTPUT = auto()
@@ -742,7 +744,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out",
@@ -782,7 +786,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_ATTN_Q,
MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
MODEL_TENSOR.V_ENC_ATTN_K,
MODEL_TENSOR.V_ENC_ATTN_K_NORM,
MODEL_TENSOR.V_ENC_ATTN_V,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_OUTPUT,
+8
View File
@@ -938,6 +938,10 @@ class TensorNameMap:
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
),
MODEL_TENSOR.V_ENC_ATTN_K: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vpm.encoder.layers.{bid}.self_attn.k_proj",
@@ -946,6 +950,10 @@ class TensorNameMap:
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
),
MODEL_TENSOR.V_ENC_ATTN_V: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"vpm.encoder.layers.{bid}.self_attn.v_proj",
+1
View File
@@ -363,6 +363,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
};
// model quantization parameters
+5 -1
View File
@@ -318,7 +318,7 @@ else:
show = []
# Show CPU and/or GPU by default even if the hardware for all results is the same:
if "n_gpu_layers" not in properties_different:
if rows_full and "n_gpu_layers" not in properties_different:
ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])
if ngl != 99 and "cpu_info" not in properties_different:
@@ -338,6 +338,10 @@ else:
pass
rows_show = get_rows(show)
if not rows_show:
logger.error(f"No comparable data was found between {name_baseline} and {name_compare}.\n")
sys.exit(1)
table = []
for row in rows_show:
n_prompt = int(row[-5])
+3 -1
View File
@@ -93,6 +93,7 @@ llama_context::llama_context(
}
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
cparams.op_offload = params.op_offload;
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -243,7 +244,7 @@ llama_context::llama_context(
}
}
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));
if (pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
@@ -1871,6 +1872,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.op_offload =*/ true,
};
return result;
+1
View File
@@ -30,6 +30,7 @@ struct llama_cparams {
bool flash_attn;
bool no_perf;
bool warmup;
bool op_offload;
enum llama_pooling_type pooling_type;
+1 -1
View File
@@ -853,7 +853,7 @@ int main(void) {
backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());
ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false);
backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true);
printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i]));
printf(" Device description: %s\n", ggml_backend_dev_description(devs[i]));
+33 -2
View File
@@ -219,6 +219,7 @@ struct cmd_params {
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
std::vector<bool> no_op_offload;
ggml_numa_strategy numa;
int reps;
ggml_sched_priority prio;
@@ -253,6 +254,7 @@ static const cmd_params cmd_params_defaults = {
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
/* use_mmap */ { true },
/* embeddings */ { false },
/* no_op_offload */ { false },
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -311,6 +313,7 @@ static void print_usage(int /* argc */, char ** argv) {
join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
printf(" -nopo, --no-op-offload <i> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
@@ -588,6 +591,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<bool>(argv[i], split_delim);
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-nopo" || arg == "--no-op-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
@@ -794,6 +804,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.embeddings.empty()) {
params.embeddings = cmd_params_defaults.embeddings;
}
if (params.no_op_offload.empty()) {
params.no_op_offload = cmd_params_defaults.no_op_offload;
}
if (params.n_threads.empty()) {
params.n_threads = cmd_params_defaults.n_threads;
}
@@ -833,6 +846,7 @@ struct cmd_params_instance {
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool use_mmap;
bool embeddings;
bool no_op_offload;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -902,6 +916,7 @@ struct cmd_params_instance {
cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn = flash_attn;
cparams.embeddings = embeddings;
cparams.op_offload = !no_op_offload;
return cparams;
}
@@ -921,6 +936,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & ot : params.tensor_buft_overrides)
for (const auto & mmp : params.use_mmap)
for (const auto & embd : params.embeddings)
for (const auto & nopo : params.no_op_offload)
for (const auto & nb : params.n_batch)
for (const auto & nub : params.n_ubatch)
for (const auto & tk : params.type_k)
@@ -959,6 +975,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
};
instances.push_back(instance);
}
@@ -990,6 +1007,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
};
instances.push_back(instance);
}
@@ -1021,6 +1039,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
};
instances.push_back(instance);
}
@@ -1056,6 +1075,7 @@ struct test {
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool use_mmap;
bool embeddings;
bool no_op_offload;
int n_prompt;
int n_gen;
int n_depth;
@@ -1089,6 +1109,7 @@ struct test {
tensor_buft_overrides = inst.tensor_buft_overrides;
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
no_op_offload = inst.no_op_offload;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
n_depth = inst.n_depth;
@@ -1134,7 +1155,7 @@ struct test {
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
"use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
};
return fields;
@@ -1146,7 +1167,7 @@ struct test {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
field == "avg_ns" || field == "stddev_ns") {
field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
return INT;
}
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1222,6 +1243,7 @@ struct test {
tensor_buft_overrides_str,
std::to_string(use_mmap),
std::to_string(embeddings),
std::to_string(no_op_offload),
std::to_string(n_prompt),
std::to_string(n_gen),
std::to_string(n_depth),
@@ -1404,6 +1426,9 @@ struct markdown_printer : public printer {
if (field == "test") {
return 15;
}
if (field == "no_op_offload") {
return 4;
}
int width = std::max((int) field.length(), 10);
@@ -1435,6 +1460,9 @@ struct markdown_printer : public printer {
if (field == "embeddings") {
return "embd";
}
if (field == "no_op_offload") {
return "nopo";
}
if (field == "tensor_split") {
return "ts";
}
@@ -1503,6 +1531,9 @@ struct markdown_printer : public printer {
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
fields.emplace_back("embeddings");
}
if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
fields.emplace_back("no_op_offload");
}
fields.emplace_back("test");
fields.emplace_back("t/s");
+1
View File
@@ -28,6 +28,7 @@ endif()
add_library(mtmd OBJECT
mtmd.cpp
mtmd-helper.cpp
mtmd.h
clip.cpp
clip.h
+2
View File
@@ -53,6 +53,8 @@
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
+23 -2
View File
@@ -205,6 +205,9 @@ struct clip_layer {
ggml_tensor * o_w = nullptr;
ggml_tensor * o_b = nullptr;
ggml_tensor * k_norm = nullptr;
ggml_tensor * q_norm = nullptr;
// layernorm 1
ggml_tensor * ln_1_w = nullptr;
ggml_tensor * ln_1_b = nullptr;
@@ -380,7 +383,7 @@ struct clip_ctx {
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
);
}
@@ -876,9 +879,15 @@ struct clip_graph {
// add CLS token
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
// The larger models use a different ViT, which uses RMS norm instead of layer norm
// ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
: NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
ggml_tensor * cur = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
norm_t,
hparams.ffn_op,
model.position_embeddings,
nullptr);
@@ -1363,6 +1372,16 @@ private:
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
@@ -1988,6 +2007,8 @@ struct clip_model_loader {
layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
+310
View File
@@ -0,0 +1,310 @@
#include "mtmd.h"
#include "llama.h"
#include <algorithm>
#include <cinttypes>
#include <vector>
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
size_t n_tokens = 0;
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
auto chunk = mtmd_input_chunks_get(chunks, i);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens_text;
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
n_tokens += n_tokens_text;
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
} else {
GGML_ASSERT(false && "chunk type not supported");
}
}
return n_tokens;
}
llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
llama_pos n_pos = 0;
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
auto chunk = mtmd_input_chunks_get(chunks, i);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens_text;
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
n_pos += n_tokens_text;
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
} else {
GGML_ASSERT(false && "chunk type not supported");
}
}
return n_pos;
}
// helper struct to make working with embd batch easier
// note: this will be removed after llama_batch_ext refactoring
struct decode_embd_batch {
int n_pos_per_embd;
int n_mmproj_embd;
std::vector<llama_pos> pos;
std::vector<llama_pos> pos_view; // used by mrope
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
}
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
seq_id_0[0] = seq_id;
for (int i = 0; i < batch.n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
seq_id_0[0] = seq_id;
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
int i = y * nx + x;
pos[i ] = pos_0;
pos[i + batch.n_tokens ] = pos_0 + y;
pos[i + batch.n_tokens * 2] = pos_0 + x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
}
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
llama_batch get_view(int offset, int n_tokens) {
llama_pos * pos_ptr;
pos_view.clear();
pos_view.reserve(n_tokens * n_pos_per_embd);
if (n_pos_per_embd > 1) {
// mrope
// for example, with layout of src: 1234...1234...1234...1234...
// offset 2 will give us dst: 34...34...34...34...
for (int i = 0; i < n_pos_per_embd; i++) {
// assume n_tokens is less than or equal to batch.n_tokens
// batch.n_tokens is number of **total** tokens
// n_tokens is number of viewed token
size_t src_idx = i * batch.n_tokens + offset;
pos_view.insert(pos_view.end(),
pos.data() + src_idx,
pos.data() + src_idx + n_tokens);
}
pos_ptr = pos_view.data();
} else {
// normal
pos_ptr = pos.data() + offset;
}
return {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ batch.embd + offset * n_mmproj_embd,
/*pos =*/ pos_ptr,
/*n_seq_id =*/ batch.n_seq_id + offset,
/*seq_id =*/ batch.seq_id + offset,
/*logits =*/ batch.logits + offset,
};
}
};
// Helper function for decoding an image whose embeddings have already been calculated
int32_t mtmd_helper_decode_image_chunk(
mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
float * encoded_embd,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past) {
if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
return -1;
}
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
if (!image_tokens) {
LOG_ERR("failed to decode image chunk: image tokens are null\n");
return -1;
}
const llama_model * model = llama_get_model(lctx);
int n_mmproj_embd = llama_model_n_embd(model);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
if (mtmd_decode_use_mrope(ctx)) {
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
}
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, false);
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
}
while (i_batch < n_img_batches) { // split into batches
int pos_offset = i_batch*n_batch;
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
int64_t t1 = ggml_time_ms();
int32_t ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_set_causal_attn(lctx, true); // restore causal attn
return ret;
}
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
i_batch++;
}
n_past += mtmd_image_tokens_get_n_pos(image_tokens);
*new_n_past = n_past;
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, true);
}
return 0;
}
int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past) {
int32_t ret;
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens;
const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
// LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
size_t i = 0;
while (i < n_tokens) { // split into batches
text_batch.n_tokens = 0; // clear the batch
for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
text_batch.n_tokens++;
text_batch.token [i] = tokens[i];
text_batch.pos [i] = n_past++;
text_batch.n_seq_id[i] = 1;
text_batch.seq_id [i][0] = seq_id;
text_batch.logits [i] = false;
}
bool is_last_token = (i == n_tokens);
if (logits_last && is_last_token) {
text_batch.logits[text_batch.n_tokens - 1] = true;
}
ret = llama_decode(lctx, text_batch);
if (ret != 0) {
LOG_ERR("failed to decode text\n");
llama_batch_free(text_batch);
return ret;
}
*new_n_past += text_batch.n_tokens;
}
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
int64_t t0 = ggml_time_ms();
LOG_INF("encoding image or slice...\n");
ret = mtmd_encode(ctx, image_tokens);
if (ret != 0) {
LOG_ERR("failed to encode image\n");
llama_batch_free(text_batch);
return ret;
}
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
float * embd = mtmd_get_output_embd(ctx);
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_batch_free(text_batch);
return ret;
}
} else {
GGML_ABORT("chunk type not supported");
}
return 0;
}
int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunks * chunks,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past) {
size_t n_chunks = mtmd_input_chunks_size(chunks);
if (n_chunks == 0) {
LOG_ERR("no chunks to eval\n");
return 0;
}
for (size_t i = 0; i < n_chunks; i++) {
bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
auto chunk = mtmd_input_chunks_get(chunks, i);
int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
if (res != 0) {
LOG_ERR("failed to eval chunk %zu\n", i);
return res;
}
*new_n_past = n_past;
}
return 0;
}
+13 -311
View File
@@ -461,307 +461,26 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
return ctx->image_embd_v.data();
}
size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
size_t n_tokens = 0;
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
auto chunk = mtmd_input_chunks_get(chunks, i);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens_text;
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
n_tokens += n_tokens_text;
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
} else {
GGML_ASSERT(false && "chunk type not supported");
}
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
return true;
}
return n_tokens;
return false;
}
llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
llama_pos n_pos = 0;
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
auto chunk = mtmd_input_chunks_get(chunks, i);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens_text;
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
n_pos += n_tokens_text;
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
} else {
GGML_ASSERT(false && "chunk type not supported");
}
}
return n_pos;
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
return ctx->use_mrope;
}
// helper struct to make working with embd batch easier
// note: this will be removed after llama_batch_ext refactoring
struct decode_embd_batch {
int n_pos_per_embd;
int n_mmproj_embd;
std::vector<llama_pos> pos;
std::vector<llama_pos> pos_view; // used by mrope
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
}
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
seq_id_0[0] = seq_id;
for (int i = 0; i < batch.n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
seq_id_0[0] = seq_id;
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
int i = y * nx + x;
pos[i ] = pos_0;
pos[i + batch.n_tokens ] = pos_0 + y;
pos[i + batch.n_tokens * 2] = pos_0 + x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
}
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
llama_batch get_view(int offset, int n_tokens) {
llama_pos * pos_ptr;
pos_view.clear();
pos_view.reserve(n_tokens * n_pos_per_embd);
if (n_pos_per_embd > 1) {
// mrope
// for example, with layout of src: 1234...1234...1234...1234...
// offset 2 will give us dst: 34...34...34...34...
for (int i = 0; i < n_pos_per_embd; i++) {
// assume n_tokens is less than or equal to batch.n_tokens
// batch.n_tokens is number of **total** tokens
// n_tokens is number of viewed token
size_t src_idx = i * batch.n_tokens + offset;
pos_view.insert(pos_view.end(),
pos.data() + src_idx,
pos.data() + src_idx + n_tokens);
}
pos_ptr = pos_view.data();
} else {
// normal
pos_ptr = pos.data() + offset;
}
return {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ batch.embd + offset * n_mmproj_embd,
/*pos =*/ pos_ptr,
/*n_seq_id =*/ batch.n_seq_id + offset,
/*seq_id =*/ batch.seq_id + offset,
/*logits =*/ batch.logits + offset,
};
}
};
// Helper function for decoding an image whose embeddings have already been calculated
int32_t mtmd_helper_decode_image_chunk(
mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
float * encoded_embd,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past) {
if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
return -1;
}
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
if (!image_tokens) {
LOG_ERR("failed to decode image chunk: image tokens are null\n");
return -1;
}
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
if (mtmd_decode_use_mrope(ctx)) {
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
}
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, false);
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
}
while (i_batch < n_img_batches) { // split into batches
int pos_offset = i_batch*n_batch;
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
int64_t t1 = ggml_time_ms();
int32_t ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_set_causal_attn(lctx, true); // restore causal attn
return ret;
}
if (ctx->print_timings) {
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
}
i_batch++;
}
n_past += mtmd_image_tokens_get_n_pos(image_tokens);
*new_n_past = n_past;
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, true);
}
return 0;
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
mtmd_image_tokens_free(val);
}
int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past) {
int32_t ret;
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens;
const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
LOG_DBG("decoding text chunk, n_tokens = %zu\n", n_tokens);
size_t i = 0;
while (i < n_tokens) { // split into batches
text_batch.n_tokens = 0; // clear the batch
for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
text_batch.n_tokens++;
text_batch.token [i] = tokens[i];
text_batch.pos [i] = n_past++;
text_batch.n_seq_id[i] = 1;
text_batch.seq_id [i][0] = seq_id;
text_batch.logits [i] = false;
}
bool is_last_token = (i == n_tokens);
if (logits_last && is_last_token) {
text_batch.logits[text_batch.n_tokens - 1] = true;
}
ret = llama_decode(lctx, text_batch);
if (ret != 0) {
LOG_ERR("failed to decode text\n");
llama_batch_free(text_batch);
return ret;
}
*new_n_past += text_batch.n_tokens;
}
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
int64_t t0 = ggml_time_ms();
if (ctx->print_timings) {
LOG_INF("encoding image or slice...\n");
}
ret = mtmd_encode(ctx, image_tokens);
if (ret != 0) {
LOG_ERR("failed to encode image\n");
llama_batch_free(text_batch);
return ret;
}
if (ctx->print_timings) {
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
}
float * embd = mtmd_get_output_embd(ctx);
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_batch_free(text_batch);
return ret;
}
} else {
GGML_ABORT("chunk type not supported");
}
return 0;
}
int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunks * chunks,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past) {
size_t n_chunks = mtmd_input_chunks_size(chunks);
if (n_chunks == 0) {
LOG_WRN("no chunks to eval\n");
return 0;
}
for (size_t i = 0; i < n_chunks; i++) {
bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
auto chunk = mtmd_input_chunks_get(chunks, i);
int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
if (res != 0) {
LOG_ERR("failed to eval chunk %zu\n", i);
return res;
}
*new_n_past = n_past;
}
return 0;
}
// these 2 helpers below use internal clip_image_u8_ptr,
// so unfortunately they cannot moved to mtmd-helper.h
// however, in theory, user can decode image file to bitmap using
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
clip_image_u8_ptr img_u8(clip_image_u8_init());
@@ -787,23 +506,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
return mtmd_bitmap_init(nx, ny, data);
}
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
return true;
}
return false;
}
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
return ctx->use_mrope;
}
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
mtmd_image_tokens_free(val);
}
//
// public API functions
//
+1
View File
@@ -10,6 +10,7 @@
#include <stdbool.h>
#ifdef __cplusplus
#include <string>
#include <vector>
#include <cinttypes>
#include <memory>
+1 -1
View File
@@ -1862,7 +1862,7 @@ struct server_context {
llama_context_params cparams_dft;
llama_batch batch;
llama_batch batch {};
bool clean_kv_cache = true;
bool add_bos_token = true;