context : restore preemptive sched reset when LLAMA_SET_ROWS=0 (#14870 )

ggml-ci
mtmd : fix 32-bit narrowing issue in export-lora and mtmd clip (#14503 )
2026-06-30 17:47:40 +02:00 · 2025-07-25 14:28:06 +03:00 · 2025-07-25 13:08:04 +02:00 · 2025-07-25 12:17:02 +02:00 · 2025-07-25 11:07:26 +03:00 · 2025-07-24 20:05:37 +01:00
29 changed files with 262 additions and 139 deletions
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.0.1
+ARG MUSA_VERSION=rc4.2.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

@@ -515,7 +515,7 @@ jobs:

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64

    steps:
      - name: Clone
@@ -54,7 +54,7 @@ docker run --privileged -it \
    -v $HOME/llama.cpp/ci-cache:/ci-cache \
    -v $HOME/llama.cpp/ci-results:/ci-results \
    -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
 ```

 Inside the container, execute the following commands:
@@ -6486,7 +6486,7 @@ class JaisModel(TextModel):
        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


-@ModelBase.register("Glm4ForCausalLM")
+@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 class Glm4Model(TextModel):
    model_arch = gguf.MODEL_ARCH.GLM4

@@ -6508,7 +6508,8 @@ class Glm4Model(TextModel):

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
-        rope_dim = self.hparams["head_dim"]
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
        rope_scaling = self.hparams.get("rope_scaling") or {}
        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
@@ -6516,6 +6517,13 @@ class Glm4Model(TextModel):
            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."): # ignore visual part of Glm4v
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "") # for Glm4v
+        return super().modify_tensors(data_torch, name, bid)
+

@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(TextModel):
@@ -68,6 +68,9 @@ cmake --build build --config Release
      cmake --build build-x64-windows-llvm-release
      ```
 - Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
+  - **Debian / Ubuntu:** `sudo apt-get install libcurl4-openssl-dev`  # (or `libcurl4-gnutls-dev` if you prefer GnuTLS)
+  - **Fedora / RHEL / Rocky / Alma:** `sudo dnf install libcurl-devel`
+  - **Arch / Manjaro:** `sudo pacman -S curl`  # includes libcurl headers

 ## BLAS Build

@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment

 The defaults are:

- `MUSA_VERSION` set to `rc4.0.1`
+- `MUSA_VERSION` set to `rc4.2.0`

 The resulting images, are essentially the same as the non-MUSA images:

@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental,
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12   "ggml: enable rocWMMA FlashAttention on GFX12"    OFF)
+option(GGML_MUSA_GRAPHS                     "ggml: use MUSA graph, experimental, unstable"    OFF)
+option(GGML_MUSA_MUDNN_COPY                 "ggml: enable muDNN for accelerated copy"         OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
@@ -1,12 +1,108 @@
+@PACKAGE_INIT@

@GGML_VARIABLES_EXPANDED@

-@PACKAGE_INIT@
+# Find all dependencies before creating any target.
+include(CMakeFindDependencyMacro)
+find_dependency(Threads)
+if (NOT GGML_SHARED_LIB)
+    set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
+    set(GGML_CPU_INTERFACE_LINK_OPTIONS   "")
+
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate)
+        if(NOT ACCELERATE_FRAMEWORK)
+            set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
+            return()
+        endif()
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
+    endif()
+
+    if (GGML_OPENMP_ENABLED)
+        find_dependency(OpenMP)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    endif()
+
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind)
+        if(NOT memkind)
+            set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
+            return()
+        endif()
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
+    endif()
+
+    if (GGML_BLAS)
+        find_dependency(BLAS)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
+        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS   ${BLAS_LINKER_FLAGS})
+    endif()
+
+    if (GGML_CUDA)
+        set(GGML_CUDA_INTERFACE_LINK_LIBRARIES "")
+        find_dependency(CUDAToolkit)
+        if (GGML_STATIC)
+            list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cudart_static>)
+            if (WIN32)
+                list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas> $<LINK_ONLY:CUDA::cublasLt>)
+            else()
+                list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas_static> $<LINK_ONLY:CUDA::cublasLt_static>)
+            endif()
+        endif()
+        if (NOT GGML_CUDA_NO_VMM)
+            list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cuda_driver>)
+        endif()
+    endif()
+
+    if (GGML_METAL)
+        find_library(FOUNDATION_LIBRARY Foundation)
+        find_library(METAL_FRAMEWORK    Metal)
+        find_library(METALKIT_FRAMEWORK MetalKit)
+        if(NOT FOUNDATION_LIBRARY OR NOT METAL_FRAMEWORK OR NOT METALKIT_FRAMEWORK)
+            set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
+            return()
+        endif()
+        set(GGML_METAL_INTERFACE_LINK_LIBRARIES
+            ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
+    endif()
+
+    if (GGML_OPENCL)
+        find_dependency(OpenCL)
+        set(GGML_OPENCL_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:OpenCL::OpenCL>)
+    endif()
+
+    if (GGML_VULKAN)
+        find_dependency(Vulkan)
+        set(GGML_VULKAN_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:Vulkan::Vulkan>)
+    endif()
+
+    if (GGML_HIP)
+        find_dependency(hip)
+        find_dependency(hipblas)
+        find_dependency(rocblas)
+        set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
+    endif()
+
+    if (GGML_SYCL)
+        set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "")
+        find_package(DNNL)
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
+        endif()
+        if (WIN32)
+            find_dependency(IntelSYCL)
+            find_dependency(MKL)
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        endif()
+    endif()
+endif()

 set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
 set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
 #set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")

+if(NOT TARGET ggml::ggml)
+
 find_package(Threads REQUIRED)

 find_library(GGML_LIBRARY ggml
@@ -29,66 +125,6 @@ set_target_properties(ggml::ggml-base
    PROPERTIES
        IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")

-if (NOT GGML_SHARED_LIB)
-    if (APPLE AND GGML_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
-    endif()
-
-    if (GGML_OPENMP)
-        find_package(OpenMP REQUIRED)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-    endif()
-
-    if (GGML_CPU_HBM)
-        find_library(memkind memkind REQUIRED)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
-    endif()
-
-    if (GGML_BLAS)
-        find_package(BLAS REQUIRED)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
-        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS   ${BLAS_LINKER_FLAGS})
-    endif()
-
-    if (GGML_CUDA)
-        find_package(CUDAToolkit REQUIRED)
-    endif()
-
-    if (GGML_METAL)
-        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-        find_library(METAL_FRAMEWORK    Metal REQUIRED)
-        find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-
-        list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
-                    ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
-    endif()
-
-    if (GGML_VULKAN)
-        find_package(Vulkan REQUIRED)
-        list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
-    endif()
-
-    if (GGML_HIP)
-        find_package(hip     REQUIRED)
-        find_package(hipblas REQUIRED)
-        find_package(rocblas REQUIRED)
-        list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
-    endif()
-
-    if (GGML_SYCL)
-        find_package(DNNL)
-        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
-            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
-        endif()
-        if (WIN32)
-            find_package(IntelSYCL REQUIRED)
-            find_package(MKL       REQUIRED)
-            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
-        endif()
-    endif()
-endif()
-
 set(_ggml_all_targets "")
 foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
    string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
@@ -149,4 +185,6 @@ set_target_properties(ggml::all
    PROPERTIES
        INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")

+endif() # TARGET ggml::ggml
+
 check_required_components(ggml)
@@ -647,6 +647,7 @@ struct ggml_backend_sched {
    // pipeline parallelism support
    int n_copies;
    int cur_copy;
+    int next_copy;
    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
    int n_graph_inputs;
@@ -1433,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
        }
    }

-    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
-
    return GGML_STATUS_SUCCESS;
 }

@@ -1535,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

-    ggml_backend_sched_split_graph(sched, measure_graph);
-
    ggml_backend_sched_synchronize(sched);

+    ggml_backend_sched_split_graph(sched, measure_graph);
+
    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
        return false;
    }
@@ -1550,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+    GGML_ASSERT(!sched->is_alloc);
+
+    sched->cur_copy = sched->next_copy;
+    sched->next_copy = (sched->next_copy + 1) % sched->n_copies;

    ggml_backend_sched_split_graph(sched, graph);

@@ -1590,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
        // if the graph is not already allocated, always use copy 0 after a synchronization
        // this ensures that during generation the same copy is used every time,
        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
-        sched->cur_copy = 0;
+        sched->next_copy = 0;
    }
 }

@@ -70,10 +70,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
    if (GGML_OPENMP)
        find_package(OpenMP)
        if (OpenMP_FOUND)
+            set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)

            target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
        else()
+            set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
            message(WARNING "OpenMP not found")
        endif()
    endif()
@@ -14,7 +14,6 @@
 #include <cmath>
 #include <cstring>
 #include <cassert>
-#include <cstdlib> // for qsort
 #include <cstdio>  // for GGML_ASSERT

 #include "repack.h"
@@ -765,7 +765,7 @@ struct ggml_tensor_extra_gpu {
 };


-#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
+#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif

@@ -1,9 +1,9 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
 #include "cpy-utils.cuh"
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
 #include "ggml-musa/mudnn.cuh"
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY

 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 // Copy destination pointers to GPU to be available when pointer indirection is in use

 void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
    if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
        CUDA_CHECK(cudaStreamSynchronize(stream));
        if (cuda_graph->dest_ptrs_d != nullptr) {
@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg

    char ** dest_ptrs_d = nullptr;
    int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
        dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
        graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
        if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
            CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
        } else
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
        {
            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
        }
@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
    }
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
        ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
    }
@@ -13,7 +13,7 @@
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
 #define CUDA_R_16F  MUSA_R_16F
 #define CUDA_R_16BF MUSA_R_16BF
 #define CUDA_R_32F  MUSA_R_32F
@@ -29,7 +29,7 @@
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
 #define cublasOperation_t mublasOperation_t
-#define cublasGetStatusString mublasStatus_to_string
+#define cublasGetStatusString mublasGetStatusString
 #define cudaDataType_t musaDataType_t
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
@@ -1955,6 +1955,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
 static int ggml_metal_encode_node(
                        ggml_backend_t   backend,
                                   int   idx,
+                                   int   idx_end,
          id<MTLComputeCommandEncoder>   encoder,
            struct ggml_metal_mem_pool * mem_pool) {
    struct ggml_backend_metal_context        * ctx     = backend->context;
@@ -2181,7 +2182,9 @@ static int ggml_metal_encode_node(
                    size_t offs_fuse;
                    id<MTLBuffer> id_fuse;

-                    for (n_fuse = 0; n_fuse <= 6; ++n_fuse) {
+                    // note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing nodes
+                    //       across splits. idx_end indicates the last node in the current split
+                    for (n_fuse = 0; n_fuse <= 6 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
                        if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
                            break;
                        }
@@ -4288,7 +4291,7 @@ static int ggml_metal_encode_node(
                    ops[1] = GGML_OP_MUL;
                    ops[2] = GGML_OP_ADD;

-                    for (n_fuse = 0; n_fuse <= 1; ++n_fuse) {
+                    for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
                        if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
                            break;
                        }
@@ -6271,7 +6274,11 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
            }

-            const int res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
+            const int res = ggml_metal_encode_node(backend, idx, node_end, encoder, mem_pool);
+            if (idx + res > node_end) {
+                GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
+                        "https://github.com/ggml-org/llama.cpp/pull/14849");
+            }

            if (should_capture) {
                [encoder popDebugGroup];
@@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND)
    list(APPEND GGML_SOURCES_MUSA ${SRCS})
    file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
    list(APPEND GGML_SOURCES_MUSA ${SRCS})
-    file(GLOB   SRCS "../ggml-musa/*.cu")
-    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+
+    if (GGML_MUSA_MUDNN_COPY)
+        file(GLOB   SRCS "../ggml-musa/*.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        add_compile_definitions(GGML_MUSA_MUDNN_COPY)
+    endif()

    if (GGML_CUDA_FA_ALL_QUANTS)
        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND)
    add_compile_definitions(GGML_USE_MUSA)
    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})

+    if (GGML_MUSA_GRAPHS)
+        add_compile_definitions(GGML_MUSA_GRAPHS)
+    endif()
+
    if (GGML_CUDA_FORCE_MMQ)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()
@@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND)
    endif()

    if (GGML_STATIC)
-        # TODO: mudnn has not provided static libraries yet
        target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
+        # TODO: mudnn has not provided static libraries yet
+        # if (GGML_MUSA_MUDNN_COPY)
+        #     target_link_libraries(ggml-musa PRIVATE mudnn_static)
+        # endif()
    else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+        if (GGML_MUSA_MUDNN_COPY)
+            target_link_libraries(ggml-musa PRIVATE mudnn)
+        endif()
    endif()

    if (GGML_CUDA_NO_VMM)
@@ -1055,7 +1055,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
    GGML_ASSERT(ctx_ptr != nullptr);
    ggml_context * ctx = ctx_ptr.get();
    ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
-    if (tensor == nullptr) {
+    if (tensor == nullptr || tensor->buffer == nullptr) {
        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        return false;
    }
@@ -1124,7 +1124,7 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
    GGML_ASSERT(ctx_ptr != nullptr);
    ggml_context * ctx = ctx_ptr.get();
    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
-    if (tensor == nullptr) {
+    if (tensor == nullptr || tensor->buffer == nullptr) {
        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        return false;
    }
@@ -1192,7 +1192,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
    GGML_ASSERT(ctx_ptr != nullptr);
    ggml_context * ctx = ctx_ptr.get();
    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
-    if (tensor == nullptr) {
+    if (tensor == nullptr || tensor->buffer == nullptr) {
        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        return false;
    }
@@ -1229,7 +1229,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co

    ggml_tensor * src = deserialize_tensor(ctx, &request.src);
    ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
-    if (src == nullptr || dst == nullptr) {
+    if (src == nullptr || dst == nullptr || src->buffer == nullptr || dst->buffer == nullptr) {
        GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
        return false;
    }
@@ -3531,7 +3531,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
                stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));

            const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
-            assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
+            assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);

            {
                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
@@ -48,11 +48,11 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
    };

    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
-        return { block_index * (traits::qk / traits::qr), 0 };
+        return { block_index * (QK4_0 / QR4_0), 0 };
    }

    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
-        return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
+        return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 };
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
@@ -71,14 +71,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
    }

    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
-        auto nblocks = (nrows * (ncols / traits::qk));
-        return { nblocks * (QK_K / 2),
+        auto nblocks = (nrows * (ncols / QK_K));
+        return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE),
                 (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
-
-    constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
 };

 template <> struct block_q_t<GGML_TYPE_Q6_K> {
@@ -90,22 +88,23 @@ template <> struct block_q_t<GGML_TYPE_Q6_K> {
    };

    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
-        auto low_bits_index  = block_index * (traits::qk / traits::qr);
+        auto low_bits_index  = block_index * (QK_K / QR6_K);
        // the index of high bits it's after all low bits
        auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
        return { low_bits_index, high_bits_index };
    }

    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
-        auto nblocks        = (nrows * (ncols / traits::qk));
+        auto nblocks        = (nrows * (ncols / QK_K));
        auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
        auto block_scales   = total_qs_bytes + block_index * (QK_K / 16);
-        auto sb_scale       = total_qs_bytes + nblocks * (QK_K / 16);
+        auto sb_scale       = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half);
        return { block_scales, sb_scale };
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
 };
+
 }  // namespace ggml_sycl_reordered

 #endif  // GGML_SYCL_QUANTS_HPP
@@ -350,11 +350,9 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
                                     const sycl::half2 * q8_1_ds, const int & iqs) {
-        const int ib = ibx_offset.first / (QK_K / 2);
-
        const uint8_t *    base           = static_cast<const uint8_t *>(vbq);
        const uint8_t *    qs             = base + ibx_offset.first;
-        const uint8_t *    scs            = base + d_offset.first + ib * K_SCALE_SIZE;
+        const uint8_t *    scs            = base + d_offset.first;
        const ggml_half2 * dms            = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);

        const int        bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
@@ -427,13 +425,11 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds,
                     const int iqs) {
-        const int ib = ibx_offset.first / (QK_K / 2);
-
        const uint8_t *   base   = static_cast<const uint8_t *>(vbq);
        const uint8_t *   ql     = base + ibx_offset.first;
        const uint8_t *   qh     = base + ibx_offset.second;
        const int8_t *    scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
-        const ggml_half * d      = (const ggml_half *) (base + d_offset.second) + ib;
+        const ggml_half * d      = (const ggml_half *) (base + d_offset.second);

        const int bq8_offset   = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
        const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);
@@ -956,6 +956,7 @@ extern "C" {
    // in the order they have appeared in the batch.
    // Rows: number of tokens for which llama_batch.logits[i] != 0
    // Cols: n_vocab
+    // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
    LLAMA_API float * llama_get_logits(struct llama_context * ctx);

    // Logits for the ith token. For positive indices, Equivalent to:
@@ -970,6 +971,7 @@ extern "C" {
    // in the order they have appeared in the batch.
    // shape: [n_outputs*n_embd]
    // Otherwise, returns NULL.
+    // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Get the embeddings for the ith token. For positive indices, Equivalent to:
@@ -1 +1 @@
-3323219cd3cc050e5c7133cd4fc1e50d1f590faf
+56938c4a3b2d923f42040f9ad32d229c76c466cd
@@ -1933,12 +1933,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
        }
    },
-    {
-        LLM_ARCH_UNKNOWN,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-        },
-    },
    {
        LLM_ARCH_DREAM,
        {
@@ -1956,6 +1950,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
 };

 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -718,10 +718,9 @@ int32_t llm_chat_apply_template(
            }

            ss << message->content << "<|im_end|>";
-
-            if (add_ass) {
-                ss << "<|im_assistant|>assistant<|im_middle|>";
-            }
+        }
+        if (add_ass) {
+            ss << "<|im_assistant|>assistant<|im_middle|>";
        }
    } else {
        // template not supported
@@ -105,7 +105,7 @@ llama_context::llama_context(

    {
        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;

        if (!supports_set_rows && !cparams.kv_unified) {
            LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -508,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
 }

 float * llama_context::get_logits() {
+    output_reorder();
+
    return logits;
 }

 float * llama_context::get_logits_ith(int32_t i) {
    int64_t j = -1;

+    output_reorder();
+
    try {
        if (logits == nullptr) {
            throw std::runtime_error("no logits");
@@ -550,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
 }

 float * llama_context::get_embeddings() {
+    output_reorder();
+
    return embd;
 }

 float * llama_context::get_embeddings_ith(int32_t i) {
    int64_t j = -1;

+    output_reorder();
+
    try {
        if (embd == nullptr) {
            throw std::runtime_error("no embeddings");
@@ -891,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
        }
    }

+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
    // TODO: hacky solution
    if (model.arch == LLM_ARCH_T5 && t_embd) {
        //cross.t_embd = t_embd;
@@ -970,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {

    // TODO: this clear of the buffer can easily be forgotten - need something better
    embd_seq.clear();
+    output_swaps.clear();

    bool did_optimize = false;

@@ -1189,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
        // make the outputs have the same order they had in the user-provided batch
        // note: this is mostly relevant for recurrent models atm
        if (!sorted_output) {
-            const uint32_t n_vocab = model.vocab.n_tokens();
-            const uint64_t n_embd  = model.hparams.n_embd;
-
            GGML_ASSERT((size_t) n_outputs == out_ids.size());

            // TODO: is there something more efficient which also minimizes swaps?
@@ -1207,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
                    continue;
                }
                std::swap(out_ids[i], out_ids[j_min]);
-                if (logits_size > 0) {
-                    for (uint32_t k = 0; k < n_vocab; k++) {
-                        std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
-                    }
-                }
-                if (embd_size > 0) {
-                    for (uint32_t k = 0; k < n_embd; k++) {
-                        std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
-                    }
-                }
+
+                // remember the swaps and apply them lazily upon logits/embeddings access
+                output_swaps.push_back({ i, j_min });
            }

            std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1230,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
    // wait for the computation to finish (automatically done when obtaining the model output)
    //synchronize();

+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
    return 0;
 }

@@ -1307,6 +1318,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
    return n_outputs_max;
 }

+void llama_context::output_reorder() {
+    const uint32_t n_vocab = model.vocab.n_tokens();
+    const uint64_t n_embd  = model.hparams.n_embd;
+
+    for (uint32_t s = 0; s < output_swaps.size(); ++s) {
+        const uint32_t i0 = output_swaps[s].i0;
+        const uint32_t i1 = output_swaps[s].i1;
+
+        if (logits_size > 0) {
+            for (uint32_t k = 0; k < n_vocab; k++) {
+                std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
+            }
+        }
+
+        if (embd_size > 0) {
+            for (uint32_t k = 0; k < n_embd; k++) {
+                std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
+            }
+        }
+    }
+
+    output_swaps.clear();
+}
+
 //
 // graph
 //
@@ -181,6 +181,8 @@ private:
    // Returns max number of outputs for which space was reserved.
    uint32_t output_reserve(int32_t n_outputs);

+    void output_reorder();
+
    //
    // graph
    //
@@ -250,6 +252,13 @@ private:

    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

+    struct swap_info {
+        uint32_t i0;
+        uint32_t i1;
+    };
+
+    std::vector<swap_info> output_swaps;
+
    ggml_backend_sched_ptr sched;

    ggml_backend_t backend_cpu = nullptr;
@@ -278,6 +287,10 @@ private:

    bool has_evaluated_once = false;

+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
    // perf
    mutable int64_t t_start_us  = 0;
    mutable int64_t t_load_us   = 0;
@@ -646,6 +646,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale);
                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);

+                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
+                hparams.rope_finetuned = true;
+
                switch (hparams.n_layer) {
                    case 52: type = LLM_TYPE_1B; break;
                    case 40: type = LLM_TYPE_2B; break;
@@ -148,7 +148,7 @@ struct lora_merge_ctx {

        ctx_out = gguf_init_empty();
        struct ggml_init_params params = {
-            /*.mem_size   =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
+            /*.mem_size   =*/ static_cast<size_t>(gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead()),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
@@ -2315,7 +2315,7 @@ struct clip_model_loader {

        // create data context
        struct ggml_init_params params = {
-            /*.mem_size =*/ (gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
+            /*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc =*/ true,
        };
Author	SHA1	Message	Date
Georgi Gerganov	c1dbea752a	context : restore preemptive sched reset when LLAMA_SET_ROWS=0 (#14870 ) ggml-ci	2025-07-25 14:28:06 +03:00
kiwi	749e0d27f0	mtmd : fix 32-bit narrowing issue in export-lora and mtmd clip (#14503 ) * [fix] Fix 32-bit narrowing issue in export-lora and mtmd clip * Update export-lora.cpp * Update clip.cpp * Update export-lora.cpp * format: use space to replace tab	2025-07-25 13:08:04 +02:00
Chris Rohlf	64bf1c3744	rpc : check for null buffers in get/set/copy tensor endpoints (#14868 )	2025-07-25 12:17:02 +02:00
Diego Devesa	c12bbde372	sched : fix multiple evaluations of the same graph with pipeline parallelism (#14855 ) ggml-ci	2025-07-25 11:07:26 +03:00
R0CKSTAR	3f4fc97f1d	musa: upgrade musa sdk to rc4.2.0 (#14498 ) * musa: apply mublas API changes Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com> * musa: update musa version to 4.2.0 Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com> * musa: restore MUSA graph settings in CMakeLists.txt Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com> * musa: disable mudnnMemcpyAsync by default Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com> * musa: switch back to non-mudnn images Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com> * minor changes Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com> * musa: restore rc in docker image tag Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com> --------- Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>	2025-07-24 20:05:37 +01:00
Georgi Gerganov	2df255da3c	sync : ggml ggml-ci	2025-07-24 20:27:23 +03:00
Kai Pastor	60f816a79d	cmake : fix usage issues (ggml/1257) * CMake config: Create target only once Fix error on repeated find_package(ggml). For simplicity, check only for the top-level ggml::ggml. * CMake config: Add CUDA link libs * CMake config: Add OpenCL link libs * CMake config: Use canonical find_dependency Use set and append to control link lib variables. Apply more $<LINK_ONLY...>. * CMake config: Wire OpenMP dependency	2025-07-24 20:27:23 +03:00
Daniel Bevenius	5592f278b6	ggml-cpu : remove stdlib include from repack.cpp (ggml/1276) This commit removes the inclusion of `<cstdlib>`. The motivation for this change is that this source file does not seem to use any functions from this header and the comment about `qsort` is a little misleading/confusing.	2025-07-24 20:27:23 +03:00
Georgi Gerganov	e4868d16d2	context : perform output reorder lazily upon access after sync (#14853 ) * context : perform output reorder after lazily upon access after sync ggml-ci * cont : add TODO	2025-07-24 16:31:48 +03:00
Xuan-Son Nguyen	820de57d4f	chat : fix kimi-k2 chat template (#14852 )	2025-07-24 13:59:56 +02:00
Alberto Cabrera Pérez	cb4a63aad6	sycl: fixed semantics of block offset calculation (#14814 )	2025-07-24 11:09:57 +01:00
yummy	86f5623d90	llama : fix MiniCPM inference after Granite Four changes (#14850 ) MiniCPM models use the llm_build_granite constructor which was changed in the Granite Four PR to use hparams.rope_finetuned instead of a use_rope parameter. MiniCPM models need rope enabled by default. Fixes inference from gibberish to correct responses.	2025-07-24 11:50:51 +02:00
Pouya	39cffdf188	docs: add libcurl-dev install hint for Linux distros (#14801 ) * docs: add libcurl-dev install hint for Linux distros Signed-off-by: PouyaGhahramanian <PooyaGhahramanian@gmail.com> * Update docs/build.md --------- Signed-off-by: PouyaGhahramanian <PooyaGhahramanian@gmail.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>	2025-07-24 11:26:44 +02:00
Georgi Gerganov	065908cb09	metal : fix fusion across different encoders (#14849 ) * metal : fix fusion across different encoders ggml-ci * cont : add assertion ggml-ci	2025-07-24 10:24:05 +03:00
Donghyeon Jeong	4ec6291a24	sycl: fix undefined variable in work group size check (#14843 )	2025-07-24 12:50:41 +08:00
jacekpoplawski	a12363bbf0	convert : text-only support for GLM-4.1V-9B-Thinking (#14823 ) * use language_model part only, ignore visual layers * fix rope_dim calculation	2025-07-23 23:23:57 +02:00