Removing extra blank lines that were breaking Lint. (#8067 )

cvector: fix CI + correct help message (#8064 )
* cvector: fix CI + correct help message * also correct --pca-iter
2026-06-16 10:46:43 +02:00 · 2024-06-22 14:28:18 -04:00 · 2024-06-22 18:11:30 +02:00 · 2024-06-22 17:19:37 +02:00 · 2024-06-22 15:37:41 +02:00 · 2024-06-22 15:16:10 +02:00
134 changed files with 42688 additions and 35671 deletions
@@ -26,3 +26,7 @@ indent_size = 2

 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
+
+[examples/cvector-generator/*.txt]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
@@ -42,7 +42,6 @@ build:
            - cmake/**
            - CMakeLists.txt
            - CMakePresets.json
-            - codecov.yml
 examples:
    - changed-files:
        - any-glob-to-any-file: examples/**
@@ -1,5 +1,7 @@
- Self Reported Review Complexity:
-    - [ ] Review Complexity : Low
-    - [ ] Review Complexity : Medium
-    - [ ] Review Complexity : High
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+
+
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- Self-reported review complexity:
+  - [ ] Low
+  - [ ] Medium
+  - [ ] High
@@ -1,40 +0,0 @@
-name: Code Coverage
-on: [push, pull_request]
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  run:
-    runs-on: ubuntu-20.04
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8 lcov
-
-      - name: Build
-        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
-
-      - name: Run tests
-        run: CC=gcc-8 make test
-
-      - name: Generate coverage report
-        run: |
-          make coverage
-          make lcov-report
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v3
-        env:
-           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        with:
-          files: lcov-report/coverage.info
@@ -87,8 +87,22 @@ jobs:
            exit 1
          fi

+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+              -DLLAMA_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+              -DLLAMA_OPENMP=OFF ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
      - name: Build
        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
              -DLLAMA_NATIVE=OFF \
@@ -1,90 +1,123 @@
-*.o
+# Extensions
+
 *.a
-*.so
+*.bat
+*.bin
+*.dll
+*.dot
+*.etag
+*.exe
+*.gcda
+*.gcno
+*.gcov
 *.gguf
 *.gguf.json
-*.bin
-*.exe
-*.dll
-*.log
-*.gcov
-*.gcno
-*.gcda
-*.dot
-*.bat
-*.tmp
-*.metallib
-*.etag
 *.lastModified
-.DS_Store
-.build/
+*.log
+*.metallib
+*.o
+*.so
+*.tmp
+
+# IDE / OS
+
 .cache/
 .ccls-cache/
 .direnv/
+.DS_Store
 .envrc
+.idea/
 .swiftpm
-.venv
-.clang-tidy
 .vs/
 .vscode/
-.idea/
+nppBackup

-ggml-metal-embed.metal

-lcov-report/
+# Coverage
+
 gcovr-report/
+lcov-report/
+
+# Build Artifacts

 tags
+.build/
 build*
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
 !build.zig
-cmake-build-*
+/libllama.so
+/llama-*
 android-ndk-*
+arm_neon.h
+cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
 out/
 tmp/

+# CI
+
+!.github/workflows/*.yml
+
+# Models
+
 models/*
 models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*

-/Pipfile
-/libllama.so
-/llama-*
-llama-batched-swift
-/common/build-info.cpp
-arm_neon.h
-compile_commands.json
-CMakeSettings.json
-
-__pycache__
-dist
+# Zig

 zig-out/
 zig-cache/

+# Logs
+
 ppl-*.txt
 qnt-*.txt
 perf-*.txt

+# Examples
+
 examples/jeopardy/results.txt
+examples/server/*.css.hpp
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
-examples/server/*.css.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh

+# Python
+
+__pycache__
+.venv
+/Pipfile
+dist
 poetry.lock
 poetry.toml
-nppBackup

 # Test binaries
-/tests/test-grammar-parser
-/tests/test-llama-grammar
+/tests/test-backend-ops
 /tests/test-double-float
 /tests/test-grad0
+/tests/test-grammar-parser
+/tests/test-llama-grammar
 /tests/test-opt
 /tests/test-quantize-fns
 /tests/test-quantize-perf
+/tests/test-rope
 /tests/test-sampling
 /tests/test-tokenizer-0
-/tests/test-tokenizer-1-spm
 /tests/test-tokenizer-1-bpe
-/tests/test-rope
-/tests/test-backend-ops
+/tests/test-tokenizer-1-spm
+
+# Scripts
+!/scripts/install-oneapi.bat
@@ -119,6 +119,7 @@ option(LLAMA_HIP_UMA                         "llama: use HIP unified memory arch
 option(LLAMA_VULKAN                          "llama: use Vulkan"                                OFF)
 option(LLAMA_VULKAN_CHECK_RESULTS            "llama: run Vulkan op checks"                      OFF)
 option(LLAMA_VULKAN_DEBUG                    "llama: enable Vulkan debug output"                OFF)
+option(LLAMA_VULKAN_MEMORY_DEBUG             "llama: enable Vulkan memory debug output"         OFF)
 option(LLAMA_VULKAN_VALIDATE                 "llama: enable Vulkan validation"                  OFF)
 option(LLAMA_VULKAN_RUN_TESTS                "llama: run Vulkan tests"                          OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
@@ -534,6 +535,10 @@ if (LLAMA_VULKAN)
            add_compile_definitions(GGML_VULKAN_DEBUG)
        endif()

+        if (LLAMA_VULKAN_MEMORY_DEBUG)
+            add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
+        endif()
+
        if (LLAMA_VULKAN_VALIDATE)
            add_compile_definitions(GGML_VULKAN_VALIDATE)
        endif()
@@ -660,6 +665,7 @@ if (LLAMA_SYCL)
    #todo: AOT

    find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)

    message(STATUS "SYCL found")

@@ -674,21 +680,22 @@ if (LLAMA_SYCL)
    endif()

    add_compile_options(-I./) #include DPCT
-    add_compile_options(-I/${SYCL_INCLUDE_DIR})

    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
    if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
    endif()

    set(GGML_HEADERS_SYCL ggml-sycl.h)
-    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+    file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
+    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

    if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
    else()
+        add_compile_options(-I/${SYCL_INCLUDE_DIR})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
        if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
        elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
@@ -11,9 +11,21 @@
            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
        }
    },
-
+    {
+        "name": "sycl-base",
+        "hidden": true,
+        "generator": "Ninja",
+        "binaryDir": "${sourceDir}/build-${presetName}",
+        "cacheVariables": {
+            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+            "CMAKE_CXX_COMPILER": "icx",
+            "LLAMA_SYCL": "ON",
+            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+        }
+    },
    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
    { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

    {
@@ -35,15 +47,18 @@
    },

    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "release" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "release", "static" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },

    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },

    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
  ]
 }
@@ -38,6 +38,7 @@ BUILD_TARGETS = \
 	llama-tokenize \
 	llama-train-text-from-scratch \
 	llama-vdot \
+	llama-cvector-generator \
 	tests/test-c.o

 # Binaries only useful for tests
@@ -506,7 +507,7 @@ ifdef LLAMA_CUDA
 		CUDA_PATH ?= /usr/local/cuda
 	endif
 	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	OBJS         += $(OBJS_CUDA_TEMP_INST)
@@ -607,6 +608,10 @@ ifdef LLAMA_VULKAN_DEBUG
 	MK_CPPFLAGS  += -DGGML_VULKAN_DEBUG
 endif

+ifdef LLAMA_VULKAN_MEMORY_DEBUG
+	MK_CPPFLAGS  += -DGGML_VULKAN_MEMORY_DEBUG
+endif
+
 ifdef LLAMA_VULKAN_VALIDATE
 	MK_CPPFLAGS  += -DGGML_VULKAN_VALIDATE
 endif
@@ -922,6 +927,10 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1042,7 +1051,7 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1,6 +1,7 @@
 # llama.cpp for SYCL

 - [Background](#background)
+- [Recommended Release](#recommended-release)
 - [News](#news)
 - [OS](#os)
 - [Hardware](#hardware)
@@ -31,8 +32,23 @@ When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneM

 It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

+## Recommended Release
+
+The SYCL backend would be broken by some PRs due to no online CI.
+
+The following release is verified with good quality:
+
+|Commit ID|Tag|Release|Verified  Platform|
+|-|-|-|-|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+
+
 ## News

+- 2024.5
+  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
+  - Arch Linux is verified successfully.
+
 - 2024.4
  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.

@@ -394,15 +410,9 @@ Output (example):

 4. Install build tools

-a. Download & install cmake for Windows: https://cmake.org/download/
+a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
+b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)

-b. Download & install mingw-w64 make for Windows provided by w64devkit
-
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
-
- Extract `w64devkit` on your pc.
-
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

 ### II. Build llama.cpp

@@ -412,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release

 # Option 2: Or FP16
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

 cmake --build build --config Release -j
 ```
@@ -425,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
 .\examples\sycl\win-build-sycl.bat
 ```

+Or, use CMake presets to build:
+```sh
+cmake --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake --preset x64-windows-sycl-debug
+cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+```
+
+Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
+
 *Notes:*

- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
+- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.

 ### III. Run the inference

@@ -195,6 +195,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [RAGNA Desktop](https://ragna.app/) (proprietary)
 - [RecurseChat](https://recurse.chat/) (proprietary)
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
@@ -208,6 +209,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
+- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

@@ -386,6 +388,30 @@ brew install llama.cpp
 ```
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668

+### Nix
+
+On Mac and Linux, the Nix package manager can be used via
+```
+nix profile install nixpkgs#llama-cpp
+```
+For flake enabled installs.
+
+Or
+```
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+For non-flake enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+#### Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+```
+flox install llama-cpp
+```
+Flox follows the nixpkgs build of llama.cpp.
+
 ### Metal Build

 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@@ -1,14 +0,0 @@
-comment: off
-
-coverage:
-  status:
-    project:
-      default:
-        target: auto
-        threshold: 0
-        base: auto
-    patch:
-      default:
-        target: auto
-        threshold: 0
-        base: auto
@@ -6,7 +6,6 @@
 #include "llama.h"

 #include <algorithm>
-#include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <codecvt>
@@ -542,6 +541,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
        else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
        else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
        else { invalid_param = true; }
        return true;
    }
@@ -1576,6 +1576,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            return true;
        }
        params.out_file = argv[i];
+        params.cvector_outfile = argv[i];
        return true;
    }
    if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1610,6 +1611,55 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.i_chunk = std::stoi(argv[i]);
        return true;
    }
+    // cvector params
+    if (arg == "--completions-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.cvector_completions_file = argv[i];
+        return true;
+    }
+    if (arg == "--positive-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.cvector_positive_file = argv[i];
+        return true;
+    }
+    if (arg == "--negative-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.cvector_negative_file = argv[i];
+        return true;
+    }
+    if (arg == "--completions") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_completions = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pca-batch") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_pca_batch = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pca-iter") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_pca_iterations = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
    // Parse args for logging parameters
    if (log_param_single_parse(argv[i])) {
@@ -1820,6 +1870,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param

    options.push_back({ "backend" });
    options.push_back({ "*",           "       --rpc SERVERS",          "comma separated list of RPC servers" });
+
    if (llama_supports_mlock()) {
        options.push_back({ "*",           "       --mlock",                "force system to keep model in RAM rather than swapping or compressing" });
    }
@@ -1931,6 +1982,16 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "logging",     "       --log-append",           "Don't truncate the old log file." });
 #endif // LOG_DISABLE_LOGS

+    options.push_back({ "cvector" });
+    options.push_back({ "cvector",     "-o,    --output FNAME",         "output file (default: '%s')", params.cvector_outfile.c_str() });
+    options.push_back({ "cvector",     "       --positive-file FNAME",  "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
+    options.push_back({ "cvector",     "       --negative-file FNAME",  "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
+    options.push_back({ "cvector",     "       --completions-file FNAME",
+                                                                        "completions file (default: '%s')", params.cvector_completions_file.c_str() });
+    options.push_back({ "cvector",     "       --completions N",        "number of lines of completions file to use (default: %d)", params.n_completions });
+    options.push_back({ "cvector",     "       --pca-batch N",          "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+    options.push_back({ "cvector",     "       --pca-iter N",           "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+
    printf("usage: %s [options]\n", argv[0]);

    for (const auto & o : options) {
@@ -2597,7 +2658,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat
        }

        // Set the output file
-        std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
+
+        struct FILE_deleter {
+            void operator()(FILE * f) const {
+                fclose(f);
+            }
+        };
+
+        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
        if (!outfile) {
            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
            return false;
@@ -73,7 +73,6 @@ struct gpt_params {
    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
-    int32_t n_beams               =     0; // if non-zero then use beam search of given width.
    int32_t grp_attn_n            =     1; // group-attention factor
    int32_t grp_attn_w            =   512; // group-attention width
    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
@@ -232,6 +231,15 @@ struct gpt_params {

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl    = true;  // whether to compute perplexity
+
+    // cvector-generator params
+    int n_completions = 64;
+    int n_pca_batch = 20;
+    int n_pca_iterations = 1000;
+    std::string cvector_outfile          = "control_vector.gguf";
+    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
+    std::string cvector_positive_file    = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file    = "examples/cvector-generator/negative.txt";
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -214,7 +214,7 @@ src_func = f"""
 """

 convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
-convert_py = convert_py_pth.read_text()
+convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
@@ -222,7 +222,7 @@ convert_py = re.sub(
    flags=re.DOTALL | re.MULTILINE,
 )

-convert_py_pth.write_text(convert_py)
+convert_py_pth.write_text(convert_py, encoding="utf-8")

 logger.info("+++ convert-hf-to-gguf.py was updated")

@@ -967,7 +967,11 @@ class XverseModel(Model):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
+        # because vocab_size is the count of items, and indexes start at 0.
+        max_vocab_index = max(tokenizer.get_vocab().values())
+        if max_vocab_index >= vocab_size:
+            raise ValueError("Vocabulary size exceeds expected maximum size.")

        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()
@@ -1632,6 +1636,12 @@ class Qwen2MoeModel(Model):
        super().set_gguf_parameters()
        if (n_experts := self.hparams.get("num_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")

    _experts: list[dict[str, Tensor]] | None = None

@@ -12,6 +12,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
+    add_subdirectory(cvector-generator)
    add_subdirectory(baby-llama)
    add_subdirectory(batched-bench)
    add_subdirectory(batched)
@@ -0,0 +1,5 @@
+set(TARGET llama-cvector-generator)
+add_executable(${TARGET} cvector-generator.cpp pca.hpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -0,0 +1,34 @@
+# cvector-generator
+
+This example demonstrates how to generate a control vector using gguf models.
+
+Related PRs:
+- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
+- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
+- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
+
+## Examples
+
+```sh
+# CPU only
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+
+# With GPU
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+
+# With advanced options
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
+
+# To see help message
+./cvector-generator -h
+# Then, have a look at "cvector" section
+```
+
+## Tips and tricks
+
+If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
+
+```
+<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
+<|im_start|>system\nYou are in a very good mood today<|im_end|>
+```
@@ -0,0 +1,582 @@
+
+That game
+I can see
+Hmm, this
+I can relate to
+Who is
+I understand the
+Ugh,
+What the hell was
+Hey, did anyone
+Although
+Thank you for choosing
+What are you
+Oh w
+How dare you open
+It was my pleasure
+I'm hon
+I appreciate that you
+Are you k
+Whoever left this
+It's always
+Ew,
+Hey, I l
+Hello? Is someone
+I understand that
+That poem
+Aww, poor
+Hey, it
+Alright, who
+I didn't
+Well, life
+The document
+Oh no, this
+I'm concerned
+Hello, this is
+This art
+Hmm, this drink
+Hi there!
+It seems
+Is
+Good
+I can't
+Ex
+Who are
+I can see that
+Wow,
+Today is a
+Hey friend
+Sometimes friends
+Oh, this old
+The weather outside
+This place is sur
+I appreciate your input
+Thank you for the
+Look at
+I'm disappoint
+To my
+How dare you
+That's an
+This piece of art
+Eww
+This park is
+This is incredible
+Oh no, someone
+Exc
+Well, it'
+I warned
+Hey, I understand
+Hey, I saw
+How dare you go
+What the he
+Hey
+It's
+Hello? Hello?
+It
+Oh no!
+This is the perfect
+Good morning,
+Oh no, there
+It's so
+Yeah
+Uh,
+Hello everyone
+Who turned off
+The weather
+Who'
+Hey, this
+Wait,
+Eww, gross
+Excuse
+It seems like you
+Thank you so
+What happened?
+Oh my g
+I am deeply sad
+I war
+Okay, let'
+Hey, that
+That was a beautiful
+Oh no! That
+What happened
+Hey there
+The artist'
+What?!
+Hey, it'
+I am disappoint
+It seems like
+Oh no! The
+This park is a
+If you
+Yes! I did
+It sounds
+What
+Who is it
+Hmm, that
+That's strange
+Yeah, that was
+That's interesting
+This park
+What the hell
+Who is that
+I feel like my
+Oh well
+What the hell is
+Hello? Hello
+To my dearest
+Bless you!\"
+Thank you for
+Oh, looks like
+Can you please
+This place is
+Eww, what
+Bless you
+Is everything
+Hey, I just
+Whoever left these
+Well, that'
+I feel
+Hey, do you
+It's sad
+Oh no, it
+Hey, that'
+Oh my god,
+Thank you,
+Hello little one,
+I apolog
+Hey team, I
+How dare you read
+Who is this and
+Whoever left
+Hi there! W
+A
+If you have
+I was
+U
+Bless
+Well, this
+Oh, I'
+It's a
+Eww,
+Is everything okay?
+Oh, I
+Hello, can you
+Al
+That was a great
+What are
+I understand that not
+Oh no, not
+Who is it?\"
+Hey, can we
+Whoever is taking
+I would love to
+Hey, I noticed
+Hey, could
+I understand that there
+Hello?
+D
+Oh man, I
+Thank you so much
+Oh no, my
+Dear [Name
+Uh
+I remember
+Hey, who
+Well, it
+Are you
+I understand that it
+Hey, is
+I would
+Who is this
+Excuse me
+Alright
+I am thrilled
+Sometimes friends have
+Who the
+It's interesting
+I would love
+E
+Hello? Is anyone
+Well, this is
+This place
+Well,
+I warned you
+Hey, watch where
+Oh my
+That'
+Sometimes friends have different
+I understand that everyone
+What?
+What do these notes
+I can relate
+I'm not
+I understand
+To my dear
+Guys
+Well
+Hey, I appreciate
+Wow, what
+Dear
+That melody
+Who the hell
+Today is
+Hello little
+Wow, look
+That's great
+Love is never wrong
+I'm having
+Whoa, did
+Ugh
+Can you please provide
+I miss you,
+I feel uncom
+I know
+Ugh, this
+Hey, watch
+Oh great, a
+I didn
+Okay
+That game of char
+Oh
+I appreciate
+Who's there
+I am so
+Oh great, someone
+Hey, could you
+I remember wondering
+Wait, what?
+What do
+Hello? Can
+Hey there,
+That game of
+This is incred
+Oh my gosh
+Oh great, f
+I appreciate your
+It sounds like
+What the heck
+Okay, I understand
+Ew
+I understand that this
+Uh, hi
+Hi everyone!
+What the hell?
+Thank you for your
+Oh no, the
+Wow, I
+Who turned
+Dear [
+Whoever
+This is a
+Whoa, he
+What in the world
+Although the physical
+Hello, who is
+That's amaz
+Hey, I know
+Okay, that
+Hi everyone
+Hey, is everything
+I understand your fr
+Oh no, poor
+Oh, look
+Good morning
+Ew, gross
+Oh no, did
+Look at the family
+Hey team
+Yes!
+Hey, can I
+Okay, that'
+It's great
+Love is
+Hey, what
+Good morning, world
+Who is it?
+That poem really reson
+I
+That's
+I understand the task
+Gu
+Hello? Who'
+This postcard is
+Whoa,
+Oh, that
+I understand that I
+Whoever is
+Hello? Who is
+I'm really
+Wow, this
+Can
+This artwork really
+This is a shame
+I miss you too
+Who are you?
+Today is a difficult
+Hey, just
+Are you okay
+I am
+Hi,
+Wow, that
+Hey there! Can
+Okay, stay
+Oh great, just
+Yeah,
+Hello? Can you
+Oh, looks
+Thank you for sharing
+I'm glad
+Hey, is that
+Hmm
+It was my
+It sounds like you
+Wow, your
+I was promised certain
+That was such a
+Thank
+Excuse you
+That was
+Hey team,
+I feel un
+It was
+What'
+Hey friend, I
+How
+Saying goodbye
+That
+It's heart
+How dare
+Oh,
+Hello, may
+What's this
+Thank you for recogn
+Aww, that
+Oh, I remember
+Hmm, that'
+I miss
+I know this
+Wait
+Is everything okay
+Who is that person
+Wow, you
+Oh great
+I'm sad
+Wow, the
+I am very disappoint
+Who turned off the
+I understand that things
+I'm very
+Hi
+That's very
+Okay, I
+Oh no,
+Wow, there
+What's wrong
+I apologize for
+Hey, I
+Can I help you
+Oh, I didn
+Alright,
+Oh wow,
+Oh my goodness
+I know this event
+What in the
+Saying
+Yeah, that
+Guys, I
+Hey, this v
+This post
+Are
+Hey, can
+Hello? Is
+I can only imagine
+Oh, that sounds
+Hey, is anyone
+I am disappointed
+Hello,
+Hey everyone, I
+That was such
+It's okay
+The artist
+Whoa
+I understand that mistakes
+Can I help
+Who
+Hi everyone! I
+Hey, can you
+Wow, how
+Today
+Oh no, I
+Oh well, I
+Well, that
+This is the
+Yes! I finally
+Hey there little
+Hello everyone!
+Love is never
+Look at the
+This postcard
+Oh great,
+Can I
+Hmm, this is
+I understand your
+Oh, look at
+B
+I'm so
+Whoa, this
+W
+Oh, this
+Sometimes
+This piece of
+What the
+That was a
+Hey, do
+Oh no
+Whoa, what
+I feel like I
+The documentary
+Hello
+Hello little one
+I understand that my
+Eww, that
+Wow, an
+Yes! Finally,
+Although the physical location
+Whoever is watching
+That movie
+I remember wondering about
+Hey there, little
+Who's
+Hello, who
+Hello everyone! Thank
+Hello, can
+That's too
+Hey, just wanted
+Hey there, I
+Saying good
+Hey there!
+Who is there?
+Oh my good
+I am very
+Oh no, what
+Wow, thank
+I was promised
+Hi, is
+Hey, I'
+Guys, the
+Oh no, that
+Who is there
+Hello, this
+That movie really touched
+If you have something
+The documentary was
+I'm starting
+Are you kidd
+That movie really
+Hey everyone,
+Thank you for considering
+I didn'
+Yes! I
+Can you
+Oh my god
+Hey, whoever
+That melody really
+Thank you, little
+Hello, may I
+Look
+Wow, we
+It looks
+What do these
+Oh wow
+I apologize
+What are you all
+It's such
+It's clear
+Hey, I was
+Hey friend,
+I can only
+The weather outside is
+Eww, this
+I miss you
+Wow
+Aww,
+Hi, is there
+This artwork
+Okay,
+Oh well,
+This
+I'
+Say
+Hey there little gu
+Hmm,
+Whoa, who
+I am thr
+Oh man
+Okay, stay calm
+I'm happy
+Oh, this cur
+Oh man,
+I'm sorry
+Hello? Who
+What?! That
+This piece
+Hey everyone
+That's so
+Are you okay?
+What happened? Where
+Hi there
+The
+Who the hell entered
+I can
+Guys,
+What's
+What in
+It's important
+I'm
+I'm coming
+It'
+Yes! Finally
+Wait, what
+Wow, reading
+I'm surprised
+Hey, did
+Hey,
+Okay, let
+I understand that you
+Who the hell threw
+Eww, who
+Thank you for thinking
+Who is this?\"
+I am deeply
+Thank you for including
+Oh no, an
+It looks like you
+Aww
+I'm confused
+Wow, it
+That poem really
+Yes
+Hey there, is
+Hey, what'
+Thank you for remember
+To
+This is
+Thank you for making
+I can'
+That mel
+Wow, they
+I feel like
+Although the
+Who are you
+Love
+If
+What the hell are
+I am so sad
+Oh, I found
+Thank you
+It looks like
+Well, life is
+I appreciate that
+The artist's
+Whoa, that
+It's never
@@ -0,0 +1,499 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+#include "pca.hpp"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cstdio>
+#include <string>
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <climits>
+
+
+//////////////////////////////////////////////////
+// utils
+
+template <class Iter>
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
+    std::string ret;
+    for (; begin != end; ++begin) {
+        ret += llama_token_to_piece(ctx, *begin);
+    }
+
+    return ret;
+}
+
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    printf("\nexample usage:\n");
+    printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
+    printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
+    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n");
+}
+
+//////////////////////////////////////////////////
+
+
+// cb_eval is reused for each pair of positive - negative prompt
+struct callback_data {
+    ggml_context * ctx_ggml = nullptr;   // holds v_pos, v_neg, v_diff_filtered
+
+    int n_layers = 0;
+    int n_tokens = 0;
+    bool is_eval_pos = true;
+
+    // each element of the vector correspond to one layer
+    std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_diff_filtered;   // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
+
+    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
+    void save_tensor_for_layer(struct ggml_tensor * t) {
+        GGML_ASSERT(t->type == GGML_TYPE_F32);
+
+        if (ctx_ggml == nullptr) {
+            // alloc a new ctx_ggml if needed
+            struct ggml_init_params params_ggml = {
+                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ctx_ggml = ggml_init(params_ggml);
+        }
+
+        // copy tensor data
+        auto n_bytes = ggml_nbytes(t);
+        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
+        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
+        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
+        ggml_set_name(t_layer, ggml_get_name(t));
+        //print_debug_tensor(t_layer);
+
+        if (is_eval_pos) {
+            v_pos.push_back(t_layer);
+        } else {
+            v_neg.push_back(t_layer);
+        }
+    }
+
+    // calculate diff (v_pos - v_neg) and place the result back to v_pos
+    // all zero rows in the diff tensor will also be removed
+    // NOTE: final layer is ignored. we only have (n_layers - 1) to process
+    std::vector<struct ggml_tensor *> calc_diff() {
+        for (float il = 0; il < v_pos.size(); il++) {
+            float * a = (float *) v_pos[il]->data;
+            float * b = (float *) v_neg[il]->data;
+            size_t n_elem = ggml_nelements(v_pos[il]);
+            for (size_t j = 0; j < n_elem; j++) {
+                a[j] -= b[j];
+            }
+            //print_debug_tensor(v_pos[i]);
+            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
+            v_diff_filtered.push_back(diff_filtered);
+        }
+        return v_diff_filtered; // for convinient, we return the result std::vector
+    }
+
+    // delete zero rows from a given 2D tensor
+    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
+        //printf("filter_nonzero_rows\n");
+        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
+            // check if given row containing all zero elements
+            int n_cols = t->ne[0]; // hint: should be equal to n_embd
+            for (int col = 0; col < n_cols; ++col) {
+                if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
+                    return false;
+                }
+            }
+            return true;
+        };
+        std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
+        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
+            if (!is_row_all_zeros(a, i_row, 1e-6)) {
+                rows_to_copy.push_back(i_row);
+            }
+        }
+
+        // get "n_nonzero_rows" for the output "diff_filtered"
+        int n_nonzero_rows = rows_to_copy.size();
+        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
+        int n_embd = a->ne[0];
+        GGML_ASSERT(n_nonzero_rows > 0);
+
+        // diff_filtered: [n_embd, n_nonzero_rows]
+        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
+            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
+        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
+        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
+
+        // copy non-zero rows
+        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
+            int src_row = rows_to_copy[dest_row];
+            for (int i = 0; i < n_embd; i++) {
+                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
+                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
+            }
+        }
+
+        //print_debug_tensor(diff_filtered);
+
+        return diff_filtered;
+    }
+
+    // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
+    void reset() {
+        for (auto ptr : v_pos) free(ptr->data);
+        for (auto ptr : v_neg) free(ptr->data);
+        for (auto ptr : v_diff_filtered) free(ptr->data);
+        v_pos.clear();
+        v_neg.clear();
+        v_diff_filtered.clear();
+        if (ctx_ggml) {
+            ggml_free(ctx_ggml);
+        }
+        ctx_ggml = nullptr;
+    }
+};
+
+/**
+ * process_ctx is used to store the ggml context for pre-post processing the diff vectors
+ * in short, input => v_diff and output => v_final
+ */
+struct train_context {
+    ggml_context * ctx_ggml;
+    int n_embd;
+    int n_layers;
+
+    /* pair of prompts to be used for generating final vector */
+    std::vector<std::string> positive_entries;
+    std::vector<std::string> negative_entries;
+
+    // each element of the vector correspond to one layer
+    // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
+    // NOTE (2): v_diff is transposed from v_diff_tmp
+    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
+    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
+
+    // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
+    // v_diff_tmp will get converted unto v_diff later on
+    std::vector<std::vector<uint8_t>> v_diff_tmp;
+
+    train_context(int n_embd_, int n_layers_) {
+        n_embd = n_embd_;
+        n_layers = n_layers_;
+        struct ggml_init_params params_ggml = {
+            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_ggml = ggml_init(params_ggml);
+        for (int il = 0; il < n_layers - 1; il++) {
+            std::vector<uint8_t> empty;
+            v_diff_tmp.push_back(empty);
+            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
+            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
+            v_final.push_back(t);
+        }
+    }
+
+    // add new rows into existing tensor in v_diff_tmp
+    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
+        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
+        for (int il = 0; il < n_layers - 1; il++) {
+            auto t = diff_filtered[il];
+            auto & diff_tmp = v_diff_tmp[il];
+            size_t curr_size = diff_tmp.size();
+            diff_tmp.resize(curr_size + ggml_nbytes(t));
+            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
+        }
+    }
+
+    // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
+    // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
+    void build_v_diff() {
+        printf("build_v_diff\n");
+        for (int il = 0; il < n_layers - 1; il++) {
+            auto & diff_tmp = v_diff_tmp[il];
+            int n_elem = diff_tmp.size() / sizeof(float);
+            GGML_ASSERT(n_elem % n_embd == 0);
+            int n_rows = n_elem / n_embd;
+            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
+            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
+            // copy data & transpose
+            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
+            float * arr = (float *) diff_tmp.data();
+            for (int ir = 0; ir < n_rows; ++ir) {
+                for (int ic = 0; ic < n_embd; ++ic) {
+                    float f = arr[ir*n_embd + ic];
+                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                }
+            }
+            v_diff.push_back(diff);
+            print_debug_tensor(diff);
+            // free memory of diff_tmp
+            diff_tmp.resize(0);
+        }
+    }
+
+    ~train_context() {
+        for (auto ptr : v_final) free(ptr->data);
+        for (auto ptr : v_diff) free(ptr->data);
+        // no need to free v_diff_tmp, since we didn't use malloc
+        ggml_free(ctx_ggml);
+    }
+};
+
+struct tokenized_prompt {
+    std::vector<llama_token> tokens_pos;
+    std::vector<llama_token> tokens_neg;
+    size_t max_seq_len;
+
+    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
+        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
+        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
+        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
+        padding_seq(ctx, tokens_pos, max_seq_len);
+        padding_seq(ctx, tokens_neg, max_seq_len);
+    }
+
+    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
+        // TODO: customize padding token
+        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
+        llama_token pad_tok = pad_tokens.back();
+        while (tokens.size() < len) {
+            tokens.push_back(pad_tok);
+        }
+    }
+};
+
+//////////////////////////////////////////////////
+
+template <typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
+    std::vector<std::string> output;
+    std::ifstream file(path);
+    if (!file.is_open()) {
+        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
+        exit(1);
+    }
+    std::string line;
+    while (std::getline(file, line)) {
+        bool is_skip = skip_empty_lines && line.empty();
+        if (!is_skip) {
+            string_process_escapes(line);
+            output.push_back(line);
+        }
+    }
+    file.close();
+    return output;
+}
+
+//////////////////////////////////////////////////
+
+static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+    static const char * l_out_name = "l_out";
+    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
+
+    if (ask) {
+        return is_l_out;
+    }
+
+    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
+        return true;
+    }
+
+    // save the tensor to current context
+    cb_data->save_tensor_for_layer(t);
+    return true;
+}
+
+static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
+    llama_kv_cache_clear(ctx);
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        return false;
+    }
+    return true;
+}
+
+static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    const std::string arch = "controlvector";
+    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
+    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
+    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
+
+    for (size_t i = 0; i < v_ctrl.size(); ++i) {
+        gguf_add_tensor(ctx, v_ctrl[i]);
+        print_debug_tensor(v_ctrl[i]);
+        printf("Added tensor: %s\n", v_ctrl[i]->name);
+    }
+
+    printf("%s: writing file...\n", __func__);
+    gguf_write_to_file(ctx, fname.c_str(), false);
+    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
+    gguf_free(ctx);
+}
+
+/**
+ * Load prompt files and completion file.
+ * Then format each pair of prompt + completion to make an entry.
+ */
+static int prepare_entries(gpt_params & params, train_context & ctx_train) {
+    // load prompts
+    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
+    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
+    if (positive_prompts.size() != negative_prompts.size()) {
+        fprintf(stderr, "number of positive and negative prompts must be equal\n");
+        return 1;
+    }
+    if (positive_prompts.empty()) {
+        fprintf(stderr, "must provide at least one prompt pair\n");
+        return 1;
+    }
+
+    // create templated prompts
+    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
+    auto format_template = [](std::string persona, std::string suffix) {
+        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
+        return persona + suffix;
+    };
+    for (size_t i = 0; i < positive_prompts.size(); ++i) {
+        for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
+            // TODO replicate the truncations done by the python implementation
+            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
+            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
+        }
+    }
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
+    }
+
+    if (params.n_pca_iterations % params.n_pca_batch != 0) {
+        fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
+        return 1;
+    }
+
+
+    callback_data cb_data;
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = cb_eval;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+
+    print_build_info();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model to get hparams
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+
+    // int n_ctx = llama_n_ctx(ctx);
+    int n_layers = llama_n_layer(model);
+    int n_embd = llama_n_embd(model);
+    // get model hint param (a.k.a model arch name)
+    char model_hint[128];
+    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
+
+    // init train_context
+    train_context ctx_train(n_embd, n_layers);
+
+    // load and prepare entries for training
+    prepare_entries(params, ctx_train);
+
+    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
+    std::vector<tokenized_prompt> tokenized_prompts;
+    size_t n_total_tokens = 0;
+    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
+        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
+        n_total_tokens += 2 * t.max_seq_len;
+        tokenized_prompts.push_back(std::move(t));
+    }
+
+    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
+
+    for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
+        bool success = false;
+        tokenized_prompt t = tokenized_prompts[i];
+        cb_data.n_layers = n_layers;
+        cb_data.n_tokens = t.max_seq_len;
+
+        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
+            (int) i+1, (int) ctx_train.positive_entries.size(),
+            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
+            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
+            (int) t.max_seq_len);
+
+        cb_data.is_eval_pos = true;
+        success = get_hidden_layers(ctx, t.tokens_pos);
+        if (!success) break;
+
+        cb_data.is_eval_pos = false;
+        success = get_hidden_layers(ctx, t.tokens_neg);
+        if (!success) break;
+
+        // calculate diff and remove all zero rows
+        auto v_diff_filtered = cb_data.calc_diff();
+
+        // save & concat the filtered v_diff to ctx_train
+        ctx_train.concat_diff_tmp(v_diff_filtered);
+
+        // reset for next iteration
+        cb_data.reset();
+    }
+
+    // done with the model, we can now free it to make gain some memory
+    printf("Done evaluate prompts, unload model...\n");
+    llama_free(ctx);
+    llama_free_model(model);
+
+    // prepare ctx_train for PCA
+    ctx_train.build_v_diff();
+
+    // run PCA
+    PCA::pca_params pca_params;
+    pca_params.n_threads = params.n_threads;
+    pca_params.n_batch = params.n_pca_batch;
+    pca_params.n_iterations = params.n_pca_iterations;
+    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+
+    // write output vectors to gguf
+    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+
+    llama_backend_free();
+
+    return 0;
+}
@@ -0,0 +1 @@
+[INST] Act like a person who is extremely sad. [/INST] 
@@ -0,0 +1,322 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cstdio>
+#include <ctime>
+#include <string>
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+
+#define DEBUG_POS 5
+
+static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
+    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
+    if (!with_data) return;
+    printf("%s: %s[0] = [", __func__, t->name);
+    for (size_t i = 0; i <= DEBUG_POS; i++) {
+        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
+    }
+    printf(" ... ]\n");
+}
+
+namespace PCA {
+
+// input params for PCA computations
+struct pca_params {
+    int n_threads = 1;
+    int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used
+    int n_iterations = 1000;
+    float tolerance = 1e-7;
+
+    // for debugging
+    int i_layer = 0;
+    int n_layers = 0;
+};
+
+// result from each iteration
+struct pca_result {
+    struct ggml_tensor * calculated_square = NULL;
+    std::vector<struct ggml_tensor *> eigenvectors;
+    std::vector<float> distances;
+};
+
+struct pca_model {
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx;      // context to compute graph on target device
+    struct ggml_context * ctx_host; // host context to store results
+
+    // tensors on target device
+    struct ggml_tensor * dev_input;
+    struct ggml_tensor * dev_square;
+    struct ggml_tensor * dev_eigenvector;
+
+    pca_model(struct ggml_tensor * t_input) {
+#ifdef GGML_USE_CUDA
+        fprintf(stderr, "%s: using CUDA backend\n", __func__);
+        backend = ggml_backend_cuda_init(0); // init device 0
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
+#endif
+
+// TODO: enable Metal support when support for GGML_OP_SQRT is added
+// #ifdef GGML_USE_METAL
+//         fprintf(stderr, "%s: using Metal backend\n", __func__);
+//         backend = ggml_backend_metal_init();
+//         if (!backend) {
+//             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+//         }
+// #endif
+
+        // if there aren't GPU Backends fallback to CPU backend
+        if (!backend) {
+            backend = ggml_backend_cpu_init();
+        }
+
+        const int num_tensors = 4;
+        struct ggml_init_params params {
+            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx = ggml_init(params);
+
+        auto n_samples = t_input->ne[0];
+        auto n_embd    = t_input->ne[1];
+
+        dev_input       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
+        dev_square      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,    n_embd);
+        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        ggml_set_name(dev_input,       "dev_input");
+        ggml_set_name(dev_square,      "dev_square");
+        ggml_set_name(dev_eigenvector, "dev_eigenvector");
+        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
+
+        // initialize eigenvector to random normalized vector
+        {
+            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
+            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
+            std::uniform_real_distribution<float> distribution(0.0, 1.0);
+            float sum_sqr = 0.0; // for normalizing random_vec
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                float f = distribution(generator);
+                sum_sqr += f * f;
+                random_vec[i] = f;
+            }
+            // normalize it
+            float random_vec_norm = std::sqrt(sum_sqr);
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                random_vec[i] /= random_vec_norm;
+            }
+            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
+        }
+    }
+
+    ~pca_model() {
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+        ggml_backend_free(backend);
+    }
+};
+
+static struct ggml_cgraph * build_graph_piter(
+        const struct pca_params & params,
+        const pca_model & model,
+        bool calc_square = false) {
+    GGML_ASSERT(params.n_batch > 0);
+    // TODO: buf_size must be able to scale with params.n_batch
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+    };
+    // create a temporally context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // turn v_diff_original into square matrix if needed
+    struct ggml_tensor * tmp_square;
+    if (calc_square) {
+        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
+        ggml_set_name(tmp_square, "tmp_square");
+    }
+
+    struct ggml_tensor * b_tensor;
+    struct ggml_tensor * distance;
+    struct ggml_tensor * old_eigen    = model.dev_eigenvector;
+    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
+
+    for (int i = 0; i < params.n_batch; ++i) {
+        // b_tensor = square * eigenvector^T
+        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
+        ggml_set_name(b_tensor, "b_tensor");
+
+        // normalize
+        b_tensor = ggml_div_inplace(ctx0,
+            b_tensor,
+            ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
+        );
+        ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
+
+        // calculate distance(new eigenvector - old eigenvector)
+        // we don't use ggml_sub because it may not be implemented on GPU backend
+        struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
+        distance = ggml_sqrt_inplace(ctx0,
+            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
+        ggml_format_name(distance, "distance_%d", i);
+
+        old_eigen = b_tensor;
+
+        // build operations nodes
+        ggml_build_forward_expand(gf, distance);
+    }
+
+    // delete the temporally context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+static ggml_status compute_piter(
+        const struct pca_params & params,
+        const pca_model & model,
+        struct ggml_cgraph * gf,
+        ggml_gallocr_t allocr,
+        struct pca_result & result) {
+    // allocate tensors
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
+    }
+
+// TODO: enable GPU support when support for GGML_OP_SQRT is added
+//#ifdef GGML_USE_METAL
+//    if (ggml_backend_is_metal(model.backend)) {
+//        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
+//    }
+//#endif
+
+    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
+    if (res == GGML_STATUS_SUCCESS) {
+        auto extract_i = [](std::string prefix, std::string str) -> int {
+            int i = -1;
+            if (str.rfind(prefix, 0) == 0) {
+                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
+            }
+            return i;
+        };
+        result.calculated_square = NULL;
+        result.eigenvectors.clear();
+        result.distances.clear();
+        result.eigenvectors.resize(params.n_batch);
+        result.distances.resize(params.n_batch);
+        // get output nodes
+        for (int i = 0; i < gf->n_nodes; ++i) {
+            auto node = gf->nodes[i];
+            int iter = -1;
+            // find b_tensor (without copying data from device)
+            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
+                result.eigenvectors[iter] = node;
+            }
+            // find distances, then copy data from device
+            if ((iter = extract_i("distance_", node->name)) > -1) {
+                float d;
+                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
+                result.distances[iter] = d;
+                // std::cout << node->name << " = " << d << "\n";
+            }
+            // find tmp_square if it exists (without copying data from device)
+            if (std::string(node->name) == "tmp_square") {
+                result.calculated_square = node;
+            }
+        }
+    }
+    return res;
+}
+
+static void power_iteration(
+        const struct pca_params & params,
+        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
+        struct ggml_tensor * output) {
+    //printf("in power iteration\n");
+    struct pca_model model(input);
+
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+    struct pca_result result;
+    struct ggml_tensor * last_eigenvector = NULL;
+
+    int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
+    for (int iter = 0; iter < n_iters; ++iter) {
+        bool calc_square = (iter == 0); // only need to calculate square for first iteration
+        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
+        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
+        compute_piter(params, model, gf, allocr, result);
+
+        for (size_t k = 0; k < result.distances.size(); ++k) {
+            last_eigenvector = result.eigenvectors[k];
+            if (result.distances[k] < params.tolerance) {
+                break; // done
+            }
+        }
+
+        if (calc_square) {
+            // copy and store the square matrix if needed
+            GGML_ASSERT(result.calculated_square != NULL);
+            ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
+        }
+
+        {
+            // copy last eigen vector and store as input for next iteration
+            GGML_ASSERT(last_eigenvector != NULL);
+            ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
+        }
+
+        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
+            __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
+    }
+
+    // get output tensor
+    GGML_ASSERT(last_eigenvector);
+    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
+    //print_debug_tensor(output);
+    ggml_gallocr_free(allocr);
+}
+
+static void run_pca(
+        struct pca_params & params,
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running PCA...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%ld", il+1);
+
+        // run power_iteration
+        params.i_layer = il;
+        params.n_layers = v_input.size();
+        power_iteration(params, v_input[il], ctrl_out);
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
@@ -0,0 +1 @@
+[INST] Act like a person who is extremely happy. [/INST] 
@@ -17,9 +17,10 @@ static std::vector<std::string> split_lines(const std::string & s) {
    return lines;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
    }
 }

@@ -40,13 +41,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu

        // try to get sequence embeddings - supported only when pooling_type is not NONE
        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

        float * out = output + batch.seq_id[i][0] * n_embd;
        //TODO: I would also add a parameter here to enable normalization or not.
@@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
    if (n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

        // clear previous kv_cache values (irrelevant for embeddings)
        llama_kv_cache_clear(ctx);
+        llama_set_embeddings(ctx, true);
        llama_set_causal_attn(ctx, false);

        // run model
@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
    llama_token eos_token = llama_token_eos(mdl);

    llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);
+
    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {

    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);

-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
    llama_context * ctx = llama_new_context_with_model(mdl, cparams);

    // ### Embedding/Representation ###
@@ -223,7 +223,11 @@ int main(int argc, char ** argv) {
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-    embd_inp.push_back(llama_token_middle(model));
+
+    const llama_token middle_token = llama_token_middle(model);
+    if (middle_token >= 0) {
+        embd_inp.push_back(middle_token);
+    }

    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
@@ -528,7 +532,12 @@ int main(int argc, char ** argv) {
                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                embd_inp.push_back(llama_token_middle(model));
+
+                const llama_token middle_token = llama_token_middle(model);
+                if (middle_token >= 0) {
+                    embd_inp.push_back(middle_token);
+                }
+
                embd.clear();
                n_remain = params.n_predict;
                n_past = 0;
@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {

        messageLog += "\(text)"

-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
+        Task.detached {
+            while await llamaContext.n_cur < llamaContext.n_len {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }
+
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+
+            await llamaContext.clear()
+
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
        }
-
-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
-
-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
    }

    func bench() async {
@@ -16,41 +16,41 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
-    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
-    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
-    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 4.78G, +0.4511 ppl @ Llama-3-8B",  },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 5.21G, +0.1316 ppl @ Llama-3-8B",  },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 5.65G, +0.1062 ppl @ Llama-3-8B",  },
    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
    { "IQ2_S",  LLAMA_FTYPE_MOSTLY_IQ2_S,  " 2.5  bpw quantization",            },
    { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7  bpw quantization",            },
    { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
    { "IQ1_M",  LLAMA_FTYPE_MOSTLY_IQ1_M,  " 1.75 bpw quantization",            },
-    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.96G, +3.5199 ppl @ Llama-3-8B",  },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B",  },
    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
    { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization",            },
    { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix",        },
-    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization"   ,          },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M"                   },
+    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization",             },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B",  },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B",  },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B",  },
    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
-    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
-    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
-    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
-    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, -0.0020 ppl @ Mistral-7B", },
-    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B", },
-    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
+    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M",                  },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B",  },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B",  },
+    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M",                  },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B",  },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, +0.0020 ppl @ Mistral-7B",  },
+    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B",  },
+    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B",          },
    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
-    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
+    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing",  },
 };

 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
@@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
    return chunks;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
    }
 }

@@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
    if (n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
@@ -1594,7 +1594,7 @@ struct server_context {
                    } else {
                        std::string prompt;
                        if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
-                            json_value(task.data, "prompt", std::string());
+                            prompt = json_value(task.data, "prompt", std::string());
                        }

                        slot = get_available_slot(prompt);
@@ -2038,7 +2038,12 @@ struct server_context {
                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
                            prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
                            prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
-                            prefix_tokens.push_back(llama_token_middle(model));
+
+                            const llama_token middle_token = llama_token_middle(model);
+                            if (middle_token >= 0) {
+                                prefix_tokens.push_back(middle_token);
+                            }
+
                            prompt_tokens = prefix_tokens;
                        } else {
                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
@@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR

 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

 ::  for FP32
-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main

 ::  build all binary
-make -j
+cmake --build . -j
 if %errorlevel% neq 0 goto ERROR

 cd ..
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1717786204,
-        "narHash": "sha256-4q0s6m0GUcN7q+Y2DqD27iLvbcd1G50T2lv08kKxkSI=",
+        "lastModified": 1718318537,
+        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "051f920625ab5aabe37c920346e3e69d7d34400e",
+        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
        "type": "github"
      },
      "original": {
@@ -1172,7 +1172,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
            // check if a backend with higher prio wants to offload the op
            if (src_backend_id == sched->n_backends - 1) {
                for (int b = 0; b < src_backend_id; b++) {
-                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                        SET_CAUSE(tensor, "1.off");
                        return b;
                    }
@@ -1706,14 +1706,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
    bool backend_ids_changed = false;
    for (int i = 0; i < sched->graph->n_nodes; i++) {
-        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+            sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
            backend_ids_changed = true;
            break;
        }
    }
    if (!backend_ids_changed) {
        for (int i = 0; i < sched->graph->n_leafs; i++) {
-            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+                sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
                backend_ids_changed = true;
                break;
            }
@@ -1977,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
    return sched->n_copies;
 }

+int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+    return sched->n_backends;
+}
+
+ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+    GGML_ASSERT(i >= 0 && i < sched->n_backends);
+    return sched->backends[i];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@@ -182,6 +182,9 @@ extern "C" {
    // Initialize backend buffers from a measure graph
    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

+    GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
    // Get the number of splits of the last graph
    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
        }

        const int cc = ggml_cuda_info().devices[id].cc;
-        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
    }
    return row_rounding;
 }
@@ -2267,6 +2267,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SQR:
            ggml_cuda_op_sqr(ctx, dst);
            break;
+        case GGML_OP_SQRT:
+            ggml_cuda_op_sqrt(ctx, dst);
+            break;
        case GGML_OP_CLAMP:
            ggml_cuda_op_clamp(ctx, dst);
            break;
@@ -2830,6 +2833,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
        case GGML_OP_CLAMP:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
@@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) {
 }

 // Round rows to this value for --split-mode row:
-static int get_mmq_y_host(const int cc, const int mmq_x) {
-    return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
+static int get_mmq_y_host(const int cc) {
+    return cc >= CC_VOLTA ? 128 : 64;
 }

 //////////////////////
@@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
-            mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_1:
-            mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_0:
-            mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_1:
-            mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q8_0:
-            mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q2_K:
-            mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q3_K:
-            mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_K:
-            mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_K:
-            mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q6_K:
-            mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
            break;
        default:
            GGML_ASSERT(false);
@@ -8,6 +8,7 @@
 #include <cstdint>

 #define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
+#define MMQ_NWARPS 8

 typedef void (*load_tiles_mmq_t)(
    const char * __restrict__ x, int * __restrict__ x_qs, half2 * __restrict__ x_dm,
@@ -15,7 +16,7 @@ typedef void (*load_tiles_mmq_t)(
 typedef void (*vec_dot_mmq_t)(
    const int * __restrict__ x_qs, const half2 * __restrict__ x_dm, const int * __restrict__ x_sc,
    const int * __restrict__ y, float * __restrict__ sum, const int & k0);
-typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1);
+typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max);

 struct block_q8_1_mmq {
    half2  ds[4];
@@ -50,21 +51,17 @@ static constexpr __device__ int get_mmq_x_max_device() {

 // get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row

+static constexpr __device__ int get_mmq_y_device() {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-static constexpr __device__ int get_mmq_y_device(int mmq_x) {
-    return mmq_x >= 32 ? 128 : 64;
-}
+    return 128;
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
-static constexpr __device__ int get_mmq_y_device(int mmq_x) {
-    return mmq_x >= 32 ? 128 : 64;
-}
+    return 128;
 #else
-static constexpr __device__ int get_mmq_y_device(int /*mmq_x*/) {
    return 64;
-}
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}

 #define TILE_X_SIZES_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
 #define TILE_X_SIZES_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_1 + mmq_y/QI4_1, 0}
@@ -1734,30 +1731,34 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
 }

 template<int mmq_x, int mmq_y, int nwarps, bool need_check>
-static __device__ __forceinline__ void mmq_write_back_dp4a(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) {
+static __device__ __forceinline__ void mmq_write_back_dp4a(
+    const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) {
+
 #pragma unroll
    for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
-        const int j = blockIdx.y*mmq_x + j0 + threadIdx.y;
+        const int j = j0 + threadIdx.y;

-        if (j >= ne1) {
+        if (j > j_max) {
            return;
        }

 #pragma unroll
        for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
-            const int i = blockIdx.x*mmq_y + i0 + threadIdx.x;
+            const int i = i0 + threadIdx.x;

-            if (need_check && i >= ne0) {
+            if (need_check && i > i_max) {
                continue;
            }

-            dst[j*ne0 + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
+            dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
        }
    }
 }

 template<int mmq_x, int mmq_y, int nwarps, bool need_check>
-static __device__ __forceinline__ void mmq_write_back_mma(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) {
+static __device__ __forceinline__ void mmq_write_back_mma(
+    const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) {
+
    typedef mma_int_C_I16J8 mma_C;

    const int i0 = threadIdx.y*mma_C::I;
@@ -1769,19 +1770,19 @@ static __device__ __forceinline__ void mmq_write_back_mma(const float * __restri
    for (int j0 = 0; j0 < mmq_x; j0 += mma_C::J) {
 #pragma unroll
        for (int l = 0; l < mma_C::ne; ++l) {
-            const int j = blockIdx.y*mmq_x + j0 + mma_C::get_j(l);
+            const int j = j0 + mma_C::get_j(l);

-            if (j >= ne1) {
+            if (j > j_max) {
                continue;
            }

-            const int i = blockIdx.x*mmq_y + i0 + mma_C::get_i(l);
+            const int i = i0 + mma_C::get_i(l);

-            if (need_check && i >= ne0) {
+            if (need_check && i > i_max) {
                continue;
            }

-            dst[j*ne0 + i] = sum[(j0/mma_C::J)*mma_C::ne + l];
+            dst[j*stride + i] = sum[(j0/mma_C::J)*mma_C::ne + l];
        }
    }
 }
@@ -1896,32 +1897,16 @@ static bool mmq_need_sum(const ggml_type type_x) {
    return false;
 }

-template <ggml_type type, int mmq_x, int nwarps, bool need_check>
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#else
-#if __CUDA_ARCH__ >= CC_VOLTA
-    __launch_bounds__(WARP_SIZE*nwarps, 1)
-#else
-    __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-static __global__ void mul_mat_q(
-    const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst,
-    const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
-
-    // Skip unused template specializations for faster compilation:
-    if (mmq_x > get_mmq_x_max_device()) {
-        NO_DEVICE_CODE;
-        return;
-    }
+template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
+static __device__ void mul_mat_q_process_tile(
+    const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
+    const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0,
+    const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) {

    constexpr int              qk         = ggml_cuda_type_traits<type>::qk;
    constexpr int              qr         = ggml_cuda_type_traits<type>::qr;
    constexpr int              qi         = ggml_cuda_type_traits<type>::qi;
-    constexpr int              mmq_y      = get_mmq_y_device(mmq_x);
+    constexpr int              mmq_y      = get_mmq_y_device();
    constexpr int              vdr        = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
    constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;

@@ -1941,20 +1926,18 @@ static __global__ void mul_mat_q(
    int   * tile_x_sc = (int   *) (tile_x_dm + txs.dm);
    int   * tile_y    = (int   *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)]

-    const int blocks_per_row_x = ne00 / qk;
-    const int blocks_per_warp = WARP_SIZE / qi;
-
-    const int & ne1 = ne11;
-
-    const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1;
-
-    const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
+    constexpr int blocks_per_warp = WARP_SIZE / qi;

    float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};

-    for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
+    const int tile_x_max_i = ne01 - it*mmq_y - 1;
+    const int tile_y_max_j = ne11 - jt*mmq_x - 1;

-        load_tiles(x, tile_x_qs, tile_x_dm, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01);
+    const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
+
+    for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_warp) {
+
+        load_tiles(x, tile_x_qs, tile_x_dm, tile_x_sc, stride01*it*mmq_y + kb0, tile_x_max_i, stride01);

 #pragma unroll
        for (int kr = 0; kr < qr; ++kr) {
@@ -1977,7 +1960,176 @@ static __global__ void mul_mat_q(
        }
    }

-    write_back(sum, dst, ne0, ne1);
+    if (fixup) {
+        write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x);
+    } else {
+        write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
+    }
+}
+
+
+// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
+
+template <ggml_type type, int mmq_x, int nwarps, bool need_check>
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#else
+#if __CUDA_ARCH__ >= CC_VOLTA
+    __launch_bounds__(WARP_SIZE*nwarps, 1)
+#else
+    __launch_bounds__(WARP_SIZE*nwarps, 2)
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+static __global__ void mul_mat_q(
+    const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
+    const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
+
+    // Skip unused template specializations for faster compilation:
+    if (mmq_x > get_mmq_x_max_device()) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
+    constexpr int qk    = ggml_cuda_type_traits<type>::qk;
+    constexpr int qi    = ggml_cuda_type_traits<type>::qi;
+    constexpr int mmq_y = get_mmq_y_device();
+
+    // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
+#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
+    {
+        constexpr bool fixup = false;
+        mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
+            (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
+                blockIdx.x, blockIdx.y, 0, ne00/qk);
+        return;
+    }
+#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
+
+    const     int64_t blocks_per_ne00 = ne00 / qk;
+    constexpr int     blocks_per_warp = WARP_SIZE / qi;
+
+    const int ntx = (ne11 + mmq_x - 1) / mmq_x; // Number of tiles x
+    const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y
+
+    // kbc == k block continuous, current index in continuous ijk space.
+    int64_t       kbc      = GGML_PAD((int64_t) blockIdx.x     *blocks_per_ne00*ntx*nty / gridDim.x, blocks_per_warp);
+    const int64_t kbc_stop = GGML_PAD((int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x, blocks_per_warp);
+
+    // kb0 == k index when doing the matrix multiplication for an output tile.
+    int kb0_start = kbc % blocks_per_ne00;
+    int kb0_stop  = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
+    while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
+        const int jt =  kbc /    (blocks_per_ne00*nty);                    // j index of current tile.
+        const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile.
+
+        constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
+        mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
+            (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
+             it, jt, kb0_start, kb0_stop);
+
+        kbc += blocks_per_ne00;
+        kbc -= kbc % blocks_per_ne00;
+
+        kb0_start = 0;
+        kb0_stop  = min(blocks_per_ne00, kbc_stop - kbc);
+    }
+
+    if (kbc >= kbc_stop) {
+        return;
+    }
+
+    const int jt =  kbc /    (blocks_per_ne00*nty);
+    const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
+
+    constexpr bool fixup = true; // Last index writes it data to fixup buffer to avoid data races with other blocks.
+    mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
+        (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
+            it, jt, kb0_start, kb0_stop);
+}
+
+
+template <ggml_type type, int mmq_x, int nwarps, bool need_check>
+static __global__ void mul_mat_q_stream_k_fixup(
+    float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) {
+
+    constexpr int     mmq_y           = get_mmq_y_device();
+    constexpr int     qk              = ggml_cuda_type_traits<type>::qk;
+    constexpr int     qi              = ggml_cuda_type_traits<type>::qi;
+    constexpr int     blocks_per_warp = WARP_SIZE / qi;
+    const     int64_t blocks_per_ne00 = ne00 / qk;
+
+    float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
+
+    const int ntx = (ne11 + mmq_x - 1) / mmq_x;
+    const int nty = (ne01 + mmq_y - 1) / mmq_y;
+
+    bool any_fixup = false;
+
+    const int bidx_start = (blockIdx.y*nty + blockIdx.x)     * block_num_mmq / (gridDim.y*gridDim.x);
+    const int bidx_stop  = (blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq / (gridDim.y*gridDim.x) + 1;
+
+    for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) {
+        const int64_t kbc      = GGML_PAD((int64_t) bidx     *blocks_per_ne00*ntx*nty / block_num_mmq, blocks_per_warp);
+        const int64_t kbc_stop = GGML_PAD((int64_t)(bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq, blocks_per_warp);
+
+        // Skip fixup tile if the MMQ CUDA block never wrote anything to it:
+        if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) {
+            continue;
+        }
+
+        const int jt =  kbc_stop /    (blocks_per_ne00*nty);
+        const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
+
+        // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
+        if (it != blockIdx.x || jt != blockIdx.y) {
+            continue;
+        }
+
+        any_fixup = true;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
+            }
+        }
+    }
+
+    if (!any_fixup) {
+        return;
+    }
+
+    dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y;
+
+    const int i_max = ne01 - blockIdx.x*mmq_y - 1;
+    const int j_max = ne11 - blockIdx.y*mmq_x - 1;
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+        const int j = j0 + threadIdx.y;
+
+        if (j > j_max) {
+            return;
+        }
+
+#pragma unroll
+        for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+
+            if (need_check && i > i_max) {
+                continue;
+            }
+
+            dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
+        }
+    }
 }

 struct mmq_args {
@@ -1987,124 +2139,151 @@ struct mmq_args {
    int64_t ne0;
 };

-constexpr int mmq_get_nwarps(int mmq_x) {
-    return mmq_x >= 32 ? 8 : 4;
-}
-
 static int mmq_get_shmem(const ggml_type type, const int mmq_x, const int mmq_y) {
    const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y);
-    const int nwarps = mmq_get_nwarps(mmq_x);

    const int shmem_x = txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
    const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2);
-    return shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int));
+    return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int));
 }

-template <ggml_type type, int mmq_x, int nwarps>
-static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
+template <ggml_type type, int mmq_x>
+static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    const int id = ggml_cuda_get_device();
    const int cc = ggml_cuda_info().devices[id].cc;
-    const int mmq_y = get_mmq_y_host(cc, mmq_x);
+    const int nsm = ggml_cuda_info().devices[id].nsm;
+    const int mmq_y = get_mmq_y_host(cc);

-    const int block_num_x = (args.ne01 + mmq_y - 1) / mmq_y;
-    const int block_num_y = (args.ne11 + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+    const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1);

    const int shmem = mmq_get_shmem(type, mmq_x, mmq_y);

 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
    static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
    if (!shmem_limit_raised[id]) {
-        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, nwarps, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
-        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, nwarps, true>,  cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
+        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
+        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>,  cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
        shmem_limit_raised[id] = true;
    }
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

+    const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
+    const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
+    const dim3 block_nums_xy_tiling(nty, ntx, 1);
+
+    const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
+    if (!use_stream_k) {
+        if (args.ne01 % mmq_y == 0) {
+            constexpr bool need_check = false;
+            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
+                (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+        } else {
+            constexpr bool need_check = true;
+            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
+                (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+        }
+        return;
+    }
+
+    const dim3 block_nums_mmq(nsm, 1, 1);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
+
    if (args.ne01 % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
-            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+        constexpr bool need_check = false;
+
+        mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_mmq, block_dims, shmem, stream>>>
+            (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+
+        mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, 0, stream>>>
+            (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x);
    } else {
-        const bool need_check = true;
-        mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
-            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+        constexpr bool need_check = true;
+
+        mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_mmq, block_dims, shmem, stream>>>
+            (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+
+        mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, 0, stream>>>
+            (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x);
    }
 }

 template <ggml_type type>
-void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) {
+void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    const int id    = ggml_cuda_get_device();
    const int nsm   = ggml_cuda_info().devices[id].nsm;
    const int cc    = ggml_cuda_info().devices[id].cc;
    const int smpbo = ggml_cuda_info().devices[id].smpbo;

    const int mmq_x_max = get_mmq_x_max_host(cc);
-    const int mmq_y = get_mmq_y_host(cc, mmq_x_max);
+    const int mmq_y = get_mmq_y_host(cc);
    const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
+    const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;

    int mmq_x_best  = 0;
-    int nwaves_best = INT_MAX;
+    int nparts_best = INT_MAX;

-    for (int mmq_x = 8; mmq_x <= mmq_x_max && nwaves_best > 1; mmq_x += 8) {
-        const int block_num_x = (args.ne11 + mmq_x - 1) / mmq_x;
-        const int nwaves = (block_num_x*block_num_y + nsm - 1) / nsm;
+    for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) {
+        const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x;
+        const int nwaves_xy_tiling = ntiles_x*block_num_y;

-        if (nwaves < nwaves_best && mmq_get_shmem(type, mmq_x, mmq_y) <= smpbo) {
+        const int nparts = use_stream_k ? ntiles_x : nwaves_xy_tiling;
+
+        if (nparts < nparts_best && mmq_get_shmem(type, mmq_x, mmq_y) <= smpbo) {
            mmq_x_best  = mmq_x;
-            nwaves_best = nwaves;
+            nparts_best = nparts;
        }
    }

    switch (mmq_x_best) {
        case   8:
-            launch_mul_mat_q<type,   8, mmq_get_nwarps(  8)>(args, stream);
+            launch_mul_mat_q<type,   8>(ctx, args, stream);
            break;
        case  16:
-            launch_mul_mat_q<type,  16, mmq_get_nwarps( 16)>(args, stream);
+            launch_mul_mat_q<type,  16>(ctx, args, stream);
            break;
        case  24:
-            launch_mul_mat_q<type,  24, mmq_get_nwarps( 24)>(args, stream);
+            launch_mul_mat_q<type,  24>(ctx, args, stream);
            break;
        case  32:
-            launch_mul_mat_q<type,  32, mmq_get_nwarps( 32)>(args, stream);
+            launch_mul_mat_q<type,  32>(ctx, args, stream);
            break;
        case  40:
-            launch_mul_mat_q<type,  40, mmq_get_nwarps( 40)>(args, stream);
+            launch_mul_mat_q<type,  40>(ctx, args, stream);
            break;
        case  48:
-            launch_mul_mat_q<type,  48, mmq_get_nwarps( 48)>(args, stream);
+            launch_mul_mat_q<type,  48>(ctx, args, stream);
            break;
        case  56:
-            launch_mul_mat_q<type,  56, mmq_get_nwarps( 56)>(args, stream);
+            launch_mul_mat_q<type,  56>(ctx, args, stream);
            break;
        case  64:
-            launch_mul_mat_q<type,  64, mmq_get_nwarps( 64)>(args, stream);
+            launch_mul_mat_q<type,  64>(ctx, args, stream);
            break;
        case  72:
-            launch_mul_mat_q<type,  72, mmq_get_nwarps( 72)>(args, stream);
+            launch_mul_mat_q<type,  72>(ctx, args, stream);
            break;
        case  80:
-            launch_mul_mat_q<type,  80, mmq_get_nwarps( 80)>(args, stream);
+            launch_mul_mat_q<type,  80>(ctx, args, stream);
            break;
        case  88:
-            launch_mul_mat_q<type,  88, mmq_get_nwarps( 88)>(args, stream);
+            launch_mul_mat_q<type,  88>(ctx, args, stream);
            break;
        case  96:
-            launch_mul_mat_q<type,  96, mmq_get_nwarps( 96)>(args, stream);
+            launch_mul_mat_q<type,  96>(ctx, args, stream);
            break;
        case 104:
-            launch_mul_mat_q<type, 104, mmq_get_nwarps(104)>(args, stream);
+            launch_mul_mat_q<type, 104>(ctx, args, stream);
            break;
        case 112:
-            launch_mul_mat_q<type, 112, mmq_get_nwarps(112)>(args, stream);
+            launch_mul_mat_q<type, 112>(ctx, args, stream);
            break;
        case 120:
-            launch_mul_mat_q<type, 120, mmq_get_nwarps(120)>(args, stream);
+            launch_mul_mat_q<type, 120>(ctx, args, stream);
            break;
        case 128:
-            launch_mul_mat_q<type, 128, mmq_get_nwarps(128)>(args, stream);
+            launch_mul_mat_q<type, 128>(ctx, args, stream);
            break;
        default:
            fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
@@ -2114,7 +2293,7 @@ void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) {
 }

 #define DECL_MMQ_CASE(type)                                                        \
-    template void mul_mat_q_case<type>(const mmq_args & args, cudaStream_t stream) \
+    template void mul_mat_q_case<type>(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) \

 extern DECL_MMQ_CASE(GGML_TYPE_Q4_0);
 extern DECL_MMQ_CASE(GGML_TYPE_Q4_1);
@@ -117,7 +117,7 @@ static __global__ void mul_mat_vec_q(
            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
        }

-        if (threadIdx.x < rows_per_cuda_block) {
+        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
        }
    }
@@ -92,6 +92,15 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
    dst[i] = x[i] * x[i];
 }

+static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sqrtf(x[i]);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -142,6 +151,11 @@ static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
+    sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
@@ -284,3 +298,17 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

    sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
+
+void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
@@ -8,6 +8,7 @@
 #define CUDA_HARDSIGMOID_BLOCK_SIZE 256
 #define CUDA_HARDSWISH_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
+#define CUDA_SQRT_BLOCK_SIZE 256

 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -28,3 +29,5 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -17,7 +17,7 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))

-#if defined(_WIN32)
+#if defined(_MSC_VER)

 #define m512bh(p) p
 #define m512i(p) p
@@ -735,6 +735,12 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
 }

 static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+    for (size_t i = 0, n = 3; i < n; ++i) {
+        if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+            return false;
+        }
+    }
+
    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
@@ -73,9 +73,13 @@ struct rpc_tensor {
    uint64_t view_offs;
    uint64_t data;
    char name[GGML_MAX_NAME];
+
+    char padding[4];
 };
 #pragma pack(pop)

+static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
+
 // RPC commands
 enum rpc_cmd {
    ALLOC_BUFFER = 0,
@@ -599,9 +603,8 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
    int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
    output.resize(output_size, 0);
    memcpy(output.data(), &n_nodes, sizeof(n_nodes));
-    uint64_t * out_nodes = (uint64_t *)(output.data() + sizeof(n_nodes));
    for (uint32_t i = 0; i < n_nodes; i++) {
-        out_nodes[i] = reinterpret_cast<uint64_t>(cgraph->nodes[i]);
+        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
    }
    uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
    *out_ntensors = n_tensors;
@@ -1036,7 +1039,9 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
    }
    std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
    for (uint32_t i = 0; i < n_nodes; i++) {
-        graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map);
+        int64_t id;
+        memcpy(&id, &nodes[i], sizeof(id));
+        graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
    }
    ggml_status status = ggml_backend_graph_compute(backend, graph);
    // output serialization format: | status (1 byte) |
@@ -8,14 +8,12 @@

 #include "ggml.h"
 #include "ggml-backend.h"
+#include "ggml-sycl/presets.hpp"

 #ifdef  __cplusplus
 extern "C" {
 #endif

-#define GGML_SYCL_MAX_DEVICES       48
-#define GGML_SYCL_NAME "SYCL"
-
 // backend API
 GGML_API ggml_backend_t ggml_backend_sycl_init(int device);

@@ -33,13 +31,6 @@ GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 GGML_API GGML_CALL int   ggml_backend_sycl_get_device_count();
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
-GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
-
-// TODO: these are temporary
-//       ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
-GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
-GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
-GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();

 // SYCL doesn't support registering host memory, keep here for reference
 // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
@@ -0,0 +1,23 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_BACKEND_HPP
+#define GGML_SYCL_BACKEND_HPP
+
+#include "common.hpp"
+#include "convert.hpp"
+#include "dequantize.hpp"
+#include "dmmv.hpp"
+#include "mmq.hpp"
+#include "mmvq.hpp"
+
+#endif // GGML_SYCL_BACKEND_HPP
@@ -0,0 +1,53 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "common.hpp"
+
+int get_current_device_id() {
+  return dpct::dev_mgr::instance().current_device_id();
+}
+
+void* ggml_sycl_host_malloc(size_t size) try {
+  if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
+    return nullptr;
+  }
+
+  void* ptr = nullptr;
+  // allow to use dpct::get_in_order_queue() for host malloc
+  dpct::err0 err = CHECK_TRY_ERROR(
+      ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue()));
+
+  if (err != 0) {
+    // clear the error
+    fprintf(
+        stderr,
+        "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        size / 1024.0 / 1024.0,
+        "syclGetErrorString is not supported");
+    return nullptr;
+  }
+
+  return ptr;
+} catch (sycl::exception const& exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_sycl_host_free(void* ptr) try {
+  // allow to use dpct::get_in_order_queue() for host malloc
+  SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+} catch (sycl::exception const& exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
@@ -0,0 +1,298 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_COMMON_HPP
+#define GGML_SYCL_COMMON_HPP
+
+#include <fstream>
+#include <iostream>
+
+#include "dpct/helper.hpp"
+#include "presets.hpp"
+
+#define GGML_COMMON_DECL_SYCL
+#define GGML_COMMON_IMPL_SYCL
+#include "ggml-common.h"
+
+void* ggml_sycl_host_malloc(size_t size);
+void ggml_sycl_host_free(void* ptr);
+
+static int g_ggml_sycl_debug = 0;
+#define GGML_SYCL_DEBUG(...)        \
+  do {                              \
+    if (g_ggml_sycl_debug)          \
+      fprintf(stderr, __VA_ARGS__); \
+  } while (0)
+
+#define CHECK_TRY_ERROR(expr)                                            \
+  [&]() {                                                                \
+    try {                                                                \
+      expr;                                                              \
+      return dpct::success;                                              \
+    } catch (std::exception const& e) {                                  \
+      std::cerr << e.what() << "\nException caught at file:" << __FILE__ \
+                << ", line:" << __LINE__ << ", func:" << __func__        \
+                << std::endl;                                            \
+      return dpct::default_error;                                        \
+    }                                                                    \
+  }()
+
+// #define DEBUG_SYCL_MALLOC
+
+static int g_work_group_size = 0;
+// typedef sycl::half ggml_fp16_t;
+
+#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
+#define VER_4VEC 610 // todo for hardward optimize.
+#define VER_GEN9 700 // todo for hardward optimize.
+#define VER_GEN12 1000000 // todo for hardward optimize.
+#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize.
+
+#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares
+
+// define for XMX in Intel GPU
+// TODO: currently, it's not used for XMX really.
+#if !defined(GGML_SYCL_FORCE_MMQ)
+    #define SYCL_USE_XMX
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
+#if defined(_MSC_VER)
+#pragma warning(disable : 4244 4267) // possible loss of data
+#endif
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_SYCL_DMMV_X
+#define GGML_SYCL_DMMV_X 32
+#endif
+#ifndef GGML_SYCL_MMV_Y
+#define GGML_SYCL_MMV_Y 1
+#endif
+
+typedef sycl::queue *queue_ptr;
+
+enum ggml_sycl_backend_gpu_mode {
+  SYCL_UNSET_GPU_MODE = -1,
+  SYCL_SINGLE_GPU_MODE = 0,
+  SYCL_MUL_GPU_MODE
+};
+
+static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+static void crash() {
+  int* ptr = NULL;
+  *ptr = 0;
+}
+
+[[noreturn]] static void ggml_sycl_error(
+    const char* stmt,
+    const char* func,
+    const char* file,
+    const int line,
+    const char* msg) {
+  fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
+  fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
+  GGML_ASSERT(!"SYCL error");
+}
+
+#define SYCL_CHECK(err)                     \
+  do {                                      \
+    auto err_ = (err);                      \
+    if (err_ != 0)                          \
+      ggml_sycl_error(                      \
+          #err,                             \
+          __func__,                         \
+          __FILE__,                         \
+          __LINE__,                         \
+          "Meet error in this line code!"); \
+  } while (0)
+
+#if DPCT_COMPAT_RT_VERSION >= 11100
+#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_SYCL_ASSUME(x)
+#endif // DPCT_COMPAT_RT_VERSION >= 11100
+
+#ifdef GGML_SYCL_F16
+typedef sycl::half dfloat; // dequantize float
+typedef sycl::half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef sycl::float2 dfloat2;
+#endif // GGML_SYCL_F16
+
+#define MMVQ_MAX_BATCH_SIZE  8
+
+static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+static int g_all_sycl_device_count = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;
+
+static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
+    SYCL_UNSET_GPU_MODE;
+
+static void* g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 0; // disabled by default
+static size_t g_scratch_offset = 0;
+
+[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
+  stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
+                "current GPU architecture.\n";
+  // __trap();
+  std::exit(1);
+
+  (void)bad_arch; // suppress unused function warning
+}
+
+int get_current_device_id();
+
+inline dpct::err0 ggml_sycl_set_device(const int device) try {
+
+  int current_device_id;
+  SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
+
+  // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
+  // current_device_id=%d\n", device, current_device);
+  if (device == current_device_id) {
+    return 0;
+  }
+
+  return CHECK_TRY_ERROR(dpct::select_device(device));
+} catch (sycl::exception const& exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  crash();
+  std::exit(1);
+}
+
+//////////////////////
+
+struct ggml_sycl_device_info {
+    int device_count;
+
+    struct sycl_device_info {
+        int     cc;                 // compute capability
+        // int     nsm;                // number of streaming multiprocessors
+        // size_t  smpb;               // max. shared memory per block
+        bool    vmm;                // virtual memory support
+        size_t  total_vram;
+    };
+
+    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
+
+    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_sycl_device_info & ggml_sycl_info();
+
+struct ggml_sycl_pool {
+    virtual ~ggml_sycl_pool() = default;
+
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
+};
+
+template<typename T>
+struct ggml_sycl_pool_alloc {
+    ggml_sycl_pool * pool = nullptr;
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
+    }
+
+    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
+        alloc(size);
+    }
+
+    ~ggml_sycl_pool_alloc() {
+        if (ptr != nullptr) {
+            pool->free(ptr, actual_size);
+        }
+    }
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    T * alloc(ggml_sycl_pool & pool, size_t size) {
+        this->pool = &pool;
+        return alloc(size);
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    ggml_sycl_pool_alloc() = default;
+    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
+    ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
+};
+
+// backend interface
+
+struct ggml_tensor_extra_gpu {
+  void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
+                                       // tensors
+  dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
+                        [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+struct ggml_backend_sycl_context {
+    int device;
+    std::string name;
+
+    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
+
+    explicit ggml_backend_sycl_context(int device) :
+        device(device),
+        name(GGML_SYCL_NAME + std::to_string(device)) {
+    }
+
+    queue_ptr stream(int device, int stream) {
+        if (qptrs[device][stream] == nullptr) {
+            qptrs[device][stream] = &(dpct::get_current_device().default_queue());
+        }
+        return qptrs[device][stream];
+    }
+
+    queue_ptr stream() {
+        return stream(device, 0);
+    }
+
+    // pool
+    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
+
+    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
+
+    ggml_sycl_pool & pool(int device) {
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(stream(device,0), device);
+        }
+        return *pools[device];
+    }
+
+    ggml_sycl_pool & pool() {
+        return pool(device);
+    }
+};
+
+
+#endif // GGML_SYCL_COMMON_HPP
@@ -0,0 +1,544 @@
+#include "convert.hpp"
+#include "dequantize.hpp"
+#include "presets.hpp"
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
+                             const sycl::nd_item<3> &item_ct1) {
+    const int i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                       item_ct1.get_local_id(2));
+
+    if (i >= k) {
+        return;
+    }
+
+    const int ib = i/qk; // block index
+    const int iqs = (i%qk)/qr; // quant index
+    const int iybs = i - i%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0] = v.x();
+    y[iybs + iqs + y_offset] = v.y();
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_sycl(const void *__restrict__ vx,
+                                  dst_t *__restrict__ y, const int k,
+                                  dpct::queue_ptr stream) {
+    const int num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(
+                sycl::range<3>(1, 1, num_blocks) *
+                    sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
+                sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
+            });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
+    }
+
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
+    }
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_0(vx, y, nb32, item_ct1);
+                             });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_1(vx, y, nb32, item_ct1);
+                             });
+    }
+}
+
+
+template <typename dst_t>
+static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_K(vx, y, item_ct1);
+                             });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
+    }
+
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
+    }
+
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
+                                        dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq1_s(
+                                     vx, y, item_ct1, iq1s_grid_gpu
+                                     );
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
+                                        dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq1_m(
+                                     vx, y, item_ct1, iq1s_grid_gpu
+                                     );
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
+                                        dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_xxs(
+                                     vx, y, item_ct1, iq2xxs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
+                                       dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_xs(
+                                     vx, y, item_ct1, iq2xs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
+                                      dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_s(vx, y, item_ct1);
+                             });
+        });
+    }
+}
+
+
+template <typename dst_t>
+static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
+                                        dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq3_xxs(
+                                     vx, y, item_ct1, iq3xxs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
+                                        dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq3_s(
+                                     vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
+                                       dpct::queue_ptr stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+#if QK_K == 64
+    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
+#else
+      {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                  cgh.parallel_for(
+                      sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                            sycl::range<3>(1, 1, 32),
+                                        sycl::range<3>(1, 1, 32)),
+                      [=](sycl::nd_item<3> item_ct1) {
+                            dequantize_block_iq4_xs(vx, y, item_ct1);
+                      });
+            });
+      }
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
+                                       dpct::queue_ptr stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+      {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                  cgh.parallel_for(
+                      sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                            sycl::range<3>(1, 1, 32),
+                                        sycl::range<3>(1, 1, 32)),
+                      [=](sycl::nd_item<3> item_ct1) {
+                            dequantize_block_iq4_nl(vx, y, item_ct1);
+                      });
+            });
+      }
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    const src_t * x = (src_t *) vx;
+
+    y[i] = x[i];
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary_sycl(const void *__restrict__ vx,
+                               dst_t *__restrict__ y, const int k,
+                               dpct::queue_ptr stream) {
+    const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(
+                sycl::range<3>(1, 1, num_blocks) *
+                    sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
+                sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                convert_unary<src_t>(vx, y, k, item_ct1);
+            });
+    }
+}
+
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_sycl;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_sycl;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_sycl;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_sycl;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_sycl;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_sycl;
+        case GGML_TYPE_IQ1_M:
+            return dequantize_row_iq1_m_sycl;
+        case GGML_TYPE_IQ2_XXS:
+            return dequantize_row_iq2_xxs_sycl;
+        case GGML_TYPE_IQ2_XS:
+            return dequantize_row_iq2_xs_sycl;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_sycl;
+        case GGML_TYPE_IQ3_XXS:
+            return dequantize_row_iq3_xxs_sycl;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_sycl;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_sycl;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_sycl;
+        case GGML_TYPE_F32:
+            return convert_unary_sycl<float>;
+        default:
+            return nullptr;
+    }
+}
+
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_row_q4_0_sycl;
+        case GGML_TYPE_Q4_1:
+            return dequantize_row_q4_1_sycl;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_sycl;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_sycl;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_sycl;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_sycl;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_sycl;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_sycl;
+        case GGML_TYPE_IQ1_M:
+            return dequantize_row_iq1_m_sycl;
+        case GGML_TYPE_IQ2_XXS:
+            return dequantize_row_iq2_xxs_sycl;
+        case GGML_TYPE_IQ2_XS:
+            return dequantize_row_iq2_xs_sycl;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_sycl;
+        case GGML_TYPE_IQ3_XXS:
+            return dequantize_row_iq3_xxs_sycl;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_sycl;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_sycl;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_sycl;
+        case GGML_TYPE_F16:
+            return convert_unary_sycl<sycl::half>;
+        default:
+            return nullptr;
+    }
+}
@@ -0,0 +1,27 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_CONVERT_HPP
+#define GGML_SYCL_CONVERT_HPP
+
+#include "common.hpp"
+
+template <typename T>
+using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
+                             int k, dpct::queue_ptr stream);
+typedef to_t_sycl_t<float> to_fp32_sycl_t;
+typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
+
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
+
+#endif // GGML_SYCL_CONVERT_HPP
@@ -0,0 +1,690 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_DEQUANTIZE_HPP
+#define GGML_SYCL_DEQUANTIZE_HPP
+
+#include "common.hpp"
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+
+static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_SYCL_F16
+    // v = v - {8.0f, 8.0f};
+    // v = v * {d, d};
+    v.s0() = (v.s0() - 8.0f) * d;
+    v.s1() = (v.s1() - 8.0f) * d;
+
+#else
+    v.x() = (v.x() - 8.0f) * d;
+    v.y() = (v.y() - 8.0f) * d;
+#endif // GGML_SYCL_F16
+}
+
+static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = x[ib].dm[0];
+    const dfloat m = x[ib].dm[1];
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    // v = v + {m, m};
+    v.s0() = (v.s0() * d) + m;
+    v.s1() = (v.s1() * d) + m;
+
+#else
+    v.x() = (v.x() * d) + m;
+    v.y() = (v.y() * d) + m;
+#endif // GGML_SYCL_F16
+}
+
+static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_SYCL_F16
+    // v = v - {16.0f, 16.0f};
+    // v = v * {d, d};
+    v.s0() = (v.s0() - 16.0f) * d;
+    v.s1() = (v.s1() - 16.0f) * d;
+
+#else
+    v.x() = (v.x() - 16.0f) * d;
+    v.y() = (v.y() - 16.0f) * d;
+#endif // GGML_SYCL_F16
+}
+
+static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = x[ib].dm[0];
+    const dfloat m = x[ib].dm[1];
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    // v = v + {m, m};
+    v.s0() = (v.s0() * d) + m;
+    v.s1() = (v.s1() * d) + m;
+#else
+    v.x() = (v.x() * d) + m;
+    v.y() = (v.y() * d) + m;
+#endif // GGML_SYCL_F16
+}
+
+static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x() = x[ib].qs[iqs + 0];
+    v.y() = x[ib].qs[iqs + 1];
+
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    v.s0() *= d;
+    v.s1() *= d;
+#else
+    v.x() *= d;
+    v.y() *= d;
+#endif // GGML_SYCL_F16
+}
+
+template<typename dst_t>
+static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+
+    // assume 32 threads
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+    const float d = sycl::vec<sycl::half, 1>(x->d)
+                        .convert<float, sycl::rounding_mode::automatic>()[0];
+    const float dm = -8*d;
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d * (q[l] & 0xF) + dm;
+        y[l+16] = d * (q[l] >>  4) + dm;
+    }
+}
+
+template<typename dst_t>
+static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+
+    // assume 32 threads
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+    const sycl::float2 d =
+        x->dm.convert<float, sycl::rounding_mode::automatic>();
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l + 0] = d.x() * (q[l] & 0xF) + d.y();
+        y[l + 16] = d.x() * (q[l] >> 4) + d.y();
+    }
+}
+
+
+//================================== k-quants
+
+template<typename dst_t>
+static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_q2_K * x = (const block_q2_K *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int n   = tid/32;
+    const int l   = tid - 32*n;
+    const int is  = 8*n + l/16;
+
+    const uint8_t q = x[i].qs[32*n + l];
+    dst_t * y = yy + i*QK_K + 128*n;
+
+    float dall = x[i].dm[0];
+    float dmin = x[i].dm[1];
+    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
+    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
+    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+#else
+    const int is = tid/16;  // 0 or 1
+    const int il = tid%16;  // 0...15
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    dst_t * y = yy + i*QK_K + 16*is + il;
+
+    float dall = x[i].dm[0];
+    float dmin = x[i].dm[1];
+    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_q3_K * x = (const block_q3_K *) vx;
+
+#if QK_K == 256
+    const int r = item_ct1.get_local_id(2) / 4;
+    const int tid = r/2;
+    const int is0 = r%2;
+    const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
+    const int n = tid / 4;
+    const int j = tid - 4*n;
+
+    uint8_t m = 1 << (4*n + j);
+    int is = 8*n + 2*j + is0;
+    int shift = 2*j;
+
+    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
+                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
+                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
+                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
+    float d_all = x[i].d;
+    float dl = d_all * (us - 32);
+
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
+    const uint8_t * q = x[i].qs + 32*n;
+    const uint8_t * hm = x[i].hmask;
+
+    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+#else
+    const int tid = item_ct1.get_local_id(2);
+    const int is  = tid/16;  // 0 or 1
+    const int il  = tid%16;  // 0...15
+    const int im  = il/8;    // 0...1
+    const int in  = il%8;    // 0...7
+
+    dst_t * y = yy + i*QK_K + 16*is + il;
+
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    const uint8_t h = x[i].hmask[in] >> (2*is + im);
+    const float   d = (float)x[i].d;
+
+    if (is == 0) {
+        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    } else {
+        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    }
+#endif
+
+}
+
+#if QK_K == 256
+static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
+    if (j < 4) {
+        d = q[j] & 63; m = q[j + 4] & 63;
+    } else {
+        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+#endif
+
+template<typename dst_t>
+static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q4_K * x = (const block_q4_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 32 threads
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int is  = 2*il;
+    const int n   = 4;
+
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+
+    const float dall = x[i].dm[0];
+    const float dmin = x[i].dm[1];
+
+    const uint8_t * q = x[i].qs + 32*il + n*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+    for (int l = 0; l < n; ++l) {
+        y[l + 0] = d1 * (q[l] & 0xF) - m1;
+        y[l +32] = d2 * (q[l] >>  4) - m2;
+    }
+#else
+    const int tid = item_ct1.get_local_id(2);
+    const uint8_t * q = x[i].qs;
+    dst_t * y = yy + i*QK_K;
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
+    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
+    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q5_K * x = (const block_q5_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 64 threads - this is very slightly better than the one below
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/16;   // il is in 0...3
+    const int ir  = tid%16;   // ir is in 0...15
+    const int is  = 2*il;     // is is in 0...6
+
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
+
+    const float dall = x[i].dm[0];
+    const float dmin = x[i].dm[1];
+
+    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
+    const uint8_t * qh = x[i].qh + 2*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+
+    uint8_t   hm  = 1 << (2*il);
+    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;
+    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+    const int tid = item_ct1.get_local_id(2);
+    const uint8_t q = x[i].qs[tid];
+    const int im = tid/8;  // 0...3
+    const int in = tid%8;  // 0...7
+    const int is = tid/16; // 0 or 1
+    const uint8_t h = x[i].qh[in] >> im;
+    const float d = x[i].d;
+    dst_t * y = yy + i*QK_K + tid;
+    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
+    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q6_K * x = (const block_q6_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+#if QK_K == 256
+
+    // assume 64 threads - this is very slightly better than the one below
+    const int tid = item_ct1.get_local_id(2);
+    const int ip  = tid/32;   // ip is 0 or 1
+    const int il  = tid - 32*ip; // 0...32
+    const int is  = 8*ip + il/16;
+
+    dst_t * y = yy + i*QK_K + 128*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t * ql = x[i].ql + 64*ip + il;
+    const uint8_t   qh = x[i].qh[32*ip + il];
+    const int8_t  * sc = x[i].scales + is;
+
+    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
+    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
+#else
+
+    // assume 32 threads
+    const int tid = item_ct1.get_local_id(2);
+    const int ip  = tid/16;         // 0 or 1
+    const int il  = tid - 16*ip;    // 0...15
+
+    dst_t * y = yy + i*QK_K + 16*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t   ql = x[i].ql[16*ip + il];
+    const uint8_t   qh = x[i].qh[il] >> (2*ip);
+    const int8_t  * sc = x[i].scales;
+
+    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                     const sycl::nd_item<3> &item_ct1,
+                                     const uint64_t *iq2xxs_grid_ptr,
+                                     const uint8_t *ksigns_iq2xs_ptr,
+                                     const uint8_t *kmask_iq2xs_ptr) {
+
+    const int i = item_ct1.get_group(2);
+    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * q2 = x[i].qs + 4*ib;
+    const uint8_t  * aux8 = (const uint8_t *)q2;
+    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]);
+    const uint32_t aux32 = q2[2] | (q2[3] << 16);
+    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
+#else
+    assert(false);
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                    const sycl::nd_item<3> &item_ct1,
+                                    const uint64_t *iq2xs_grid,
+                                    const uint8_t *ksigns_iq2xs,
+                                    const uint8_t *kmask_iq2xs) {
+
+    const int i = item_ct1.get_group(2);
+    const block_iq2_xs * x = (const block_iq2_xs *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * q2 = x[i].qs + 4*ib;
+    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                       const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_iq2_s * x = (const block_iq2_s *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
+#pragma unroll
+    for (int j = 0; j < 8; ++j)
+        y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+#else
+    assert(false);
+
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                     const sycl::nd_item<3> &item_ct1,
+                                     const uint32_t *iq3xxs_grid,
+                                     const uint8_t *ksigns_iq2xs,
+                                     const uint8_t *kmask_iq2xs) {
+
+    const int i = item_ct1.get_group(2);
+    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t  * q3 = x[i].qs + 8*ib;
+    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
+    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
+    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
+    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                       const sycl::nd_item<3> &item_ct1,
+                       const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
+
+    const int i = item_ct1.get_group(2);
+    const block_iq3_s * x = (const block_iq3_s *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * qs = x[i].qs + 8*ib;
+    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
+    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
+    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
+    const uint8_t signs = x[i].signs[4*ib + il];
+#pragma unroll
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                       const sycl::nd_item<3> &item_ct1,
+                       const uint32_t *iq1s_grid_gpu) {
+
+    const int i = item_ct1.get_group(2);
+    const block_iq1_s * x = (const block_iq1_s  *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+#pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                       const sycl::nd_item<3> &item_ct1,
+                       const uint32_t *iq1s_grid_gpu) {
+
+    const int i = item_ct1.get_group(2);
+    const block_iq1_m * x = (const block_iq1_m  *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * sc = (const uint16_t *)x[i].scales;
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+    const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
+    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+#pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                        const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+    const int tid = item_ct1.get_local_id(2);
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[ib].qs + 4*il;
+    const float d = (float)x[ib].d;
+#pragma unroll
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+
+}
+
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                        const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_group(2);
+    const block_iq4_xs * x = (const block_iq4_xs *)vx;
+
+    const int tid = item_ct1.get_local_id(2);
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
+    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
+#pragma unroll
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+}
+
+
+#endif // GGML_SYCL_DEQUANTIZE_HPP
@@ -0,0 +1,27 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_DMMV_HPP
+#define GGML_SYCL_DMMV_HPP
+
+#include "common.hpp"
+
+
+void ggml_sycl_op_dequantize_mul_mat_vec(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream);
+
+#endif // GGML_SYCL_DMMV_HPP
@@ -0,0 +1,33 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_MMQ_HPP
+#define GGML_SYCL_MMQ_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_mul_mat_q(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor* src0,
+    const ggml_tensor* src1,
+    ggml_tensor* dst,
+    const char* src0_dd_i,
+    const float* src1_ddf_i,
+    const char* src1_ddq_i,
+    float* dst_dd_i,
+    const int64_t row_low,
+    const int64_t row_high,
+    const int64_t src1_ncols,
+    const int64_t src1_padded_row_size,
+    const dpct::queue_ptr& stream);
+
+#endif // GGML_SYCL_MMQ_HPP
@@ -0,0 +1,27 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_MMVQ_HPP
+#define GGML_SYCL_MMVQ_HPP
+
+#include "common.hpp"
+
+
+void ggml_sycl_op_mul_mat_vec_q(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream);
+
+#endif // GGML_SYCL_MMVQ_HPP
@@ -0,0 +1,67 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_PRESETS_HPP
+#define GGML_SYCL_PRESETS_HPP
+
+#define GGML_SYCL_MAX_STREAMS       8
+#define GGML_SYCL_MAX_BUFFERS       256
+#define GGML_SYCL_MAX_DEVICES       48
+#define GGML_SYCL_NAME "SYCL"
+
+#define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#define SYCL_GELU_BLOCK_SIZE 256
+#define SYCL_SILU_BLOCK_SIZE 256
+#define SYCL_TANH_BLOCK_SIZE 256
+#define SYCL_RELU_BLOCK_SIZE 256
+#define SYCL_HARDSIGMOID_BLOCK_SIZE 256
+#define SYCL_HARDSWISH_BLOCK_SIZE 256
+#define SYCL_SQR_BLOCK_SIZE 256
+#define SYCL_CPY_BLOCK_SIZE 32
+#define SYCL_SCALE_BLOCK_SIZE 256
+#define SYCL_CLAMP_BLOCK_SIZE 256
+#define SYCL_ROPE_BLOCK_SIZE 256
+#define SYCL_ALIBI_BLOCK_SIZE 32
+#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
+#define SYCL_QUANTIZE_BLOCK_SIZE 256
+#define SYCL_DEQUANTIZE_BLOCK_SIZE 256
+#define SYCL_GET_ROWS_BLOCK_SIZE 256
+#define SYCL_UPSCALE_BLOCK_SIZE 256
+#define SYCL_CONCAT_BLOCK_SIZE 256
+#define SYCL_PAD_BLOCK_SIZE 256
+#define SYCL_ACC_BLOCK_SIZE 256
+#define SYCL_IM2COL_BLOCK_SIZE 256
+#define SYCL_POOL2D_BLOCK_SIZE 256
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_SYCL_DMMV_X
+#define GGML_SYCL_DMMV_X 32
+#endif
+#ifndef GGML_SYCL_MMV_Y
+#define GGML_SYCL_MMV_Y 1
+#endif
+
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
+#ifndef GGML_SYCL_PEER_MAX_BATCH_SIZE
+#define GGML_SYCL_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
+
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#endif // GGML_SYCL_PRESETS_HPP
@@ -1753,9 +1753,8 @@ struct ggml_compute_state_shared {
    int n_threads;

    // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
+    atomic_int n_barrier;
+    atomic_int n_barrier_passed;

    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
    void* abort_callback_data;
@@ -18972,47 +18971,49 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
    return n_tasks;
 }

-static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_node_n = * node_n;
+#ifdef GGML_USE_OPENMP
+static void ggml_barrier(struct ggml_compute_state * state) {
+    if (state->shared->n_threads == 1) {
+        return;
+    }

-    while (true) {
-        if (do_yield) {
+    #pragma omp barrier
+}
+#else
+static void ggml_barrier(struct ggml_compute_state * state) {
+    if (state->shared->n_threads == 1) {
+        return;
+    }
+
+    atomic_int * n_barrier = &state->shared->n_barrier;
+    atomic_int * n_barrier_passed = &state->shared->n_barrier_passed;
+
+    int n_threads = state->shared->n_threads;
+    int passed_old = atomic_load(n_barrier_passed);
+
+    if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
+        // last thread
+        atomic_store(n_barrier, 0);
+        atomic_fetch_add(n_barrier_passed, 1);
+    } else {
+        // wait for other threads
+        //while (atomic_load(n_barrier_passed) == passed_old) {
+        //}
+        const int n_spin_before_sleep = 100000;
+        while (true) {
+            for (int i = 0; i < n_spin_before_sleep; i++) {
+                if (atomic_load(n_barrier_passed) != passed_old) {
+                    return;
+                }
+            #if defined(__SSE3__)
+                _mm_pause();
+            #endif
+            }
            sched_yield();
        }
-
-        *node_n = atomic_load(&state->shared->node_n);
-        if (*node_n != last_node_n) {
-            break;
-        }
-
-#if defined(__SSE3__)
-        // Tell the processor we're spinning.  It's a processor hint for spinlocks.
-        _mm_pause();
-#endif
    }
 }
-
-static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_task_phase = *task_phase;
-
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
-
-        *task_phase = atomic_load(&state->shared->node_task);
-        if (*task_phase != last_task_phase) {
-            break;
-        }
-
-#if defined(__SSE3__)
-        // Tell the processor we're spinning.  It's a processor hint for spinlocks.
-        _mm_pause();
 #endif
-    }
-}

 static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
@@ -19020,136 +19021,54 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    const struct ggml_cgraph * cgraph = state->shared->cgraph;
    const struct ggml_cplan  * cplan  = state->shared->cplan;

-    const int   n_threads   = state->shared->n_threads;
+    const int ith       = state->ith;
+    const int n_threads = state->shared->n_threads;

-    set_numa_thread_affinity(state->ith);
+    set_numa_thread_affinity(ith);

-    int node_n     = -1;
-    int task_phase = GGML_TASK_TYPE_FINALIZE;
+    struct ggml_compute_params params = {
+        /*.type  =*/ GGML_TASK_TYPE_INIT,
+        /*.ith   =*/ ith,
+        /*.nth   =*/ state->shared->n_threads,
+        /*.wsize =*/ cplan->work_size,
+        /*.wdata =*/ cplan->work_data,
+    };

-    while (true) {
+    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->shared->node_n += 1;
            state->ec = GGML_STATUS_ABORTED;
            return 0;
        }

-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            // all other threads are finished and spinning
-            // do finalize and init here so we don't have synchronize again
-            struct ggml_compute_params params = {
-                /*.type  =*/ GGML_TASK_TYPE_FINALIZE,
-                /*.ith   =*/ 0,
-                /*.nth   =*/ 0,
-                /*.wsize =*/ cplan->work_size,
-                /*.wdata =*/ cplan->work_data,
-            };
-
-            if (node_n != -1) {
-                /* FINALIZE */
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                    ggml_compute_forward(&params, node, state);
-                }
-                ggml_graph_compute_perf_stats_node(node, state->shared);
-            }
-
-            // distribute new work or execute it direct if 1T
-            while (++node_n < cgraph->n_nodes) {
-                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-
-                state->shared->perf_node_start_cycles  = ggml_perf_cycles();
-                state->shared->perf_node_start_time_us = ggml_perf_time_us();
-
-                params.nth = n_tasks;
-
-                if (n_tasks == 1) {
-                    /* INIT */
-                    if (GGML_OP_HAS_INIT[node->op]) {
-                        params.type = GGML_TASK_TYPE_INIT;
-                        ggml_compute_forward(&params, node, state);
-                    }
-
-                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
-                    // they do something more efficient than spinning (?)
-                    params.type = GGML_TASK_TYPE_COMPUTE;
-                    ggml_compute_forward(&params, node, state);
-
-                    if (GGML_OP_HAS_FINALIZE[node->op]) {
-                        params.type = GGML_TASK_TYPE_FINALIZE;
-                        ggml_compute_forward(&params, node, state);
-                    }
-
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
-                } else {
-                    break;
-                }
-
-                if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-                    break;
-                }
-            }
-
-            task_phase = GGML_TASK_TYPE_INIT;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_n,    node_n);
-            atomic_store(&state->shared->node_task, task_phase);
-        } else {
-            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
-        }
-
-        // check if we should stop
-        if (node_n >= cgraph->n_nodes) break;
-
-        /* INIT & COMPUTE */
        struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);

-        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_TYPE_INIT,
-            /*.ith   =*/ state->ith,
-            /*.nth   =*/ n_tasks,
-            /*.wsize =*/ cplan->work_size,
-            /*.wdata =*/ cplan->work_data,
-        };
+        params.nth = n_tasks;

-        if (state->ith < n_tasks) {
-            if (GGML_OP_HAS_INIT[node->op]) {
+        /* INIT */
+        if (GGML_OP_HAS_INIT[node->op]) {
+            if (ith < n_tasks) {
+                params.type = GGML_TASK_TYPE_INIT;
                ggml_compute_forward(&params, node, state);
            }
+            ggml_barrier(state);
        }

-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_COMPUTE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-            //       depending on the workload and the operating system.
-            //       since it is not clear what is the best approach, it should potentially become user-configurable
-            //       ref: https://github.com/ggerganov/ggml/issues/291
-            // UPD:  adding the do_yield flag seems to resolve the issue universally
-            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
-            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
-        }
-
-        if (state->ith < n_tasks) {
+        /* COMPUTE */
+        if (ith < n_tasks) {
            params.type = GGML_TASK_TYPE_COMPUTE;
            ggml_compute_forward(&params, node, state);
        }

-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_FINALIZE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        ggml_barrier(state);
+
+        /* FINALIZE */
+        if (GGML_OP_HAS_FINALIZE[node->op]) {
+            if (params.ith == 0) {
+                params.type = GGML_TASK_TYPE_FINALIZE;
+                ggml_compute_forward(&params, node, state);
+            }
+            ggml_barrier(state);
        }
    }

@@ -19336,7 +19255,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
                // update the number of threads from the actual number of threads that we got from OpenMP
                n_threads = omp_get_num_threads();
                workers[0].shared->n_threads = n_threads;
-                workers[0].shared->n_active  = n_threads;
            }
            ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
        }
@@ -19399,9 +19317,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
        /*.perf_node_start_cycles  =*/ 0,
        /*.perf_node_start_time_us =*/ 0,
        /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-        /*.node_task               =*/ GGML_TASK_TYPE_FINALIZE,
+        /*.n_barrier               =*/ 0,
+        /*.n_barrier_passed        =*/ 0,
        /*.abort_callback          =*/ NULL,
        /*.abort_callback_data     =*/ NULL,
        /*.current_chunk;          =*/ 0,
@@ -312,6 +312,12 @@
    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)

+#define GGML_TENSOR_BINARY_OP_LOCALS01 \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -33,21 +33,22 @@ class Keys:
        FILE_TYPE            = "general.file_type"

    class LLM:
-        VOCAB_SIZE                 = "{arch}.vocab_size"
-        CONTEXT_LENGTH             = "{arch}.context_length"
-        EMBEDDING_LENGTH           = "{arch}.embedding_length"
-        BLOCK_COUNT                = "{arch}.block_count"
-        LEADING_DENSE_BLOCK_COUNT  = "{arch}.leading_dense_block_count"
-        FEED_FORWARD_LENGTH        = "{arch}.feed_forward_length"
-        EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
-        USE_PARALLEL_RESIDUAL      = "{arch}.use_parallel_residual"
-        TENSOR_DATA_LAYOUT         = "{arch}.tensor_data_layout"
-        EXPERT_COUNT               = "{arch}.expert_count"
-        EXPERT_USED_COUNT          = "{arch}.expert_used_count"
-        EXPERT_SHARED_COUNT        = "{arch}.expert_shared_count"
-        EXPERT_WEIGHTS_SCALE       = "{arch}.expert_weights_scale"
-        POOLING_TYPE               = "{arch}.pooling_type"
-        LOGIT_SCALE                = "{arch}.logit_scale"
+        VOCAB_SIZE                        = "{arch}.vocab_size"
+        CONTEXT_LENGTH                    = "{arch}.context_length"
+        EMBEDDING_LENGTH                  = "{arch}.embedding_length"
+        BLOCK_COUNT                       = "{arch}.block_count"
+        LEADING_DENSE_BLOCK_COUNT         = "{arch}.leading_dense_block_count"
+        FEED_FORWARD_LENGTH               = "{arch}.feed_forward_length"
+        EXPERT_FEED_FORWARD_LENGTH        = "{arch}.expert_feed_forward_length"
+        EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
+        USE_PARALLEL_RESIDUAL             = "{arch}.use_parallel_residual"
+        TENSOR_DATA_LAYOUT                = "{arch}.tensor_data_layout"
+        EXPERT_COUNT                      = "{arch}.expert_count"
+        EXPERT_USED_COUNT                 = "{arch}.expert_used_count"
+        EXPERT_SHARED_COUNT               = "{arch}.expert_shared_count"
+        EXPERT_WEIGHTS_SCALE              = "{arch}.expert_weights_scale"
+        POOLING_TYPE                      = "{arch}.pooling_type"
+        LOGIT_SCALE                       = "{arch}.logit_scale"

    class Attention:
        HEAD_COUNT        = "{arch}.attention.head_count"
@@ -394,6 +394,9 @@ class GGUFWriter:
    def add_expert_feed_forward_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

+    def add_expert_shared_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
    def add_parallel_residual(self, use: bool) -> None:
        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

@@ -14,7 +14,7 @@ import numpy as np
 if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
    sys.path.insert(0, str(Path(__file__).parent.parent))

-from gguf import GGUFReader, GGUFValueType  # noqa: E402
+from gguf import GGUFReader, GGUFValueType, ReaderTensor  # noqa: E402

 logger = logging.getLogger("gguf-dump")

@@ -101,25 +101,285 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
    json.dump(result, sys.stdout)


+def markdown_table_with_alignment_support(header_map: list[dict[str, str]], data: list[dict[str, Any]]):
+    # JSON to Markdown table formatting: https://stackoverflow.com/a/72983854/2850957
+
+    # Alignment Utility Function
+    def strAlign(padding: int, alignMode: str | None, strVal: str):
+        if alignMode == 'center':
+            return strVal.center(padding)
+        elif alignMode == 'right':
+            return strVal.rjust(padding - 1) + ' '
+        elif alignMode == 'left':
+            return ' ' + strVal.ljust(padding - 1)
+        else: # default left
+            return ' ' + strVal.ljust(padding - 1)
+
+    def dashAlign(padding: int, alignMode: str | None):
+        if alignMode == 'center':
+            return ':' + '-' * (padding - 2) + ':'
+        elif alignMode == 'right':
+            return '-' * (padding - 1) + ':'
+        elif alignMode == 'left':
+            return ':' + '-' * (padding - 1)
+        else: # default left
+            return '-' * (padding)
+
+    # Calculate Padding For Each Column Based On Header and Data Length
+    rowsPadding = {}
+    for index, columnEntry in enumerate(header_map):
+        padCount = max([len(str(v)) for d in data for k, v in d.items() if k == columnEntry['key_name']], default=0) + 2
+        headerPadCount = len(columnEntry['header_name']) + 2
+        rowsPadding[index] = headerPadCount if padCount <= headerPadCount else padCount
+
+    # Render Markdown Header
+    rows = []
+    rows.append('|'.join(strAlign(rowsPadding[index], columnEntry.get('align'), str(columnEntry['header_name'])) for index, columnEntry in enumerate(header_map)))
+    rows.append('|'.join(dashAlign(rowsPadding[index], columnEntry.get('align')) for index, columnEntry in enumerate(header_map)))
+
+    # Render Tabular Data
+    for item in data:
+        rows.append('|'.join(strAlign(rowsPadding[index], columnEntry.get('align'), str(item[columnEntry['key_name']])) for index, columnEntry in enumerate(header_map)))
+
+    # Convert Tabular String Rows Into String
+    tableString = ""
+    for row in rows:
+        tableString += f'|{row}|\n'
+
+    return tableString
+
+
+def element_count_rounded_notation(count: int) -> str:
+    if count > 1e15 :
+        # Quadrillion
+        scaled_amount = count * 1e-15
+        scale_suffix = "Q"
+    elif count > 1e12 :
+        # Trillions
+        scaled_amount = count * 1e-12
+        scale_suffix = "T"
+    elif count > 1e9 :
+        # Billions
+        scaled_amount = count * 1e-9
+        scale_suffix = "B"
+    elif count > 1e6 :
+        # Millions
+        scaled_amount = count * 1e-6
+        scale_suffix = "M"
+    elif count > 1e3 :
+        # Thousands
+        scaled_amount = count * 1e-3
+        scale_suffix = "K"
+    else:
+        # Under Thousands
+        scaled_amount = count
+        scale_suffix = ""
+    return f"{'~' if count > 1e3 else ''}{round(scaled_amount)}{scale_suffix}"
+
+
+def translate_tensor_name(name):
+    words = name.split(".")
+
+    # Source: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#standardized-tensor-names
+    abbreviation_dictionary = {
+        'token_embd': 'Token embedding',
+        'pos_embd': 'Position embedding',
+        'output_norm': 'Output normalization',
+        'output': 'Output',
+        'attn_norm': 'Attention normalization',
+        'attn_norm_2': 'Attention normalization',
+        'attn_qkv': 'Attention query-key-value',
+        'attn_q': 'Attention query',
+        'attn_k': 'Attention key',
+        'attn_v': 'Attention value',
+        'attn_output': 'Attention output',
+        'ffn_norm': 'Feed-forward network normalization',
+        'ffn_up': 'Feed-forward network "up"',
+        'ffn_gate': 'Feed-forward network "gate"',
+        'ffn_down': 'Feed-forward network "down"',
+        'ffn_gate_inp': 'Expert-routing layer for the Feed-forward network in Mixture of Expert models',
+        'ffn_gate_exp': 'Feed-forward network "gate" layer per expert in Mixture of Expert models',
+        'ffn_down_exp': 'Feed-forward network "down" layer per expert in Mixture of Expert models',
+        'ffn_up_exp': 'Feed-forward network "up" layer per expert in Mixture of Expert models',
+        'ssm_in': 'State space model input projections',
+        'ssm_conv1d': 'State space model rolling/shift',
+        'ssm_x': 'State space model selective parametrization',
+        'ssm_a': 'State space model state compression',
+        'ssm_d': 'State space model skip connection',
+        'ssm_dt': 'State space model time step',
+        'ssm_out': 'State space model output projection',
+        'blk': 'Block'
+    }
+
+    expanded_words = []
+    for word in words:
+        word_norm = word.strip().lower()
+        if word_norm in abbreviation_dictionary:
+            expanded_words.append(abbreviation_dictionary[word_norm].title())
+        else:
+            expanded_words.append(word.title())
+
+    return ' '.join(expanded_words)
+
+
+def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
+    host_endian, file_endian = get_file_host_endian(reader)
+    markdown_content = ""
+    markdown_content += f'# {args.model} - GGUF Internal File Dump\n\n'
+    markdown_content += f'- Endian: {file_endian} endian\n'
+    markdown_content += '\n'
+    markdown_content += '## Key Value Metadata Store\n\n'
+    markdown_content += f'There are {len(reader.fields)} key-value pairs in this file\n'
+    markdown_content += '\n'
+
+    kv_dump_table: list[dict[str, str | int]] = []
+    for n, field in enumerate(reader.fields.values(), 1):
+        if not field.types:
+            pretty_type = 'N/A'
+        elif field.types[0] == GGUFValueType.ARRAY:
+            nest_count = len(field.types) - 1
+            pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
+        else:
+            pretty_type = str(field.types[-1].name)
+
+        total_elements = len(field.data)
+        value = ""
+        if len(field.types) == 1:
+            curr_type = field.types[0]
+            if curr_type == GGUFValueType.STRING:
+                value = repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60])
+            elif curr_type in reader.gguf_scalar_to_np:
+                value = str(field.parts[-1][0])
+        else:
+            if field.types[0] == GGUFValueType.ARRAY:
+                curr_type = field.types[1]
+                if curr_type == GGUFValueType.STRING:
+                    render_element = min(5, total_elements)
+                    for element_pos in range(render_element):
+                        value += repr(str(bytes(field.parts[-1 - element_pos]), encoding='utf-8')[:5]) + (", " if total_elements > 1 else "")
+                elif curr_type in reader.gguf_scalar_to_np:
+                    render_element = min(7, total_elements)
+                    for element_pos in range(render_element):
+                        value += str(field.parts[-1 - element_pos][0]) + (", " if total_elements > 1 else "")
+                value = f'[ {value}{" ..." if total_elements > 1 else ""} ]'
+        kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value})
+
+    kv_dump_table_header_map = [
+        {'key_name':'n',                'header_name':'POS',      'align':'right'},
+        {'key_name':'pretty_type',      'header_name':'TYPE',     'align':'left'},
+        {'key_name':'total_elements',   'header_name':'Count',    'align':'right'},
+        {'key_name':'field_name',       'header_name':'Key',      'align':'left'},
+        {'key_name':'value',            'header_name':'Value',    'align':'left'},
+    ]
+
+    markdown_content += markdown_table_with_alignment_support(kv_dump_table_header_map, kv_dump_table)
+
+    markdown_content += "\n"
+
+    if not args.no_tensors:
+        # Group tensors by their prefix and maintain order
+        tensor_prefix_order: list[str] = []
+        tensor_name_to_key: dict[str, int] = {}
+        tensor_groups: dict[str, list[ReaderTensor]] = {}
+        total_elements = sum(tensor.n_elements for tensor in reader.tensors)
+
+        # Parsing Tensors Record
+        for key, tensor in enumerate(reader.tensors):
+            tensor_components = tensor.name.split('.')
+
+            # Classify Tensor Group
+            tensor_group_name = "base"
+            if tensor_components[0] == 'blk':
+                tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}"
+
+            # Check if new Tensor Group
+            if tensor_group_name not in tensor_groups:
+                tensor_groups[tensor_group_name] = []
+                tensor_prefix_order.append(tensor_group_name)
+
+            # Record Tensor and Tensor Position
+            tensor_groups[tensor_group_name].append(tensor)
+            tensor_name_to_key[tensor.name] = key
+
+        # Tensors Mapping Dump
+        markdown_content += f'## Tensors Overview {element_count_rounded_notation(total_elements)} Elements\n\n'
+        markdown_content += f'Total number of elements in all tensors: {total_elements} Elements\n'
+        markdown_content += '\n'
+
+        for group in tensor_prefix_order:
+            tensors = tensor_groups[group]
+            group_elements = sum(tensor.n_elements for tensor in tensors)
+            markdown_content += f"- [{translate_tensor_name(group)} Tensor Group - {element_count_rounded_notation(group_elements)} Elements](#{group.replace('.', '_')})\n"
+
+        markdown_content += "\n"
+
+        for group in tensor_prefix_order:
+            tensors = tensor_groups[group]
+            group_elements = sum(tensor.n_elements for tensor in tensors)
+            group_percentage = group_elements / total_elements * 100
+            markdown_content += f"### <a name=\"{group.replace('.', '_')}\">{translate_tensor_name(group)} Tensor Group : {element_count_rounded_notation(group_elements)} Elements</a>\n\n"
+
+            # Precalculate column sizing for visual consistency
+            prettify_element_est_count_size: int = 1
+            prettify_element_count_size: int = 1
+            prettify_dimension_max_widths: dict[int, int] = {}
+            for tensor in tensors:
+                prettify_element_est_count_size = max(prettify_element_est_count_size, len(str(element_count_rounded_notation(tensor.n_elements))))
+                prettify_element_count_size = max(prettify_element_count_size, len(str(tensor.n_elements)))
+                for i, dimension_size in enumerate(list(tensor.shape) + [1] * (4 - len(tensor.shape))):
+                    prettify_dimension_max_widths[i] = max(prettify_dimension_max_widths.get(i,1), len(str(dimension_size)))
+
+            # Generate Tensor Layer Table Content
+            tensor_dump_table: list[dict[str, str | int]] = []
+            for tensor in tensors:
+                human_friendly_name = translate_tensor_name(tensor.name.replace(".weight", ".(W)").replace(".bias", ".(B)"))
+                pretty_dimension = ' x '.join(f'{str(d):>{prettify_dimension_max_widths[i]}}' for i, d in enumerate(list(tensor.shape) + [1] * (4 - len(tensor.shape))))
+                element_count_est = f"({element_count_rounded_notation(tensor.n_elements):>{prettify_element_est_count_size}})"
+                element_count_string = f"{element_count_est} {tensor.n_elements:>{prettify_element_count_size}}"
+                type_name_string = f"{tensor.tensor_type.name}"
+                tensor_dump_table.append({"t_id":tensor_name_to_key[tensor.name], "layer_name":tensor.name, "human_layer_name":human_friendly_name, "element_count":element_count_string, "pretty_dimension":pretty_dimension, "tensor_type":type_name_string})
+
+            tensor_dump_table_header_map = [
+                {'key_name':'t_id',             'header_name':'T_ID',                             'align':'right'},
+                {'key_name':'layer_name',       'header_name':'Tensor Layer Name',                'align':'left'},
+                {'key_name':'human_layer_name', 'header_name':'Human Friendly Tensor Layer Name', 'align':'left'},
+                {'key_name':'element_count',    'header_name':'Elements',                         'align':'left'},
+                {'key_name':'pretty_dimension', 'header_name':'Shape',                            'align':'left'},
+                {'key_name':'tensor_type',      'header_name':'Type',                             'align':'left'},
+            ]
+
+            markdown_content += markdown_table_with_alignment_support(tensor_dump_table_header_map, tensor_dump_table)
+
+            markdown_content += "\n"
+            markdown_content += f"- Total elements in {group}: ({element_count_rounded_notation(group_elements):>4}) {group_elements}\n"
+            markdown_content += f"- Percentage of total elements: {group_percentage:.2f}%\n"
+            markdown_content += "\n\n"
+
+        print(markdown_content)  # noqa: NP100
+
+
 def main() -> None:
    parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
    parser.add_argument("model",           type=str,            help="GGUF format model filename")
    parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
    parser.add_argument("--json",       action="store_true", help="Produce JSON output")
    parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
+    parser.add_argument("--markdown",   action="store_true", help="Produce markdown output")
    parser.add_argument("--verbose",    action="store_true", help="increase output verbosity")

    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

-    if not args.json:
+    if not args.json and not args.markdown:
        logger.info(f'* Loading: {args.model}')

    reader = GGUFReader(args.model, 'r')

    if args.json:
        dump_metadata_json(reader, args)
+    elif args.markdown:
+        dump_markdown_metadata(reader, args)
    else:
        dump_metadata(reader, args)

@@ -174,6 +174,7 @@ extern "C" {
        LLAMA_POOLING_TYPE_NONE = 0,
        LLAMA_POOLING_TYPE_MEAN = 1,
        LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
    };

    enum llama_split_mode {
@@ -293,7 +294,6 @@ extern "C" {

        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -786,6 +786,10 @@ extern "C" {
    // Get the number of threads used for prompt and batch processing (multiple token).
    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

+    // Set whether the model is in embeddings model or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
    // Set whether to use causal attention or not
    // If set to true, the model will only attend to the past tokens
    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
@@ -1,2 +1,2 @@
 -r ./requirements-convert-legacy-llama.txt
-torch~=2.1.1
+torch~=2.2.1
@@ -1,2 +1,2 @@
 -r ./requirements-convert-legacy-llama.txt
-torch~=2.1.1
+torch~=2.2.1
@@ -1,4 +1,4 @@
-numpy~=1.24.4
+numpy~=1.26.4
 sentencepiece~=0.2.0
 transformers>=4.40.1,<5.0.0
 gguf>=0.1.0
@@ -1,83 +1,143 @@
-import regex
-import ctypes
+import array
 import unicodedata
-
-
-class CoodepointFlags (ctypes.Structure):
-    _fields_ = [  # see definition in unicode.h
-        ("is_undefined",   ctypes.c_uint16, 1),
-        ("is_number",      ctypes.c_uint16, 1),  # regex: \p{N}
-        ("is_letter",      ctypes.c_uint16, 1),  # regex: \p{L}
-        ("is_separator",   ctypes.c_uint16, 1),  # regex: \p{Z}
-        ("is_accent_mark", ctypes.c_uint16, 1),  # regex: \p{M}
-        ("is_punctuation", ctypes.c_uint16, 1),  # regex: \p{P}
-        ("is_symbol",      ctypes.c_uint16, 1),  # regex: \p{S}
-        ("is_control",     ctypes.c_uint16, 1),  # regex: \p{C}
-    ]
-
-
-assert (ctypes.sizeof(CoodepointFlags) == 2)
+import requests


 MAX_CODEPOINTS = 0x110000

-regex_number      = regex.compile(r'\p{N}')
-regex_letter      = regex.compile(r'\p{L}')
-regex_separator   = regex.compile(r'\p{Z}')
-regex_accent_mark = regex.compile(r'\p{M}')
-regex_punctuation = regex.compile(r'\p{P}')
-regex_symbol      = regex.compile(r'\p{S}')
-regex_control     = regex.compile(r'\p{C}')
-regex_whitespace  = regex.compile(r'\s')
+UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"

-codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
+
+# see https://www.unicode.org/L2/L1999/UnicodeData.html
+def unicode_data_iter():
+    res = requests.get(UNICODE_DATA_URL)
+    res.raise_for_status()
+    data = res.content.decode()
+
+    prev = []
+
+    for line in data.splitlines():
+        # ej: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
+        line = line.split(";")
+
+        cpt = int(line[0], base=16)
+        assert cpt < MAX_CODEPOINTS
+
+        cpt_lower = int(line[-2] or "0", base=16)
+        assert cpt_lower < MAX_CODEPOINTS
+
+        cpt_upper = int(line[-3] or "0", base=16)
+        assert cpt_upper < MAX_CODEPOINTS
+
+        categ = line[2].strip()
+        assert len(categ) == 2
+
+        bidir = line[4].strip()
+        assert len(categ) == 2
+
+        name = line[1]
+        if name.endswith(", First>"):
+            prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
+            continue
+        if name.endswith(", Last>"):
+            assert prev[1:] == (0, 0, categ, bidir)
+            for c in range(prev[0], cpt):
+                yield (c, cpt_lower, cpt_upper, categ, bidir)
+
+        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
+
+
+# see definition in unicode.h
+CODEPOINT_FLAG_UNDEFINED   = 0x0001  #
+CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
+CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
+CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
+CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
+CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
+CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
+CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}
+
+UNICODE_CATEGORY_TO_FLAG = {
+    "Cn": CODEPOINT_FLAG_UNDEFINED,    # Undefined
+    "Cc": CODEPOINT_FLAG_CONTROL,      # Control
+    "Cf": CODEPOINT_FLAG_CONTROL,      # Format
+    "Co": CODEPOINT_FLAG_CONTROL,      # Private Use
+    "Cs": CODEPOINT_FLAG_CONTROL,      # Surrrogate
+    "Ll": CODEPOINT_FLAG_LETTER,       # Lowercase Letter
+    "Lm": CODEPOINT_FLAG_LETTER,       # Modifier Letter
+    "Lo": CODEPOINT_FLAG_LETTER,       # Other Letter
+    "Lt": CODEPOINT_FLAG_LETTER,       # Titlecase Letter
+    "Lu": CODEPOINT_FLAG_LETTER,       # Uppercase Letter
+    "L&": CODEPOINT_FLAG_LETTER,       # Cased Letter
+    "Mc": CODEPOINT_FLAG_MARK,         # Spacing Mark
+    "Me": CODEPOINT_FLAG_MARK,         # Enclosing Mark
+    "Mn": CODEPOINT_FLAG_MARK,         # Nonspacing Mark
+    "Nd": CODEPOINT_FLAG_NUMBER,       # Decimal Number
+    "Nl": CODEPOINT_FLAG_NUMBER,       # Letter Number
+    "No": CODEPOINT_FLAG_NUMBER,       # Other Number
+    "Pc": CODEPOINT_FLAG_PUNCTUATION,  # Connector Punctuation
+    "Pd": CODEPOINT_FLAG_PUNCTUATION,  # Dash Punctuation
+    "Pe": CODEPOINT_FLAG_PUNCTUATION,  # Close Punctuation
+    "Pf": CODEPOINT_FLAG_PUNCTUATION,  # Final Punctuation
+    "Pi": CODEPOINT_FLAG_PUNCTUATION,  # Initial Punctuation
+    "Po": CODEPOINT_FLAG_PUNCTUATION,  # Other Punctuation
+    "Ps": CODEPOINT_FLAG_PUNCTUATION,  # Open Punctuation
+    "Sc": CODEPOINT_FLAG_SYMBOL,       # Currency Symbol
+    "Sk": CODEPOINT_FLAG_SYMBOL,       # Modifier Symbol
+    "Sm": CODEPOINT_FLAG_SYMBOL,       # Math Symbol
+    "So": CODEPOINT_FLAG_SYMBOL,       # Other Symbol
+    "Zl": CODEPOINT_FLAG_SEPARATOR,    # Line Separator
+    "Zp": CODEPOINT_FLAG_SEPARATOR,    # Paragraph Separator
+    "Zs": CODEPOINT_FLAG_SEPARATOR,    # Space Separator
+}
+
+
+codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
 table_whitespace = []
 table_lowercase = []
 table_uppercase = []
 table_nfd = []

-for codepoint in range(MAX_CODEPOINTS):
+for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
    # convert codepoint to unicode character
-    char = chr(codepoint)
+    char = chr(cpt)

-    # regex categories
-    flags = codepoint_flags[codepoint]
-    flags.is_number      = bool(regex_number.match(char))
-    flags.is_letter      = bool(regex_letter.match(char))
-    flags.is_separator   = bool(regex_separator.match(char))
-    flags.is_accent_mark = bool(regex_accent_mark.match(char))
-    flags.is_punctuation = bool(regex_punctuation.match(char))
-    flags.is_symbol      = bool(regex_symbol.match(char))
-    flags.is_control     = bool(regex_control.match(char))
-    flags.is_undefined   = bytes(flags)[0] == 0
-    assert (not flags.is_undefined)
-
-    # whitespaces
-    if bool(regex_whitespace.match(char)):
-        table_whitespace.append(codepoint)
+    # codepoint category flags
+    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]

    # lowercase conversion
-    lower = ord(char.lower()[0])
-    if codepoint != lower:
-        table_lowercase.append((codepoint, lower))
+    if cpt_lower:
+        table_lowercase.append((cpt, cpt_lower))

    # uppercase conversion
-    upper = ord(char.upper()[0])
-    if codepoint != upper:
-        table_uppercase.append((codepoint, upper))
+    if cpt_upper:
+        table_uppercase.append((cpt, cpt_upper))

    # NFD normalization
    norm = ord(unicodedata.normalize('NFD', char)[0])
-    if codepoint != norm:
-        table_nfd.append((codepoint, norm))
+    if cpt != norm:
+        table_nfd.append((cpt, norm))
+
+
+# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+table_whitespace.extend(range(0x0009, 0x000D + 1))
+table_whitespace.extend(range(0x2000, 0x200A + 1))
+table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
+
+
+# sort by codepoint
+table_whitespace.sort()
+table_lowercase.sort()
+table_uppercase.sort()
+table_nfd.sort()


 # group ranges with same flags
 ranges_flags = [(0, codepoint_flags[0])]  # start, flags
 for codepoint, flags in enumerate(codepoint_flags):
-    if bytes(flags) != bytes(ranges_flags[-1][1]):
+    if flags != ranges_flags[-1][1]:
        ranges_flags.append((codepoint, flags))
-ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
+ranges_flags.append((MAX_CODEPOINTS, 0x0000))


 # group ranges with same nfd
@@ -90,8 +150,8 @@ for codepoint, norm in table_nfd:
    ranges_nfd[-1] = (start, codepoint, norm)


-# Generate 'unicode-data.cpp'
-
+# Generate 'unicode-data.cpp':
+#   python ./scripts//gen-unicode-data.py > unicode-data.cpp

 def out(line=""):
    print(line, end='\n')  # noqa
@@ -110,12 +170,12 @@ out("""\

 out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
 for codepoint, flags in ranges_flags:
-    flags = int.from_bytes(bytes(flags), "little")
    out("{0x%06X, 0x%04X}," % (codepoint, flags))
 out("};\n")

 out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
-out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
+for codepoint in table_whitespace:
+    out("0x%06X," % codepoint)
 out("};\n")

 out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
@@ -1 +1 @@
-2aae01fd9b8f9399f343cf18f46f38996ef52e2c
+5653a195935ea3ac54652644c9daf154dbc1571b
@@ -43,8 +43,10 @@
 // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
 //     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

+#if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wpedantic"
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif

 #include "sgemm.h"
 #include "ggml-impl.h"
@@ -1063,6 +1063,33 @@ struct test_sqr : public test_case {
    }
 };

+// GGML_OP_SQRT
+struct test_sqrt : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_sqrt(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 10, 10, 10})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_tensor * out = ggml_sqrt(ctx, a);
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        // fill with positive values
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, 0.0f, 100.0f);
+        }
+    }
+};
+
 // GGML_OP_CLAMP
 struct test_clamp : public test_case {
    const ggml_type type;
@@ -2200,6 +2227,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    test_cases.emplace_back(new test_sqr());
+    test_cases.emplace_back(new test_sqrt());
    test_cases.emplace_back(new test_clamp());

    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10,  1,  1}, 5));
@@ -7,11 +7,16 @@
 #include "ggml.h"
 #include "llama.h"
 #include "grammar-parser.h"
+#include "json-schema-to-grammar.h"
 #include "unicode.h"
 #include <cassert>
 #include <string>
 #include <vector>

+using json = nlohmann::ordered_json;
+
+//#define INCLUDE_FAILING_TESTS 1
+
 static llama_grammar* build_grammar(const std::string & grammar_str) {
    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());

@@ -65,8 +70,8 @@ static bool match_string(const std::string & input, llama_grammar* grammar) {
    return false;
 }

-static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
-    fprintf(stderr, "⚫ Testing %s. Grammar: %s\n", test_desc.c_str(), grammar_str.c_str());
+static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str());
    fflush(stderr);

    auto grammar = build_grammar(grammar_str);
@@ -85,6 +90,23 @@ static void test_grammar(const std::string & test_desc, const std::string & gram

        if (!matched) {
            fprintf(stderr, "❌ (failed to match)\n");
+
+            // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed.
+            // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf
+            FILE* grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w");
+            if (grammar_file) {
+                fprintf(grammar_file, "%s", grammar_str.c_str());
+                fclose(grammar_file);
+            }
+
+            // DEBUG: Write the test string to test-grammar-integration.string.txt
+            FILE* string_file = fopen("test-grammar-integration.string.txt", "w");
+            if (string_file) {
+                fprintf(string_file, "%s", test_string.c_str());
+                fclose(string_file);
+            }
+
+            fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following command:     ./llama-gbnf-validator test-grammar-integration.grammar.gbnf test-grammar-integration.string.txt\n\n");
        } else {
            fprintf(stdout, "✅︎\n");
        }
@@ -118,6 +140,12 @@ static void test_grammar(const std::string & test_desc, const std::string & gram
    // Clean up allocated memory
    llama_grammar_free(grammar);
 }
+static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings);
+}
+static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str)), passing_strings, failing_strings);
+}

 static void test_simple_grammar() {
    // Test case for a simple grammar
@@ -400,10 +428,11 @@ static void test_quantifiers() {
 static void test_failure_missing_root() {
    fprintf(stderr, "⚫ Testing missing root node:\n");
    // Test case for a grammar that is missing a root rule
-    const std::string grammar_str = R"""(rot ::= expr
-expr ::= term ("+" term)*
-term ::= number
-number ::= [0-9]+)""";
+    const std::string grammar_str = R"""(
+        rot ::= expr
+        expr ::= term ("+" term)*
+        term ::= number
+        number ::= [0-9]+)""";

    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());

@@ -420,10 +449,10 @@ static void test_failure_missing_reference() {

    // Test case for a grammar that is missing a referenced rule
    const std::string grammar_str =
-R"""(root ::= expr
-expr ::= term ("+" term)*
-term ::= numero
-number ::= [0-9]+)""";
+        R"""(root ::= expr
+        expr ::= term ("+" term)*
+        term ::= numero
+        number ::= [0-9]+)""";

    fprintf(stderr, "    Expected error:  ");

@@ -445,29 +474,558 @@ static void test_failure_left_recursion() {

    // Test more complicated left recursion detection
    const std::string medium_str = R"""(
-root ::= asdf
-asdf ::= "a" | asdf "a"
-)""";
+        root ::= asdf
+        asdf ::= "a" | asdf "a"
+        )""";
    assert(test_build_grammar_fails(medium_str));

    // Test even more complicated left recursion detection
    const std::string hard_str = R"""(
-root ::= asdf
-asdf ::= "a" | foo "b"
-foo ::= "c" | asdf "d" | "e")""";
+        root ::= asdf
+        asdf ::= "a" | foo "b"
+        foo ::= "c" | asdf "d" | "e")""";
    assert(test_build_grammar_fails(hard_str));

    // Test yet even more complicated left recursion detection
    const std::string hardest_str = R"""(
-root ::= asdf
-asdf ::= "a" | foo "b"
-foo ::= "c" | empty asdf "d" | "e"
-empty ::= "blah" | )""";
+        root ::= asdf
+        asdf ::= "a" | foo "b"
+        foo ::= "c" | empty asdf "d" | "e"
+        empty ::= "blah" | )""";
    assert(test_build_grammar_fails(hardest_str));

    fprintf(stderr, "  ✅︎ Passed\n");
 }

+static void test_json_schema() {
+    // Note that this is similar to the regular grammar tests,
+    //  but we convert each json schema to a grammar before parsing.
+    // Otherwise, this test structure is the same.
+
+    test_schema(
+        "empty schema (object)",
+        // Schema
+        R"""(
+            {}
+        )""",
+        // Passing strings
+        {
+            "{}",
+            R"""({"foo": "bar"})""",
+        },
+        // Failing strings
+        {
+            "",
+            "[]",
+            "null",
+            "\"\"",
+            "true",
+        }
+    );
+
+    test_schema(
+        "exotic formats (list)",
+        // Schema
+        R"""(
+            {
+            "items": [
+                { "format": "date" },
+                { "format": "uuid" },
+                { "format": "time" },
+                { "format": "date-time" }
+            ]
+            }
+        )""",
+        // Passing strings
+        {
+            // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""",
+            //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+        },
+        // Failing strings
+        {
+            R"""(["foo", "bar"])""",
+            R"""(["12345678-1234-1234-1234-1234567890ab"])""",
+        }
+    );
+
+    test_schema(
+        "string",
+        // Schema
+        R"""(
+            {
+                "type": "string"
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"\"",
+        },
+        // Failing strings
+        {
+            "{}",
+            "\"foo\": \"bar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 1",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 1
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "{}",
+            "\"foo\": \"bar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 3",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 3
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"foobar\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"f\"",
+            "\"fo\"",
+        }
+    );
+
+    test_schema(
+        "string w/ max length",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "maxLength": 3
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"\"",
+            "\"f\"",
+            "\"fo\"",
+        },
+        // Failing strings
+        {
+            "\"foobar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min & max length",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 4
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"f\"",
+            "\"barf\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"barfo\"",
+            "\"foobar\"",
+        }
+    );
+
+    test_schema(
+        "boolean",
+        // Schema
+        R"""(
+            {
+                "type": "boolean"
+            }
+        )""",
+        // Passing strings
+        {
+            "true",
+            "false",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"true\"",
+            "True",
+            "FALSE",
+        }
+    );
+
+    test_schema(
+        "integer",
+        // Schema
+        R"""(
+            {
+                "type": "integer"
+            }
+        )""",
+        // Passing strings
+        {
+            "0",
+            "12345",
+            "1234567890123456"
+        },
+        // Failing strings
+        {
+            "",
+            "01",
+            "007",
+            "12345678901234567"
+        }
+    );
+
+    test_schema(
+        "string const",
+        // Schema
+        R"""(
+            {
+                "const": "foo"
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+        },
+        // Failing strings
+        {
+            "foo",
+            "\"bar\"",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""(
+            {
+                "const": true
+            }
+        )""",
+        // Passing strings
+        {
+            "true",
+        },
+        // Failing strings
+        {
+            "",
+            "foo",
+            "\"true\"",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""(
+            {
+                "enum": ["red", "amber", "green", null, 42, ["foo"]]
+            }
+        )""",
+        // Passing strings
+        {
+            "\"red\"",
+            "null",
+            "42",
+            "[\"foo\"]",
+        },
+        // Failing strings
+        {
+            "",
+            "420",
+            "true",
+            "foo",
+        }
+    );
+
+
+    test_schema(
+        "min+max items",
+        // Schema
+        R"""(
+            {
+                "items": {
+                    "type": ["number", "integer"]
+                },
+                "minItems": 3,
+                "maxItems": 5
+            }
+        )""",
+        // Passing strings
+        {
+            "[1, 2, 3]",
+            "[1, 2, 3, 4]",
+            "[1, 2, 3, 4, 5]",
+        },
+        // Failing strings
+        {
+            "[1, 2]",
+            "[1, 2, 3, 4, 5, 6]",
+            "1"
+        }
+    );
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties",
+        // Schema
+        R"""(
+            {
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            }
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+            // "By default, providing additional properties is valid"
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
+            // Reorder properties
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+        }
+    );
+
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties, additionalProperties: true",
+        // Schema
+        R"""(
+            {
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": true
+            }
+        )""",
+        // Passing strings
+        {
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: Following line should pass and doesn't
+            R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            // TODO: Following line should pass and doesn't
+            R"""({ "street_name": "Pennsylvania" })""",
+            // TODO: Following line should pass and doesn't
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By default, providing additional properties is valid"
+            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""",
+        }
+    );
+
+    // Additional properties: false
+    test_schema(
+        "required + optional props each in original order",
+        // Schema
+        R"""(
+            {
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": false
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_type":"Avenue"})""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Reorder properties
+            R"""({ "street_type": "Avenue", "number": 1600 })""",
+            // Add "direction"
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""",
+        }
+    );
+
+    test_schema(
+        "required + optional props each in original order",
+        // Schema
+        R"""(
+            {
+                "properties": {
+                    "b": {"type": "string"},
+                    "a": {"type": "string"},
+                    "d": {"type": "string"},
+                    "c": {"type": "string"}
+                },
+                "required": ["a", "b"],
+                "additionalProperties": false
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({"b": "foo", "a": "bar"})""",
+            R"""({"b":"foo","a":"bar","d":"qux"})""",
+            R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""",
+        },
+        // Failing strings
+        {
+            R"""({"a": "foo", "b": "bar"})""",
+            R"""({"b": "bar"})""",
+            R"""({"a": "foo", "c": "baz"})""",
+            R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""",
+        }
+    );
+
+    // NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties
+    test_schema(
+        "required props",
+        // Schema
+        R"""(
+            {
+            "$schema": "https://json-schema.org/draft/2020-12/schema",
+            "$id": "https://example.com/product.schema.json",
+            "title": "Product",
+            "description": "A product from Acme's catalog",
+            "type": "object",
+            "properties": {
+                "productId": {
+                "description": "The unique identifier for a product",
+                "type": "integer"
+                },
+                "productName": {
+                "description": "Name of the product",
+                "type": "string"
+                },
+                "price": {
+                "description": "The price of the product",
+                "type": "number",
+                "exclusiveMinimum": 0
+                },
+                "tags": {
+                "description": "Tags for the product",
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": true
+                },
+                "dimensions": {
+                "type": "object",
+                "properties": {
+                    "length": {
+                    "type": "number"
+                    },
+                    "width": {
+                    "type": "number"
+                    },
+                    "height": {
+                    "type": "number"
+                    }
+                },
+                "required": [ "length", "width", "height" ]
+                }
+            },
+            "required": [ "productId", "productName", "price" ]
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50})""",
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""",
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""",
+        },
+        // Failing strings
+        {
+            R"""({})""", // Missing all required properties
+            R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties
+            // TODO: The following line should fail, but currently it passes. `exclusiveMinimum` is not supported, as it would likely be too difficult to implement.
+            //  Perhaps special checks for minimum and maximum values of 0 could be added (since that's relatively easy to do with grammars), but anything else would likely be too complex.
+            // R"""({"productId": 1, "productName": "A green door", "price": -12.50})""",
+            R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price)
+            R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId)
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order
+            // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement.
+            // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""",
+        }
+    );
+}
+
 int main() {
    fprintf(stdout, "Running grammar integration tests...\n");
    test_simple_grammar();
@@ -477,6 +1035,7 @@ int main() {
    test_failure_missing_root();
    test_failure_missing_reference();
    test_failure_left_recursion();
+    test_json_schema();
    fprintf(stdout, "All tests passed.\n");
    return 0;
 }
@@ -11,13 +11,15 @@ import logging
 import argparse
 import subprocess
 import random
+import unicodedata

 from typing import Callable, Iterator

 import cffi
 from transformers import AutoTokenizer

-logger = logging.getLogger("test-tokenizer-random-bpe")
+
+logger = logging.getLogger("test-tokenizer-random")


 class LibLlama:
@@ -155,9 +157,14 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
        'Cửa Việt',   # llama-3, ignore_merges = true
        '<s>a',       # Phi-3 fail
        '<unk><|endoftext|><s>',  # Phi-3 fail
-        'a\na',       # TODO: Bert fail
-        'a </s> b',   # rstrip phi-3
-        'a <mask> b', # lstrip jina-v2
+        'a\na',            # bert fail
+        '"`',              # falcon
+        ' \u2e4e',         # falcon
+        'a\xa0\xa0\x00b',  # jina-v2-es
+        'one <mask>',      # jina-v2-es  <mask> lstrip=true
+        'a </s> b',        # rstrip phi-3
+        'a <mask> b',      # lstrip jina-v2
+        '\xa0aC',          # deepseek
    ]


@@ -189,17 +196,23 @@ def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]:
    for m in range(iterations):
        rand.seed(m)
        words = rand.choices(all_tokens, k=500)
-        if words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
+        if words and words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                words.pop(0)
            if tokenizer.add_bos_token:  # drop all starting BOS
                words.pop(0)
+        if words and words[-1] == tokenizer.eos_token:  # skip spam warning of double EOS
+            while len(words) > 1 and words[-2] == tokenizer.eos_token:  # leave one trailing EOS
+                words.pop(-1)
+            if tokenizer.add_bos_token:  # drop all trailing EOS
+                words.pop(-1)
        yield "".join(words)


 def generator_random_chars(iterations=100) -> Iterator[str]:
    """Brute force random text with simple characters"""

+    NUM_WORDS = 400
    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    CHARS = list(sorted(set("""
        ABCDEFGHIJKLMNOPQRSTUVWXYZ
@@ -213,12 +226,50 @@ def generator_random_chars(iterations=100) -> Iterator[str]:
    for m in range(iterations):
        rand.seed(m)
        text = []
-        num_words = rand.randint(300, 400)
-        for i in range(num_words):
+        for _ in range(NUM_WORDS):
            k = rand.randint(1, 7)
            word = rand.choices(CHARS, k=k)
-            space = rand.choice(WHITESPACES)
-            text.append("".join(word) + space)
+            word.append(rand.choice(WHITESPACES))
+            text.append("".join(word))
+        yield "".join(text)
+
+
+def generator_unicodes() -> Iterator[str]:
+    """Iterate unicode characters"""
+
+    MAX_CODEPOINTS = 0x30000  # 0x110000
+
+    def _valid(cpt):
+        if cpt >= 0x30000:  # unassigned and supplementary
+            return False
+        if 0x00D800 <= cpt <= 0x00F8FF:  # Surrogates
+            return False
+        if unicodedata.category(chr(cpt)) == "Cn":
+            return False
+        return True
+
+    characters = [chr(cpt) for cpt in range(1, MAX_CODEPOINTS) if _valid(cpt)]
+
+    yield from characters
+
+
+def generator_random_unicodes(iterations=100) -> Iterator[str]:
+    """Brute force random text with unicode characters"""
+
+    NUM_WORDS = 200
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
+
+    characters = list(generator_unicodes())
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        for _ in range(NUM_WORDS):
+            k = rand.randint(1, 7)
+            word = rand.choices(characters, k=k)
+            word.append(rand.choice(WHITESPACES))
+            text.append("".join(word))
        yield "".join(text)


@@ -256,25 +307,7 @@ def generator_random_vocab_words(vocab: list[str], iterations=100) -> Iterator[s
        yield "".join(text)


-def generator_random_bytes(iterations=100) -> Iterator[str]:
-    """Brute force random bytes"""
-
-    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = []
-        num_words = rand.randint(300, 400)
-        for i in range(num_words):
-            k = rand.randint(1, 8)
-            word = [chr(r) for r in rand.randbytes(k) if r]
-            word.append(rand.choice(WHITESPACES))
-            text.append("".join(word))
-        yield "".join(text)
-
-
-def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]):
+def compare_tokenizers(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]):

    def find_first_mismatch(ids1: list[int], ids2: list[int]):
        for i, (a, b) in enumerate(zip(ids1, ids2)):
@@ -284,20 +317,34 @@ def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, g
            return -1
        return min(len(ids1), len(ids2))

-    t0 = time.perf_counter()
+    t_tokenizer1 = 0
+    t_tokenizer2 = 0
+    t_start = time.perf_counter()
+    num_errors = 10
+
    logger.info("%s: %s" % (generator.__name__, "ini"))
    for text in generator:
+        # print(repr(text), hex(ord(text[0])), text.encode())
+        t0 = time.perf_counter()
        ids1 = func_tokenize1(text)
+        t1 = time.perf_counter()
        ids2 = func_tokenize2(text)
+        t2 = time.perf_counter()
+        t_tokenizer1 += t1 - t0
+        t_tokenizer2 += t2 - t1
        if ids1 != ids2:
            i = find_first_mismatch(ids1, ids2)
            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
-            logger.info(" TokenIDs: " + str(ids1))
-            logger.info(" Expected: " + str(ids2))
-            raise Exception()
-    t1 = time.perf_counter()
-    logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0))
+            logger.error(" TokenIDs: " + str(ids1))
+            logger.error(" Expected: " + str(ids2))
+            # raise Exception()
+            num_errors += 1
+            if num_errors > 10:
+                break
+
+    t_total = time.perf_counter() - t_start
+    logger.info("%s: end,  tok1: %.3f  tok2: %.3f  total: %.3f" % (generator.__name__, t_tokenizer1, t_tokenizer2, t_total))


 def main(argv: list[str] = None):
@@ -307,7 +354,8 @@ def main(argv: list[str] = None):
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    args = parser.parse_args(argv)

-    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
+    logger.info(f"VOCABFILE: '{args.vocab_file}'")

    model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
    tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
@@ -321,18 +369,22 @@ def main(argv: list[str] = None):
    ids = func_tokenize2("a")
    assert 1 <= len(ids) <= 3
    add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
+    add_eos_token = len(ids) > 1 and tokenizer.eos_token_id == ids[-1]
    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token)
+    tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", add_eos_token)

    vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
-    # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL
+
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_custom_text())
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_unicodes())
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer))
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000))
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_unicodes(10_000))
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
+    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))

    model.free()

@@ -340,20 +392,40 @@ def main(argv: list[str] = None):
 if __name__ == "__main__":
    # main()

+    logging.basicConfig(
+        level    = logging.DEBUG,
+        format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
+        datefmt  = "%Y-%m-%d %H:%M:%S",
+        filename = logger.name + ".log",
+        filemode = "a"
+    )
+
    path_tokenizers   = "./models/tokenizers/"
    path_vocab_format = "./models/ggml-vocab-%s.gguf"

    # import os
    # tokenizers = os.listdir(path_tokenizers)
    tokenizers = [
-        "llama-spm",   # SPM
-        "phi-3",       # SPM
-        "jina-v2-en",  # WPM
-        "bert-bge",    # WPM
+        # "llama-spm",   # SPM
+        # "phi-3",       # SPM
+        # "bert-bge",    # WPM
+        # "jina-v2-en",  # WPM
+        "gpt-2",          # BPE
+        "llama-bpe",      # BPE
+        "falcon",         # BPE
+        "starcoder",      # BPE
+        "jina-v2-es",     # BPE
+        "jina-v2-de",     # BPE
+        "jina-v2-code",   # BPE
+        "smaug-bpe",      # BPE
+        "phi-2",          # BPE
+        "deepseek-coder", # BPE
+        "deepseek-llm",   # BPE
    ]

    for tokenizer in tokenizers:
-        print("\n" + "=" * 50 + "\n" + tokenizer + "\n")  # noqa
+        logger.info("=" * 50)
+        logger.info(f"TOKENIZER: '{tokenizer}'")
        vocab_file = path_vocab_format % tokenizer
        dir_tokenizer = path_tokenizers + "/" + tokenizer
        main([vocab_file, dir_tokenizer, "--verbose"])
@@ -226,8 +226,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
        assert(offset_end <= cpts.size());
        start = offset_end;

-        auto _get_cpt = [&] (const size_t pos) -> char32_t {
-            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
        };

        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@@ -253,18 +254,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-            const char32_t cpt = _get_cpt(pos);
+            const uint32_t cpt = _get_cpt(pos);
            const auto flags = _get_flags(pos);

            // regex: 's|'t|'re|'ve|'m|'ll|'d
            if (cpt == '\'' && pos+1 < offset_end) {
-                char32_t cpt_next = _get_cpt(pos+1);
+                uint32_t cpt_next = _get_cpt(pos+1);
                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                    pos += _add_token(pos+2);
                    continue;
                }
                if (pos+2 < offset_end) {
-                    char32_t cpt_next_next = _get_cpt(pos+2);
+                    uint32_t cpt_next_next = _get_cpt(pos+2);
                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                        (cpt_next == 'v' && cpt_next_next == 'e') ||
                        (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -309,7 +310,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
            }

            // regex: \s+(?!\S)
-            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
@@ -344,8 +345,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
        assert(offset_end <= cpts.size());
        start = offset_end;

-        auto _get_cpt = [&] (const size_t pos) -> char32_t {
-            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
        };

        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@@ -371,18 +373,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-            const char32_t cpt = _get_cpt(pos);
+            const uint32_t cpt = _get_cpt(pos);
            const auto flags = _get_flags(pos);

            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
            if (cpt == '\'' && pos+1 < offset_end) {
-                char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+                uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                    pos += _add_token(pos+2);
                    continue;
                }
                if (pos+2 < offset_end) {
-                    char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+                    uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                        (cpt_next == 'v' && cpt_next_next == 'e') ||
                        (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -424,7 +426,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
                    flags2 = _get_flags(++pos);
                }
-                char32_t cpt2 = _get_cpt(pos);
+                uint32_t cpt2 = _get_cpt(pos);
                while (cpt2 == '\r' || cpt2 == '\n') {
                    cpt2 = _get_cpt(++pos);
                }
@@ -435,7 +437,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
            size_t num_whitespaces = 0;
            size_t last_end_r_or_n = 0;
            while (_get_flags(pos+num_whitespaces).is_whitespace) {
-                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
+                uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
                if (cpt2 == '\r' || cpt2 == '\n') {
                    last_end_r_or_n = pos + num_whitespaces + 1;
                }
@@ -450,7 +452,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
            }

            // regex: \s+(?!\S)
-            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
@@ -594,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c

 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
    std::vector<uint32_t> result;
+    result.reserve(utf8.size());
    size_t offset = 0;
    while (offset < utf8.size()) {
        result.push_back(unicode_cpt_from_utf8(utf8, offset));
@@ -626,7 +629,7 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
    return map.at(utf8);
 }

-char32_t unicode_tolower(char32_t cp) {
+uint32_t unicode_tolower(uint32_t cp) {
    auto it = unicode_map_lowercase.find(cp);
    return it == unicode_map_lowercase.end() ? cp : it->second;
 }
@@ -679,10 +682,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                continue;
            }

-            const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
+            const auto flags = unicode_cpt_flags(cpts[i]);

-            if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
-                text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
+            if (flags.is_whitespace) {
+                //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
+                //text_collapsed[i] = (char) 0x85;  // <Next Line> as whitespace fallback
+                text_collapsed[i] = (char) 0x0B;    // <vertical tab> as whitespace fallback
+            } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
+                text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
            } else {
                text_collapsed[i] = (char) 0xD0; // fallback
            }
@@ -766,9 +773,16 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
            } else {
                // no unicode category used, we can use std::wregex directly
-                const std::wstring wtext       = unicode_wstring_from_utf8(text);
                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);

+                // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
+                std::wstring wtext(cpts.begin(), cpts.end());
+                for (size_t i = 0; i < wtext.size(); ++i) {
+                    if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
+                        wtext[i] = 0x0B;
+                    }
+                }
+
                //printf("text: %s\n", text.c_str());
                //printf("regex_expr: %s\n", regex_expr.c_str());
                bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
@@ -58,6 +58,6 @@ codepoint_flags unicode_cpt_flags(const std::string & utf8);
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);

-char32_t unicode_tolower(char32_t cp);
+uint32_t unicode_tolower(uint32_t cp);

 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
@@ -0,0 +1,12 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+void main() {
+    if (gl_GlobalInvocationID.x >= p.ne) {
+        return;
+    }
+
+    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+}
@@ -0,0 +1,71 @@
+#version 450
+
+#include "types.comp"
+
+#define BLOCK_SIZE 1024
+#define ASC 0
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1)          buffer D {int data_d[];};
+
+layout (push_constant) uniform parameter {
+    uint ncols;
+    uint ncols_pad;
+    uint order;
+} p;
+
+shared int dst_row[BLOCK_SIZE];
+
+void swap(uint idx0, uint idx1) {
+    int tmp = dst_row[idx0];
+    dst_row[idx0] = dst_row[idx1];
+    dst_row[idx1] = tmp;
+}
+
+void main() {
+    // bitonic sort
+    const int col = int(gl_LocalInvocationID.x);
+    const uint row = gl_WorkGroupID.y;
+
+    if (col >= p.ncols_pad) {
+        return;
+    }
+
+    const uint row_offset = row * p.ncols;
+
+    // initialize indices
+    dst_row[col] = col;
+    barrier();
+
+    for (uint k = 2; k <= p.ncols_pad; k *= 2) {
+        for (uint j = k / 2; j > 0; j /= 2) {
+            const uint ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= p.ncols ||
+                        (dst_row[ixj] < p.ncols && (p.order == ASC ?
+                            data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]] :
+                            data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]]))
+                    ) {
+                        swap(col, ixj);
+                    }
+                } else {
+                    if (dst_row[ixj] >= p.ncols ||
+                        (dst_row[col] < p.ncols && (p.order == ASC ?
+                            data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]] :
+                            data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]]))
+                    ) {
+                        swap(col, ixj);
+                    }
+                }
+            }
+            barrier();
+        }
+    }
+
+    if (col < p.ncols) {
+        data_d[row_offset + col] = dst_row[col];
+    }
+}
@@ -0,0 +1,13 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+void main() {
+    if (gl_GlobalInvocationID.x >= p.ne) {
+        return;
+    }
+
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
+    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
+}
@@ -0,0 +1,16 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+void main() {
+    if (gl_GlobalInvocationID.x >= p.ne) {
+        return;
+    }
+
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
+#else
+    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
+#endif
+}
@@ -0,0 +1,20 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {float data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.x * 16;
+
+    if (i >= p.nel) {
+        return;
+    }
+
+    [[unroll]] for (uint l = 0; l < 16; l++) {
+        data_b[i + l] = D_TYPE(data_a[i + l]);
+    }
+}
@@ -0,0 +1,60 @@
+#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#endif
+
+#if defined(DATA_A_F32)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
+}
+#endif
+
+#if defined(DATA_A_F16)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
+}
+#endif
+
+#if defined(DATA_A_Q4_0)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a[a_offset + ib].d);
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
+}
+#endif
+
+#if defined(DATA_A_Q4_1)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a[a_offset + ib].d);
+    const float m = float(data_a[a_offset + ib].m);
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return vec2(vui & 0xF, vui >> 4) * d + m;
+}
+#endif
+
+#if defined(DATA_A_Q5_0)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a[a_offset + ib].d);
+    const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0];
+    const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
+}
+#endif
+
+#if defined(DATA_A_Q5_1)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a[a_offset + ib].d);
+    const float m = float(data_a[a_offset + ib].m);
+    const uint uint_qh = data_a[a_offset + ib].qh;
+    const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
+}
+#endif
+
+#if defined(DATA_A_Q8_0)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a[a_offset + ib].d);
+    return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
+}
+#endif
@@ -0,0 +1,13 @@
+#extension GL_EXT_control_flow_attributes : require
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+    uint M;
+    uint K;
+    uint stride_a;
+    uint stride_b;
+    uint nel;
+} p;
+
+#include "types.comp"
@@ -0,0 +1,34 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint i = gl_WorkGroupID.x * 256 + wgy;
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const uint tid = gl_LocalInvocationID.x;
+        const uint ip = tid / 32;
+        const uint il = tid - 32 * ip;
+        const uint is = 8 * ip + il / 16;
+
+        const uint y_idx = i * QUANT_K + 128 * ip + il;
+
+        const uint ql_idx = 32 * ip + il;
+        const uint8_t qs = data_a[i].qs[32 * ip + il];
+
+        FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
+        FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
+        data_b[y_idx +  0] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+0] >> 4));
+        data_b[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+2] >> 4));
+        data_b[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+4] >> 4));
+        data_b[y_idx + 96] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+6] & 0xF) * ((qs >> 6) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+6] >> 4));
+    }
+}
@@ -0,0 +1,42 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint i = uint(gl_WorkGroupID.x * 256 + wgy);
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const uint r = gl_LocalInvocationID.x / 4;
+        const uint tid = r / 2;
+        const uint is0 = r % 2;
+        const uint l0 = 16 * is0 + 4 * (gl_LocalInvocationID.x % 4);
+        const uint n = tid / 4;
+        const uint j = tid - 4*n;
+
+        const uint8_t m = uint8_t(1 << (4*n + j));
+        const uint is = 8*n + 2*j + is0;
+        const uint shift = 2*j;
+
+        const int8_t us = int8_t(is <  4 ? (data_a[i].scales[is-0] & 0xF) | (((data_a[i].scales[is+8] >> 0) & 3) << 4) :
+                                 is <  8 ? (data_a[i].scales[is-0] & 0xF) | (((data_a[i].scales[is+4] >> 2) & 3) << 4) :
+                                 is < 12 ? (data_a[i].scales[is-8] >>  4) | (((data_a[i].scales[is+0] >> 4) & 3) << 4) :
+                                           (data_a[i].scales[is-8] >>  4) | (((data_a[i].scales[is-4] >> 6) & 3) << 4));
+        const FLOAT_TYPE d_all = FLOAT_TYPE(data_a[i].d);
+        const FLOAT_TYPE dl    = d_all * FLOAT_TYPE(us - 32);
+
+        const uint y_idx = i * QUANT_K + 128 * n + 32 * j;
+        const uint qs_idx = 32*n;
+
+        for (uint l = l0; l < l0 + 4; ++l) {
+            data_b[y_idx + l] = D_TYPE(dl * FLOAT_TYPE(int8_t((data_a[i].qs[qs_idx + l] >> shift) & 3) - (((data_a[i].hmask[l] & m) != 0) ? 0 : 4)));
+        }
+    }
+}
@@ -0,0 +1,32 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q4_0 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint b_idx = 1024*i + 32*ir + 8*il;
+
+    const float d = float(data_a[ib].d);
+    const float dm = -8.0f * d;
+
+    const uint q_idx = 8*il;
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        data_b[b_idx + l +  0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm);
+        data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >>  4) + dm);
+    }
+}
@@ -0,0 +1,32 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q4_1 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint b_idx = 1024*i + 32*ir + 8*il;
+
+    const float d = float(data_a[ib].d);
+    const float m = float(data_a[ib].m);
+
+    const uint q_idx = 8*il;
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        data_b[b_idx + l +  0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + m);
+        data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >>  4) + m);
+    }
+}
@@ -0,0 +1,56 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint i = gl_WorkGroupID.x * 256 + wgy;
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const uint tid = gl_LocalInvocationID.x;
+        const uint il = tid / 8;
+        const uint ir = tid % 8;
+        const uint is = 2 * il;
+        const uint n = 4;
+
+        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
+
+        const uint y_idx = i * QUANT_K + 64 * il + n * ir;
+        const uint qs_idx = 32*il + n * ir;
+
+        uint8_t sc;
+        uint8_t m;
+        if (is < 4) {
+            sc = uint8_t(data_a[i].scales[is] & 63);
+            m  = uint8_t(data_a[i].scales[is + 4] & 63);
+        } else {
+            sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
+            m  = uint8_t((data_a[i].scales[is + 4] >>  4) | ((data_a[i].scales[is    ] >> 6) << 4));
+        }
+        const FLOAT_TYPE d1 = dall * sc;
+        const FLOAT_TYPE m1 = dmin * m;
+
+        if (is < 4) {
+            sc = uint8_t(data_a[i].scales[is + 1] & 63);
+            m  = uint8_t(data_a[i].scales[is + 5] & 63);
+        } else {
+            sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
+            m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));
+        }
+        const FLOAT_TYPE d2 = dall * sc;
+        const FLOAT_TYPE m2 = dmin * m;
+
+        [[unroll]] for (uint l = 0; l < n; ++l) {
+            data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
+            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >>  4) - m2);
+        }
+    }
+}
@@ -0,0 +1,34 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q5_0 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint b_idx = 1024*i + 32*ir + 8*il;
+
+    const float d = float(data_a[ib].d);
+    const uint qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0];
+
+    const uint q_idx = 8*il;
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        const uint iqs = q_idx + l;
+        const uint vui = uint(data_a[ib].qs[iqs]);
+        data_b[b_idx + l +  0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10)) - 16.0f));
+        data_b[b_idx + l + 16] = D_TYPE(d * (((vui >>  4) | ((qh >> (iqs + 12)) & 0x10)) - 16.0f));
+    }
+}
@@ -0,0 +1,35 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q5_1 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint b_idx = 1024*i + 32*ir + 8*il;
+
+    const float d = float(data_a[ib].d);
+    const float m = float(data_a[ib].m);
+    const uint qh = data_a[ib].qh;
+
+    const uint q_idx = 8*il;
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        const uint iqs = q_idx + l;
+        const uint vui = uint(data_a[ib].qs[iqs]);
+        data_b[b_idx + l +  0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10))) + m);
+        data_b[b_idx + l + 16] = D_TYPE(d * (((vui >>  4) | ((qh >> (iqs + 12)) & 0x10))) + m);
+    }
+}
@@ -0,0 +1,58 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint i = gl_WorkGroupID.x * 256 + wgy;
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const uint tid = gl_LocalInvocationID.x;
+        const uint il = tid / 16;
+        const uint ir = tid % 16;
+        const uint is = 2 * il;
+
+        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
+
+        const uint y_idx = i * QUANT_K + 64 * il + 2 * ir;
+        const uint qs_idx = 32*il + 2 * ir;
+        const uint qh_idx = 2 * ir;
+
+        uint8_t sc;
+        uint8_t m;
+        if (is < 4) {
+            sc = uint8_t(data_a[i].scales[is] & 63);
+            m  = uint8_t(data_a[i].scales[is + 4] & 63);
+        } else {
+            sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
+            m  = uint8_t((data_a[i].scales[is + 4] >>  4) | ((data_a[i].scales[is    ] >> 6) << 4));
+        }
+        const FLOAT_TYPE d1 = dall * sc;
+        const FLOAT_TYPE m1 = dmin * m;
+
+        if (is < 4) {
+            sc = uint8_t(data_a[i].scales[is + 1] & 63);
+            m  = uint8_t(data_a[i].scales[is + 5] & 63);
+        } else {
+            sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
+            m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));
+        }
+        const FLOAT_TYPE d2 = dall * sc;
+        const FLOAT_TYPE m2 = dmin * m;
+
+        const uint8_t hm1 = uint8_t(1 << (2 * il    ));
+        const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
+        data_b[y_idx     ] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx    ] & 0xF) + (((data_a[i].qh[qh_idx    ] & hm1) != 0) ? 16 : 0)) - m1);
+        data_b[y_idx +  1] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] & 0xF) + (((data_a[i].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
+        data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx    ]  >> 4) + (((data_a[i].qh[qh_idx    ] & hm2) != 0) ? 16 : 0)) - m2);
+        data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1]  >> 4) + (((data_a[i].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
+    }
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Clint Herron	b5a5f34efa	Removing extra blank lines that were breaking Lint. (#8067 )	2024-06-22 14:28:18 -04:00
Xuan Son Nguyen	3e58b0ee35	cvector: fix CI + correct help message (#8064 ) * cvector: fix CI + correct help message * also correct --pca-iter	2024-06-22 18:11:30 +02:00
HatsuneMikuUwU33	adf480c3ab	cvector-generator: Moe Moe Fixie-Fixie for Lots of Formats~! ♡(ᐢ ᴥ ᐢ)♡ (#8052 ) * Update negative.txt * Update positive.txt * Update cvector-generator.cpp * Update cvector-generator.cpp	2024-06-22 17:19:37 +02:00
0xspringtime	3aa184a8c7	convert-hf : change assert to exception (#8015 )	2024-06-22 15:37:41 +02:00
ddh0	5b48cd53a8	Update llama-quantize ppl/file size output from LLaMA-v1 to Llama-3 values (#8058 ) Uses the values computed by @JohannesGaessler in PR #7413	2024-06-22 15:16:10 +02:00
Clint Herron	c5a8d4b749	JSON Schema to GBNF integration tests (#7790 ) * Adding simple bare-bones test for end-to-end integration test for json validation against auto-generated JSON-schema grammars. * Adding additional examples as documented in #7789 . Also adding the ability to automatically output improperly failing grammars to debug output files so they can more easily be examined in the gbnf-validator program. * Uncommenting formerly commented tests so that they fail for others who are attempting to reproduce the bugs. * Merging improved schema test methods added by @ochafik in #7797 * Adding #define to temporarily remove failing tests so that this PR can pass CI, but still be useful for other PRs that want to leverage the framework. * Fixing nits from ochafik. Removing escape slashes, adding additional failing cases, fixing some other strings. * Fixing grammar indentation to be consistent throughout file.	2024-06-21 23:18:36 -04:00
k.h.lai	557b653dc9	vulkan: detect multiple devices by deviceUUID instead of deviceID (#8022 ) * vulkan: detect multiple devices by deviceUUID instead of deviceID * vulkan: remove unneeded variables * vulkan: fix id query	2024-06-21 10:28:20 +02:00
Eve	7d5e8777ae	ggml : AVX IQ quants (#7845 ) * initial iq4_xs * fix ci * iq4_nl * iq1_m * iq1_s * iq2_xxs * iq3_xxs * iq2_s * iq2_xs * iq3_s before sllv * iq3_s * iq3_s small fix * iq3_s sllv can be safely replaced with sse multiply	2024-06-21 08:57:36 +03:00
Georgi Gerganov	a927b0f3dd	llama : optimize long word tokenization with WPM (#8034 ) ggml-ci	2024-06-21 08:51:28 +03:00
Douglas Hanley	80ea089d77	llama : allow pooled embeddings on any model (#7477 ) * create append_pooling operation; allow to specify attention_type; add last token pooling; update examples * find result_norm/result_embd tensors properly; update output allocation logic * only use embd output for pooling_type NONE * get rid of old causal_attn accessor * take out attention_type; add in llama_set_embeddings * bypass logits when doing non-NONE pooling	2024-06-21 08:38:22 +03:00
Shuichi Tsutsumi	0e64591e82	swiftui : enable stream updating (#7754 )	2024-06-21 08:30:58 +03:00
Hamdoud Hakem	b1ef562bc1	requirements : Bump torch and numpy for python3.12 (#8041 )	2024-06-20 22:01:15 +02:00
Hamdoud Hakem	17b291a6a5	convert-hf : Fix the encoding in the convert-hf-to-gguf-update.py (#8040 )	2024-06-20 21:59:59 +02:00
Johannes Gäßler	abd894ad96	common: fix warning (#8036 ) * common: fix warning * Update common/common.cpp Co-authored-by: slaren <slarengh@gmail.com> --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-06-20 16:40:13 +02:00
luoyu-intel	de391e4c80	[SYCL] Fix windows build and inference (#8003 ) * add sycl preset * fix debug link error. fix windows crash * update README	2024-06-20 21:19:05 +08:00
Johannes Gäßler	d50f8897a7	CUDA: stream-k decomposition for MMQ (#8018 ) * CUDA: stream-k decomposition for MMQ * fix undefined memory reads for small matrices	2024-06-20 14:39:21 +02:00
Michael de Gans	2075a66a96	metal : fix `ggml_metal_supports_op` for BF16 (#8021 ) Currently the Metal backend does not support BF16. `ggml_metal_supports_op` was returning true in these cases, leading to a crash with models converted with `--leave-output-tensor`. This commit checks if the first few sources types are BF16 and returns false if that's the case.	2024-06-20 08:32:01 +03:00
sasha0552	ba58993152	server : fix smart slot selection (#8020 )	2024-06-20 09:57:10 +10:00
Michael de Gans	a7854743c5	un-ignore `build-info.cmake` and `build-info.sh` (#7996 ) * un-ignore `build-info.cmake` and `build-info.sh` I am assuming that ignoring them was unintentional. If they are ignored, some tools, like cargo, will consider the files inexistent, even if they're comitted, for the purpose of publishing. This leads to the build failing in such cases. * un-ignore `build-info.cpp.in` For the same reason as the previous two files. * Reorganize `.gitignore` * Add exceptions for files mentioned by @slaren I did leave .clang-tidy since it was explicitly ignored before. * Add comments for organization * Sort some lines for pretty * Test with `make` and `cmake` builds to ensure no build artifacts might be comitted * Remove `.clang-tidy` from `.gitignore` Per comment by @ggerganov * Remove `IDEWorkspaceChecks.plist` from root-level `.gitignore`	2024-06-19 22:10:42 +02:00
slaren	9c77ec1d74	ggml : synchronize threads using barriers (#7993 )	2024-06-19 15:04:15 +02:00
Georgi Gerganov	a04a953cab	codecov : remove (#8004 )	2024-06-19 13:04:36 +03:00
Meng, Hengyu	623494a478	[SYCL] refactor (#6408 ) * seperate lower precision GEMM from the main files * fix workgroup size hardcode	2024-06-19 09:11:51 +08:00
jaime-m-p	37bef89433	tokenizer : BPE fixes (#7530 ) * Random test: add_bos_token, add_eos_token * Random test: add BPE models for testing * Custom regex split fails with codepoint 0 * Fix falcon punctuation regex * Refactor llm_tokenizer_bpe: move code to constructor * Move 'add_special_bos/eos' logic to llm_tokenizer_bpe * Move tokenizer flags to vocab structure. * Default values for special_add_bos/eos * Build vocab.special_tokens_cache using vocab token types * Generalize 'jina-v2' per token attributes * Fix unicode whitespaces (deepseek-coder, deepseek-llm) * Skip missing byte tokens (falcon) * Better unicode data generation * Replace char32_t with uint32_t	2024-06-18 18:40:52 +02:00
Sigbjørn Skjæret	91c188d6c2	Only use FIM middle token if it exists (#7648 ) * Only use FIM middle if it exists * Only use FIM middle if it exists	2024-06-18 22:19:45 +10:00
jojorne	84f6de17f6	Fix no gcc pragma on Windows (#7751 )	2024-06-18 22:18:32 +10:00
Ulrich Drepper	61665277af	Allow compiling with CUDA without CUDA runtime installed (#7989 ) On hosts which are not prepared/dedicated to execute code using CUDA it is still possible to compile llama.cpp with CUDA support by just installing the development packages. Missing are the runtime libraries like /usr/lib64/libcuda.so* and currently the link step will fail. The development environment is prepared for such situations. There are stub libraries for all the CUDA libraries available in the $(CUDA_PATH)/lib64/stubs directory. Adding this directory to the end of the search path will not change anything for environments which currently work fine but will enable compiling llama.cpp also in case the runtime code is not available.	2024-06-18 14:00:14 +02:00
Frank Mai	b96f9afb0d	chore: clean useless beam search param (#7985 ) Signed-off-by: thxCode <thxcode0824@gmail.com>	2024-06-18 10:11:40 +03:00
Abheek Gulati	1193778105	readme : update UI list (#7943 )	2024-06-18 09:57:41 +03:00
Georgi Gerganov	5326bcceeb	ggml : sync	2024-06-18 09:50:45 +03:00
Georgi Gerganov	e6ecc2be47	whisper : use ggml_backend_sched (whisper/2239) * whisper : use ggml_backend_sched (wip) * use sched in whisper_allocr * whisper : single backend in whisper_context * whisper : remove whisper_state->backends_used * whisper : remove whisper_context->backend * whisper : reset scheduler after init * whisper : fix external encoder (e.g. CoreML) * whisper : cleanup * whisper : handle null GPU buffer types + fix sycl --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-06-18 09:50:40 +03:00
Ștefan-Gabriel Muscalu	a94e6ff877	update: support Qwen2-57B-A14B (#7835 ) * update: convert-hf-to-gguf.py to support Qwen2-57B-A14B * fix: QWEN2MOE support for expert_feed_forward_length previously, expert ff was taken from n_ff (intermediate size) but it is now properly taken from LLM_KV_EXPERT_FEED_FORWARD_LENGTH n_ff_exp and n_ff_shared_exp are now properly calculated * update: convert-hf-to-gguf.py cleanup for Qwen2MoeForCausalLM * fix: QWEN2MOE support for expert_feed_forward_length previously, expert ff was taken from n_ff (intermediate size) but it is now properly taken from LLM_KV_EXPERT_FEED_FORWARD_LENGTH n_ff_exp and n_ff_shexp are now properly calculated	2024-06-17 21:08:46 +02:00
Srihari-mcw	5b6da18750	Make updates to type cast based on compiler instead of OS (#7851 )	2024-06-17 20:23:17 +02:00
Georgi Gerganov	7c26775adb	llama : disable FA if KV head size do not match (#7982 )	2024-06-17 19:40:01 +03:00
Bryan Honof	b473e95084	Add Nix and Flox install instructions (#7899 )	2024-06-17 09:37:55 -06:00
slaren	99052cd227	sched : offload_op also requires supports_op (#7977 )	2024-06-17 16:51:42 +02:00
Frank Mai	c637fcd34d	fix: divide 0 exception in mamba (#7932 ) Signed-off-by: thxCode <thxcode0824@gmail.com>	2024-06-17 16:11:08 +02:00
Markus Tavenrath	6a2f0b3474	Implement non-mapped async IO for CUDA on Windows. (#7896 ) * Implement non-mapped async IO for CUDA on Windows. On a fast Gen5 NVMe drive this change improves model load time by >3x while it should be the same (or slightly faster) on any other drive. * Free resources except for backend. * Change assertions to exceptions in llama_file, find correct cuda backend to create CUDA resources and respect the use_mmap flag again for CUDA. * Apply suggestions from code review Co-authored-by: slaren <slarengh@gmail.com> * Fix editorconfig and unused variable * Fix issues with Windows build --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-06-17 16:10:15 +02:00
Georgi Gerganov	21be9cab94	rpc : fix load/store misaligned addresses (#7948 )	2024-06-17 11:09:20 +03:00
Brian	006167aaf6	gguf-dump.py: add --markdown dump output (#7853 ) * gguf-dump.py: add --markdown dump output * gguf-dump.py: Add toc * gguf-dump.py: use standard tensor name lookup. Also add tensor ID field * gguf-dump.py: Add tensor overview count * gguf-dump.py: fix array preview * gguf-dump.py: markdownTableWithAlignmentSupport() added * Add type hints and spacing Co-authored-by: compilade <git@compilade.net> * gguf-dump.py: prettyfy dimention * gguf-dump: right align element count * gguf-dump.py: element count autosizing * Apply suggestions from code review Co-authored-by: compilade <git@compilade.net> --------- Co-authored-by: compilade <git@compilade.net>	2024-06-17 15:25:20 +10:00
Neo Zhang	df68d4fa5d	[SYCL] Update README-sycl.md for Chapter "Recommended release" and "News" (#7946 ) * Update README-sycl.md * Update README-sycl.md * Update README-sycl.md * Update README-sycl.md	2024-06-17 11:17:07 +08:00
Calvin Laurenson	43b35e38ba	Add support for sqrt on CUDA (#7953 ) * cuda sqrt support * enable cuda in pca * fix comments in pca * add test * add sqrt to ggml_backend_cuda_supports_op * fix test * new line * Use F32 sqrtf instead of F64 sqrt Co-authored-by: Johannes Gäßler <johannesg@5d6.de> --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de>	2024-06-17 00:23:04 +02:00
Georgi Gerganov	19b7a836f6	cuda : fix bounds check for src0 rows in MMVQ kernel (whisper/2231) * cuda : fix bounds check for src0 rows in MMVQ kernel * Update ggml-cuda/mmvq.cu Co-authored-by: Johannes Gäßler <johannesg@5d6.de> --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de>	2024-06-16 20:32:49 +03:00
Hong Bo PENG	b5fcf8ef5c	ggml : fix and optimize ppc64le (ggml/849) * fix compile issues introduced by loongarch_asx * restore quant changes to merge * fix compile issues introduced by loongarch_asx * further optimize by using vec_msum & vec_sum4s on ppc64le	2024-06-16 20:32:49 +03:00
Daniel Bevenius	398105ff43	ggml : remove duplicate include of ggml-common.h (ggml/853) Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-06-16 20:32:49 +03:00
Georgi Gerganov	bc6c457fa3	flake.lock: Update (#7951 )	2024-06-16 09:16:21 -07:00
Georgi Gerganov	52399254b3	unicode : avoid char32_t (#7957 ) ggml-ci	2024-06-16 14:51:40 +03:00
hopkins385	6fe1c62741	readme : update UI list [no ci] (#7958 )	2024-06-16 14:51:18 +03:00
Georgi Gerganov	cddaf028ad	ggml : fix handling of zero blocks in IQ quants (#7955 ) ggml-ci	2024-06-16 14:50:12 +03:00
Georgi Gerganov	c8a82194a8	github : update pr template	2024-06-16 10:46:51 +03:00
0cc4m	7c7836d9d4	Vulkan Shader Refactor, Memory Debugging Option (#7947 ) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use	2024-06-16 07:17:31 +02:00
Xuan Son Nguyen	0c7b3595b9	Add `cvector-generator` example (#7514 ) * add control-vector-generator * calc diff * add comments * proof-of-concept stdlib implementation Implements PCA and file writing using mostly standard libraries. The output is recognized as a functional control vector, but outputs gibberish. * param parsing, refactor, comments Added basic command-line parameters for outfile and one each positive/negative prompt. Refactored some messy code in PCA computation and GGUF exporting. Left a bunch of comments regarding further work needed. * example template completions Implements an example template set built from the positive/negative prompts like the control vector Python implementation. * add multi prompts, multi-thread for PCA * fix mem error * add debugs * fix matrix transpose multiplication you have got to be kidding me * preliminary template/multiprompt support model is running out of context and that ought to be fixed (segfaulting) but other than that it looks goodish * fix zero output & param parsing, functional templating fixed a bug where the output file had no tensor data/was all zero fixed a bug where single hyphen flags were not being correctly parsed implements creation of templated prompts from input (still need to adapt based on model) * fix square_diff matmul index range and CRLF->LF line endings fixed a logic error where square_diff would not multiply all rows fixed a formatting error where the provided completions.txt had CRLF line endings * add command-line args for num threads, num completions file lines, always reload model refactored a few things and did what the commit message says on the tin * code aestheticization * fix compiler warnings * in-series multithreading for prompt embedding? added commented-out code to attempt to start implementing mutlithreading for embedding in main * remove unnecessary multithreading * interim fix memory leak * translated everything but PCA (I think) * tentatively translate the rest * fix ggml errors and make new ones at least it compiles and runs * fix cb_eval * temporary commit while I move dev environments it finally outputs a functioning control vector - "functioning" in the sense that it can be loaded and it clearly has the right idea, but makes the model incoherent * update debug statements * pre-tokenize so we can allocate correct memory to ctx_diffs_wrapped * update comments * (wip) refactor * clean up PCA ggml implementation * fix shape of v_diff_original * add n_batch for pca * working version * remember to copy back the last_eigenvector * fix n_completions * bring back n_completions * default n_pca_batch to 20 * fix macos build * add to makefile all targets * use ggml_format_name * add readme * fix .editorconfig * use ggml_backend_tensor_copy * attemp to fix compile problem on mac * fix compile warn * reuse allocr * move param parser to common * better error handling * clean up a bit * add print_usage * shorten help msg * beautify help msg * escape prompt by default * change compile target to llama-cvector-generator * typo * disable GPU for PCA * code style --------- Co-authored-by: Christian Zhou-Zheng <christianzhouzheng@gmail.com>	2024-06-15 18:53:40 +02:00
Meng, Hengyu	7b2f4a7d19	[SYCL] remove global variables (#7710 ) * separate DPCT helpers outside * replace global variables with context * remove useless extra * update mul_mat condition * remove duplicate buft initialization * remove duplicate extra and global work group size * remove useless backend check * remove duplicated extras * use macro for group_size and remove cuda-related	2024-06-15 14:05:10 +08:00
				`@@ -0,0 +1 @@`
				`[INST] Act like a person who is extremely sad. [/INST]`