tokenize : add --no-parse-special option

This should allow more easily explaining how parse_special affects tokenization.
Name Migration: Build the deprecation-warning 'main' binary every time (#8404 )
2026-07-01 01:57:43 +02:00 · 2024-07-10 18:06:25 -04:00 · 2024-07-10 12:35:18 -04:00 · 2024-07-10 16:10:49 +01:00 · 2024-07-10 15:23:29 +03:00 · 2024-07-10 15:14:51 +03:00
49 changed files with 3075 additions and 237 deletions
@@ -16,7 +16,9 @@ SYCL:
        - any-glob-to-any-file:
            - ggml/include/ggml-sycl.h
            - ggml/src/ggml-sycl.cpp
-            - README-sycl.md
+            - ggml/src/ggml-sycl/**
+            - docs/backend/SYCL.md
+            - examples/sycl/**
 Nvidia GPU:
    - changed-files:
        - any-glob-to-any-file:
@@ -50,9 +50,6 @@ endif()
 # option list
 #

-# general
-option(LLAMA_CCACHE "llama: use ccache if available" ON)
-
 # debug
 option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@@ -77,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

 # override ggml options
-set(GGML_CCACHE             ${LLAMA_CCACHE})
 set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
 set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
@@ -115,7 +111,10 @@ llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 # build the library
 #

-add_subdirectory(ggml)
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
 add_subdirectory(src)

 #
@@ -64,10 +64,14 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-spm

 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm

+# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
+#  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
 $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@@ -193,7 +197,7 @@ ifdef GGML_RPC
 	BUILD_TARGETS += rpc-server
 endif

-default: $(BUILD_TARGETS)
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)

 test: $(TEST_TARGETS)
 	@failures=0; \
@@ -228,7 +232,7 @@ test: $(TEST_TARGETS)
 	fi
 	@echo 'All tests passed.'

-all: $(BUILD_TARGETS) $(TEST_TARGETS)
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)

 ifdef RISCV_CROSS_COMPILE
 CC	:= riscv64-unknown-linux-gnu-gcc
@@ -245,17 +249,22 @@ MK_CFLAGS    = -std=c11   -fPIC
 MK_CXXFLAGS  = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11

-ifndef LLAMA_NO_CCACHE
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
 CCACHE := $(shell which ccache)
 ifdef CCACHE
 export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
 CC    := $(CCACHE) $(CC)
 CXX   := $(CCACHE) $(CXX)
 else
 $(info I ccache not found. Consider installing it for faster compilation.)
 endif # CCACHE
-endif # LLAMA_NO_CCACHE
+endif # GGML_NO_CCACHE

 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -545,7 +554,7 @@ endif # GGML_BLIS

 ifndef GGML_NO_LLAMAFILE
 	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-	OBJ_GGML    += ggml/src/sgemm.o
+	OBJ_GGML    += ggml/src/llamafile/sgemm.o
 endif

 ifdef GGML_RPC
@@ -826,7 +835,8 @@ OBJ_GGML += \
 	ggml/src/ggml.o \
 	ggml/src/ggml-alloc.o \
 	ggml/src/ggml-backend.o \
-	ggml/src/ggml-quants.o
+	ggml/src/ggml-quants.o \
+	ggml/src/ggml-aarch64.o

 OBJ_LLAMA = \
 	src/llama.o \
@@ -926,6 +936,7 @@ $(info   - LLAMA_NO_LLAMAFILE)
 $(info   - LLAMA_NO_ACCELERATE)
 $(info   - LLAMA_NO_OPENMP)
 $(info   - LLAMA_NO_METAL)
+$(info   - LLAMA_NO_CCACHE)
 $(info )
 endif

@@ -959,15 +970,22 @@ ggml/src/ggml-quants.o: \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS)    -c $< -o $@

+ggml/src/ggml-aarch64.o: \
+	ggml/src/ggml-aarch64.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-aarch64.h \
+	ggml/src/ggml-common.h
+	$(CC) $(CFLAGS)    -c $< -o $@
+
 ggml/src/ggml-blas.o: \
 	ggml/src/ggml-blas.cpp \
 	ggml/include/ggml-blas.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 ifndef GGML_NO_LLAMAFILE
-ggml/src/sgemm.o: \
-	ggml/src/sgemm.cpp \
-	ggml/src/sgemm.h \
+ggml/src/llamafile/sgemm.o: \
+	ggml/src/llamafile/sgemm.cpp \
+	ggml/src/llamafile/sgemm.h \
 	ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
@@ -1092,7 +1110,7 @@ clean:
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
-	rm -rvf $(LEGACY_TARGETS)
+	rm -rvf $(LEGACY_TARGETS_CLEAN)
 	find examples pocs -type f -name "*.o" -delete

 #
@@ -1488,3 +1506,61 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
+#
+# Mark legacy binary targets as .PHONY so that they are always checked.
+.PHONY: main quantize perplexity embedding server finetune
+
+# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
+#  Eventually we will want to remove these target from building all the time.
+main: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
+
+server: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
+
+quantize: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard quantize))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
+	@echo "  Remove the 'quantize' binary to remove this warning."
+	@echo "#########"
+endif
+
+perplexity: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard perplexity))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
+	@echo "  Remove the 'perplexity' binary to remove this warning."
+	@echo "#########"
+endif
+
+embedding: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard embedding))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
+	@echo "  Remove the 'embedding' binary to remove this warning."
+	@echo "#########"
+endif
+
+finetune: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard finetune))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
+	@echo "  Remove the 'finetune' binary to remove this warning."
+	@echo "#########"
+endif
@@ -10,6 +10,7 @@ var sources = [
    "ggml/src/ggml-alloc.c",
    "ggml/src/ggml-backend.c",
    "ggml/src/ggml-quants.c",
+    "ggml/src/ggml-aarch64.c",
 ]

 var resources: [Resource] = []
@@ -98,7 +98,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)

-(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

 **Multimodal models:**

@@ -453,7 +453,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
 - [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
+- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

 **Seminal papers and background on the models**
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "common.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
    buf << "[ ";

    bool first = true;
-    for (const auto &token : tokens)
+    for (const auto & token : tokens)
    {
        if (!first) {
            buf << ", ";
@@ -378,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
    if (ctx_sampling->grammar != NULL && !apply_grammar) {
        GGML_ASSERT(original_logits != NULL);
        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
    }

    // apply params.logit_bias map
@@ -391,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
    }

-    cur.clear();
+    cur.resize(n_vocab);

    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
@@ -1356,7 +1356,7 @@ class LlamaModel(Model):

    def set_vocab(self):
        try:
-            self. _set_vocab_sentencepiece()
+            self._set_vocab_sentencepiece()
        except FileNotFoundError:
            try:
                self._set_vocab_llama_hf()
@@ -2144,6 +2144,9 @@ class InternLM2Model(Model):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.IsByte(token_id):
                toktype = SentencePieceTokenTypes.BYTE
+            # take care of ununsed raw token
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNKNOWN

            tokens.append(text)
            scores.append(score)
@@ -2159,6 +2162,47 @@ class InternLM2Model(Model):
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
@@ -2168,28 +2212,16 @@ class InternLM2Model(Model):

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        old_eos = special_vocab.special_token_ids["eos"]
-        if "chat" in os.path.basename(self.dir_model.absolute()):
+        if chat_eos_token_id is not None:
            # For the chat model, we replace the eos with '<|im_end|>'.
            # TODO: this is a hack, should be fixed
            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")

        special_vocab.add_to_gguf(self.gguf_writer)

-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
-
    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
        if n_head_kv is not None and n_head != n_head_kv:
            n_head = n_head_kv
@@ -2208,6 +2240,10 @@ in chat mode so that the conversation can end normally.")
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
        self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        num_heads = self.hparams["num_attention_heads"]
@@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options.
        ```

  - Notes:
+    - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
    - For faster repeated compilation, install [ccache](https://ccache.dev/).
    - For debug builds, run `make LLAMA_DEBUG=1`
@@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options.

  **Notes**:

+    - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
    - For faster repeated compilation, install [ccache](https://ccache.dev/).
    - For debug builds, there are two cases:
@@ -0,0 +1,51 @@
+# Migration notice for binary filenames
+
+> [!IMPORTANT]
+[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+
+This migration was important, but it is a breaking change that may not always be immediately obvious to users.
+
+Please update all scripts and workflows to use the new binary names.
+
+| Old Filename | New Filename |
+| ---- | ---- |
+| main | llama-cli |
+| server | llama-server |
+| llama-bench | llama-bench |
+| embedding | llama-embedding |
+| finetune | llama-finetune |
+| quantize | llama-quantize |
+| tokenize | llama-tokenize |
+| export-lora | llama-export-lora |
+| libllava.a | libllava.a |
+| baby-llama | llama-baby-llama |
+| batched | llama-batched |
+| batched-bench | llama-batched-bench |
+| benchmark-matmult | llama-benchmark-matmult |
+| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
+| eval-callback | llama-eval-callback |
+| gbnf-validator | llama-gbnf-validator |
+| gguf | llama-gguf |
+| gguf-split | llama-gguf-split |
+| gritlm | llama-gritlm |
+| imatrix | llama-imatrix |
+| infill | llama-infill |
+| llava-cli | llama-llava-cli |
+| lookahead | llama-lookahead |
+| lookup | llama-lookup |
+| lookup-create | llama-lookup-create |
+| lookup-merge | llama-lookup-merge |
+| lookup-stats | llama-lookup-stats |
+| parallel | llama-parallel |
+| passkey | llama-passkey |
+| perplexity | llama-perplexity |
+| q8dot | llama-q8dot |
+| quantize-stats | llama-quantize-stats |
+| retrieval | llama-retrieval |
+| save-load-state | llama-save-load-state |
+| simple | llama-simple |
+| speculative | llama-speculative |
+| train-text-from-scratch | llama-train-text-from-scratch |
+| vdot | llama-vdot |
+| tests/test-c.o | tests/test-c.o |
+
@@ -0,0 +1,35 @@
+// Warns users that this filename was deprecated, and provides a link for more information.
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+// Main
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    auto pos = filename.find_last_of('/');
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    // Append "llama-" to the beginning of filename to get the replacemnt filename
+    auto replacement_filename = "llama-" + filename;
+
+    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+    if (filename == "main") {
+        replacement_filename = "llama-cli";
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
+    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
@@ -204,21 +204,17 @@ int main(int argc, char ** argv) {
    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);

-    bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-        params.input_suffix.erase(0, 1);
-        suff_rm_leading_spc = false;
-    }
    std::vector<llama_token> embd_inp;
    std::vector<llama_token> embd_end;
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-    const int space_token = 29871;
-    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-        inp_sfx.erase(inp_sfx.begin());
-    }
+
+    GGML_ASSERT(llama_token_prefix(model) >= 0);
+    GGML_ASSERT(llama_token_suffix(model) >= 0);
+
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
    if (add_bos) {
@@ -516,19 +512,14 @@ int main(int argc, char ** argv) {
                    string_process_escapes(params.input_prefix);
                    string_process_escapes(params.input_suffix);
                }
-                suff_rm_leading_spc = params.escape;
-                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                    params.input_suffix.erase(0, 1);
-                    suff_rm_leading_spc = false;
-                }
+
                // tokenize new prefix and suffix
                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-                    inp_sfx.erase(inp_sfx.begin());
-                }
+
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
                if (add_bos) {
@@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
+    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, +0.0020 ppl @ Mistral-7B",  },
    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B",  },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B",          },
@@ -884,7 +884,8 @@ struct server_context {

    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
        slot_params default_params;
-        llama_sampling_params default_sparams;
+        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
+        llama_sampling_params default_sparams = params.sparams;
        auto & data = task.data;

        if (data.count("__oaicompat") != 0) {
@@ -29,6 +29,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
    fprintf(stream, "    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
    fprintf(stream, "    --stdin                              read prompt from standard input.\n");
    fprintf(stream, "    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    fprintf(stream, "    --no-parse-special                   do not parse control tokens.\n");
    fprintf(stream, "    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
    fprintf(stream, "    --show-count                         print the total number of tokens.\n");
 }
@@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
    // variables where to put any arguments we see.
    bool printing_ids = false;
    bool no_bos = false;
+    bool no_parse_special = false;
    bool disable_logging = false;
    bool show_token_count = false;
    const char * model_path = NULL;
@@ -229,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
        else if (arg == "--no-bos") {
            no_bos = true;
        }
+        else if (arg == "--no-parse-special") {
+            no_parse_special = true;
+        }
        else if (arg == "-p" || arg == "--prompt") {
            if (prompt_set) {
                fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@@ -359,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {

    const bool model_wants_add_bos = llama_should_add_bos_token(model);
    const bool add_bos = model_wants_add_bos && !no_bos;
+    const bool parse_special = !no_parse_special;

    std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);

    if (printing_ids) {
        printf("[");
@@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1717285511,
-        "narHash": "sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=",
+        "lastModified": 1719994518,
+        "narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8",
+        "rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1719506693,
-        "narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=",
+        "lastModified": 1720031269,
+        "narHash": "sha256-rwz8NJZV+387rnWpTYcXaRNvzUSnnF9aHONoJIYmiUQ=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a",
+        "rev": "9f4128e00b0ae8ec65918efeba59db998750ead6",
        "type": "github"
      },
      "original": {
@@ -36,14 +36,14 @@
    },
    "nixpkgs-lib": {
      "locked": {
-        "lastModified": 1717284937,
-        "narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=",
+        "lastModified": 1719876945,
+        "narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
      },
      "original": {
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
      }
    },
    "root": {
@@ -104,7 +104,7 @@ option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"
 option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                                            "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE                       "ggml: use ggml SGEMM"                            OFF)
+option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                             OFF)

 option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
 option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
@@ -383,6 +383,9 @@ extern "C" {
        GGML_TYPE_F64     = 28,
        GGML_TYPE_IQ1_M   = 29,
        GGML_TYPE_BF16    = 30,
+        GGML_TYPE_Q4_0_4_4 = 31,
+        GGML_TYPE_Q4_0_4_8 = 32,
+        GGML_TYPE_Q4_0_8_8 = 33,
        GGML_TYPE_COUNT,
    };

@@ -424,6 +427,9 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
    };

    // available tensor operations:
@@ -2406,6 +2412,12 @@ extern "C" {
    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                      const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t nr,
+                                             int64_t k, int64_t bx);
+    typedef void (*ggml_gemv_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                      const void * GGML_RESTRICT y, int nr, int nc);
+    typedef void (*ggml_gemm_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                      const void * GGML_RESTRICT y, int nr, int nc);

    typedef struct {
        const char      * type_name;
@@ -2418,6 +2430,11 @@ extern "C" {
        ggml_vec_dot_t    vec_dot;
        enum ggml_type    vec_dot_type;
        int64_t           nrows; // number of rows to process simultaneously;
+        int64_t           ncols; // number of columns to process simultaneously;
+        int64_t           interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
+        ggml_from_float_to_mat_t from_float_to_mat;
+        ggml_gemv_t       gemv;
+        ggml_gemm_t       gemm;
    } ggml_type_traits_t;

    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
@@ -238,12 +238,12 @@ if (GGML_BLAS)
 endif()

 if (GGML_LLAMAFILE)
-    message(STATUS "Using ggml SGEMM")
+    message(STATUS "Using llamafile")

    add_compile_definitions(GGML_USE_LLAMAFILE)

-    set(GGML_HEADERS_LLAMAFILE sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+    set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
 endif()

 if (GGML_CUDA)
@@ -1153,6 +1153,7 @@ add_library(ggml
            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
+            ggml-aarch64.c            ggml-aarch64.h
            )

 if (EMSCRIPTEN)
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+#pragma once
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
+#include "ggml.h"
+
+// GGML internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantization
+void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize);
+
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+// GEMV
+void ggml_gemv_q4_0_4x4_q8_0  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_4x8_q8_0  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_8x8_q8_0  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+// GEMM
+void ggml_gemm_q4_0_4x4_q8_0  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_4x8_q8_0  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_8x8_q8_0  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+#ifdef __cplusplus
+}
+#endif
+
@@ -199,6 +199,30 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

+typedef struct {
+    ggml_half d[4];        // deltas for 4 q4_0 blocks
+    uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
+} block_q4_0x4;
+static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
+
+typedef struct {
+    ggml_half d[8];        // deltas for 8 q4_0 blocks
+    uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
+} block_q4_0x8;
+static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
+
+typedef struct {
+    ggml_half d[4];        // deltas for 4 q8_0 blocks
+    int8_t qs[QK8_0 * 4];  // quants for 4 q8_0 blocks
+} block_q8_0x4;
+static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
+
+typedef struct {
+    ggml_half d[8];        // deltas for 8 q8_0 blocks
+    int8_t qs[QK8_0 * 8];  // quants for 8 q8_0 blocks
+} block_q8_0x8;
+static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
+
 //
 // Super-block quantization structures
 //
@@ -29,6 +29,7 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
+#include "ggml-cuda/conv-transpose-1d.cuh"

 #include <algorithm>
 #include <array>
@@ -2261,6 +2262,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_IM2COL:
            ggml_cuda_op_im2col(ctx, dst);
            break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            ggml_cuda_op_conv_transpose_1d(ctx,dst);
+            break;
        case GGML_OP_POOL_2D:
            ggml_cuda_op_pool2d(ctx, dst);
            break;
@@ -2804,6 +2808,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                ggml_type src0_type = op->src[0]->type;
                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
            } break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                return false;
+            } break;
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
@@ -0,0 +1,87 @@
+#include "conv-transpose-1d.cuh"
+
+static  __global__ void conv_transpose_1d_kernel(
+        const int s0, const int p0, const int d0, const int output_size,
+        const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
+        const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
+        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
+        const float * src0, const float * src1,  float * dst) {
+    int global_index = threadIdx.x + blockIdx.x * blockDim.x;
+    if (global_index >= output_size) {
+        return;
+    }
+
+    int out_index = global_index / dst_ne0;
+
+    float accumulator = 0;
+
+    for (int c = 0; c < src0_ne2; c++) {
+        int idx = global_index % dst_ne0;
+
+        int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
+        int input_offset = src1_ne0 * c;
+
+        for (int i = 0; i < src1_ne0; i++) {
+            if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
+                continue;
+            }
+            int weight_idx = idx - i*s0;
+
+            float kernel_weight = src0[kernel_offset + weight_idx];
+            float input_value =  src1[input_offset+i];
+
+            accumulator += kernel_weight * input_value;
+        }
+    }
+    dst[global_index] = accumulator;
+}
+
+static void conv_transpose_1d_f32_f32_cuda(
+        const int s0, const int p0, const int d0, const int output_size,
+        const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
+        const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
+        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
+        const float * src0, const float * src1,  float * dst,
+        cudaStream_t stream) {
+
+    const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
+    conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
+        s0,p0,d0,output_size,
+        src0_ne0, src0_ne1,  src0_ne2, src0_ne3,
+        src1_ne0, src1_ne1,  src1_ne2, src1_ne3,
+        dst_ne0,  dst_ne1,   dst_ne2,  dst_ne3,
+        src0,src1, dst);
+}
+
+void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src1_d = (const float *)src1->data;
+
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+
+    const int s0 = opts[0];
+    const int p0 = 0;//opts[3];
+    const int d0 = 1;//opts[4];
+
+    const int64_t kernel_size = ggml_nelements(src0);
+    const int64_t input_size = ggml_nelements(src1);
+    const int64_t output_size = ggml_nelements(dst);
+
+    conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+        src0_d, src1_d, dst_d, stream);
+}
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
+
+void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -609,6 +609,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {

 #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)

+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in ggml_init()
 extern float ggml_table_f32_f16[1 << 16];
@@ -3814,43 +3814,47 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    }
 #endif
 #if defined(__ARM_FEATURE_SVE)
-    const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
-    const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
+    if (svcntb() == QK8_0) {
+        const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
+        const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);

-    svfloat32_t sumv0 = svdup_n_f32(0.0f);
-    svfloat32_t sumv1 = svdup_n_f32(0.0f);
+        svfloat32_t sumv0 = svdup_n_f32(0.0f);
+        svfloat32_t sumv1 = svdup_n_f32(0.0f);

-    assert(nb % 2 == 0); // TODO: handle odd nb
+        assert(nb % 2 == 0); // TODO: handle odd nb

-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_0 * restrict x0 = &x[i + 0];
-        const block_q4_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
+        for (int i = 0; i < nb; i += 2) {
+            const block_q4_0 * restrict x0 = &x[i + 0];
+            const block_q4_0 * restrict x1 = &x[i + 1];
+            const block_q8_0 * restrict y0 = &y[i + 0];
+            const block_q8_0 * restrict y1 = &y[i + 1];

-        // load x
-        const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
-        const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+            // load x
+            const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+            const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);

-        // 4-bit -> 8-bit
-        const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
-        const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
+            // 4-bit -> 8-bit
+            const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
+            const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));

-        // sub 8
-        const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
-        const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
+            // sub 8
+            const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
+            const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);

-        // load y
-        const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
-        const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+            // load y
+            const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+            const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);

-        // dot product
-        sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+            // dot product
+            sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+            sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+        }
+
+        *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+        return;
    }
-
-    *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
-#elif defined(__ARM_NEON)
+#endif
+#if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);

@@ -5422,31 +5426,35 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    }
 #endif
 #if defined(__ARM_FEATURE_SVE)
-    svfloat32_t sumv0 = svdup_n_f32(0.0f);
-    svfloat32_t sumv1 = svdup_n_f32(0.0f);
+    if (svcntb() == QK8_0) {
+        svfloat32_t sumv0 = svdup_n_f32(0.0f);
+        svfloat32_t sumv1 = svdup_n_f32(0.0f);

-    assert(nb % 2 == 0); // TODO: handle odd nb
+        assert(nb % 2 == 0); // TODO: handle odd nb

-    for (int i = 0; i < nb; i += 2) {
-        const block_q8_0 * restrict x0 = &x[i + 0];
-        const block_q8_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
+        for (int i = 0; i < nb; i += 2) {
+            const block_q8_0 * restrict x0 = &x[i + 0];
+            const block_q8_0 * restrict x1 = &x[i + 1];
+            const block_q8_0 * restrict y0 = &y[i + 0];
+            const block_q8_0 * restrict y1 = &y[i + 1];

-        // load x
-        const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
-        const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
+            // load x
+            const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
+            const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);

-        // load y
-        const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
-        const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+            // load y
+            const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+            const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);

-        sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+            sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+            sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+        }
+
+        *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+        return;
    }
-
-    *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
-#elif defined(__ARM_NEON)
+#endif
+#if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);

@@ -14760,6 +14768,16 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
        } \
    }

+#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        for (size_t j = 0; j < (nr); ++j) { \
+            if (!validate_fp16(q[i].d[j], i)) { \
+                return false; \
+            } \
+        } \
+    }
+
 bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
    if (type < 0 || type >= GGML_TYPE_COUNT) {
        fprintf(stderr, "%s: invalid type %d\n", __func__, type);
@@ -14977,6 +14995,16 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
            } break;
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+            {
+                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
+            } break;
+        case GGML_TYPE_Q4_0_8_8:
+            {
+                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
+            } break;
+
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
@@ -3658,6 +3658,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX

+    // mmvq path is faster in the CUDA backend.
+    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+        use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+
    if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
@@ -346,4 +346,10 @@ inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
    return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
 }

+// Helper for accessing pointers with no warnings
+template <typename Tp, int dim>
+static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
+    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
+}
+
 #endif // GGML_SYCL_COMMON_HPP
@@ -158,7 +158,7 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
                                                   sycl::range<3>(1, 1, 32),
                                               sycl::range<3>(1, 1, 32)),
                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1);
+                                 dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
                             });
        });
    }
@@ -1835,10 +1835,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q4_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
-                            tile_x_d_q4_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_qs_q4_0_acc_ct1),
+                            get_pointer(tile_x_d_q4_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -1870,10 +1870,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q4_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
-                            tile_x_d_q4_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_qs_q4_0_acc_ct1),
+                            get_pointer(tile_x_d_q4_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -1950,10 +1950,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q4_1<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_qs_q4_1_acc_ct1),
+                            get_pointer(tile_x_dm_q4_1_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -1985,10 +1985,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q4_1<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_qs_q4_1_acc_ct1),
+                            get_pointer(tile_x_dm_q4_1_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2065,10 +2065,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q5_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
-                            tile_x_d_q5_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q5_0_acc_ct1),
+                            get_pointer(tile_x_d_q5_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2100,10 +2100,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q5_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
-                            tile_x_d_q5_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q5_0_acc_ct1),
+                            get_pointer(tile_x_d_q5_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2180,10 +2180,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q5_1<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q5_1_acc_ct1),
+                            get_pointer(tile_x_dm_q5_1_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2215,10 +2215,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q5_1<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q5_1_acc_ct1),
+                            get_pointer(tile_x_dm_q5_1_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2295,10 +2295,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q8_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
-                            tile_x_d_q8_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_qs_q8_0_acc_ct1),
+                            get_pointer(tile_x_d_q8_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2330,10 +2330,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q8_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
-                            tile_x_d_q8_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_qs_q8_0_acc_ct1),
+                            get_pointer(tile_x_d_q8_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2412,11 +2412,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q2_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q2_K_acc_ct1),
+                            get_pointer(tile_x_dm_q2_K_acc_ct1),
+                            get_pointer(tile_x_sc_q2_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2450,11 +2450,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q2_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q2_K_acc_ct1),
+                            get_pointer(tile_x_dm_q2_K_acc_ct1),
+                            get_pointer(tile_x_sc_q2_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2537,12 +2537,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q3_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
-                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q3_K_acc_ct1),
+                            get_pointer(tile_x_dm_q3_K_acc_ct1),
+                            get_pointer(tile_x_qh_q3_K_acc_ct1),
+                            get_pointer(tile_x_sc_q3_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2578,12 +2578,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q3_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
-                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q3_K_acc_ct1),
+                            get_pointer(tile_x_dm_q3_K_acc_ct1),
+                            get_pointer(tile_x_qh_q3_K_acc_ct1),
+                            get_pointer(tile_x_sc_q3_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2663,11 +2663,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q4_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q4_K_acc_ct1),
+                            get_pointer(tile_x_dm_q4_K_acc_ct1),
+                            get_pointer(tile_x_sc_q4_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2701,11 +2701,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q4_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q4_K_acc_ct1),
+                            get_pointer(tile_x_dm_q4_K_acc_ct1),
+                            get_pointer(tile_x_sc_q4_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2784,11 +2784,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q5_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q5_K_acc_ct1),
+                            get_pointer(tile_x_dm_q5_K_acc_ct1),
+                            get_pointer(tile_x_sc_q5_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2822,11 +2822,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q5_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_q5_K_acc_ct1),
+                            get_pointer(tile_x_dm_q5_K_acc_ct1),
+                            get_pointer(tile_x_sc_q5_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2905,11 +2905,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q6_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_acc_ct1),
+                            get_pointer(tile_x_dm_acc_ct1),
+                            get_pointer(tile_x_sc_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -2943,11 +2943,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
                        mul_mat_q6_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
+                            get_pointer(tile_x_ql_acc_ct1),
+                            get_pointer(tile_x_dm_acc_ct1),
+                            get_pointer(tile_x_sc_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
                    });
            });
        }
@@ -218,7 +218,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
                [=](sycl::nd_item<3> item_ct1)
                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                    norm_f32(x, dst, ncols, eps, item_ct1,
-                        s_sum_acc_ct1.get_pointer(), work_group_size);
+                        get_pointer(s_sum_acc_ct1), work_group_size);
                });
            });
    }
@@ -265,7 +265,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                    group_norm_f32(x, dst, group_size, ne_elements,
                        eps_ct4, item_ct1,
-                        s_sum_acc_ct1.get_pointer(), work_group_size);
+                        get_pointer(s_sum_acc_ct1), work_group_size);
                });
            });
    }
@@ -306,7 +306,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
                [=](sycl::nd_item<3> item_ct1)
                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                    rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                        s_sum_acc_ct1.get_pointer(), work_group_size);
+                        get_pointer(s_sum_acc_ct1), work_group_size);
                });
            });
    }
@@ -55,7 +55,7 @@ static void rope_norm(
    const int i = row*ne0 + i0;
    const int i2 = row/p_delta_rows;

-    const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);

    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;

@@ -98,7 +98,7 @@ static void rope_neox(
    const int i  = row*ne0 + i0/2;
    const int i2 = row/p_delta_rows;

-    const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);

    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;

@@ -136,7 +136,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                             nrows_y, scale, max_bias, m0,
                                                                             m1, n_head_log2, item_ct1,
-                                                                             local_buf_acc.get_pointer());
+                                                                             get_pointer(local_buf_acc));
            });
    });
 }
@@ -4,7 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
-
+#include "ggml-aarch64.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,12 +37,12 @@
 #include <unistd.h>
 #endif

-#ifdef __ARM_FEATURE_MATMUL_INT8
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif

 #ifdef GGML_USE_LLAMAFILE
-#include "sgemm.h"
+#include <llamafile/sgemm.h>
 #endif

 #if defined(_MSC_VER)
@@ -692,6 +692,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
 #else
        .nrows                    = 1,
 #endif
+        .from_float_to_mat        = quantize_mat_q8_0,
    },
    [GGML_TYPE_Q8_1] = {
        .type_name                = "q8_1",
@@ -889,6 +890,54 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_bf16,
        .vec_dot_type             = GGML_TYPE_BF16,
        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q4_0_4_4] = {
+        .type_name                = "q4_0_4x4",
+        .blck_size                = QK4_0,
+        .type_size                = sizeof(block_q4_0),
+        .is_quantized             = true,
+        .to_float                 = NULL,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+        .ncols                    = 4,
+        .interleave_blcksize      = 4,
+        .gemv                     = ggml_gemv_q4_0_4x4_q8_0,
+        .gemm                     = ggml_gemm_q4_0_4x4_q8_0,
+    },
+    [GGML_TYPE_Q4_0_4_8] = {
+        .type_name                = "q4_0_4x8",
+        .blck_size                = QK4_0,
+        .type_size                = sizeof(block_q4_0),
+        .is_quantized             = true,
+        .to_float                 = NULL,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+        .ncols                    = 4,
+        .interleave_blcksize      = 8,
+        .gemv                     = ggml_gemv_q4_0_4x8_q8_0,
+        .gemm                     = ggml_gemm_q4_0_4x8_q8_0,
+    },
+    [GGML_TYPE_Q4_0_8_8] = {
+        .type_name                = "q4_0_8x8",
+        .blck_size                = QK4_0,
+        .type_size                = sizeof(block_q4_0),
+        .is_quantized             = true,
+        .to_float                 = NULL,
+        .from_float               = NULL,
+        .from_float_reference     = NULL,
+        .vec_dot                  = NULL,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+        .ncols                    = 8,
+        .interleave_blcksize      = 8,
+        .gemv                     = ggml_gemv_q4_0_8x8_q8_0,
+        .gemm                     = ggml_gemm_q4_0_8x8_q8_0,
    }
 };

@@ -3188,6 +3237,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
        case GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = GGML_TYPE_IQ4_XS;   break;
        case GGML_FTYPE_MOSTLY_IQ3_S:         wtype = GGML_TYPE_IQ3_S;    break;
        case GGML_FTYPE_MOSTLY_IQ2_S:         wtype = GGML_TYPE_IQ2_S;    break;
+        case GGML_FTYPE_MOSTLY_Q4_0_4_4:      wtype = GGML_TYPE_Q4_0_4_4; break;
+        case GGML_FTYPE_MOSTLY_Q4_0_4_8:      wtype = GGML_TYPE_Q4_0_4_8; break;
+        case GGML_FTYPE_MOSTLY_Q4_0_8_8:      wtype = GGML_TYPE_Q4_0_8_8; break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
    }
@@ -9432,6 +9484,9 @@ static void ggml_compute_forward_add(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_add_q_f32(params, dst);
            } break;
@@ -9807,6 +9862,9 @@ static void ggml_compute_forward_add1(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_add1_q_f32(params, dst);
            } break;
@@ -9932,6 +9990,9 @@ static void ggml_compute_forward_acc(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
        default:
            {
                GGML_ASSERT(false);
@@ -12134,6 +12195,12 @@ static void ggml_compute_forward_mul_mat(
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
    int64_t           const vec_dot_num_rows      = type_traits[type].nrows;
+    int64_t           const matmul_num_cols       = type_traits[type].ncols;
+    int64_t           const interleave_blcksize   = type_traits[type].interleave_blcksize;
+    ggml_from_float_to_mat_t const from_float_to_mat
+                                                  = type_traits[vec_dot_type].from_float_to_mat;
+    ggml_gemv_t       const gemv                  = type_traits[type].gemv;
+    ggml_gemm_t       const gemm                  = type_traits[type].gemm;

    GGML_ASSERT(ne0 == ne01);
    GGML_ASSERT(ne1 == ne11);
@@ -12192,7 +12259,16 @@ UseGgmlGemm1:;

        for (int64_t i13 = 0; i13 < ne13; ++i13) {
            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                int64_t i11_processed = 0;
+                if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
+                    for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+                        from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                                          (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                                          4, ne10, interleave_blcksize);
+                    }
+                    i11_processed = ne11 - ne11 % 4;
+                }
+                for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
                    from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
                                          (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
                                           ne10);
@@ -12273,6 +12349,28 @@ UseGgmlGemm2:;
    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

+    if ((ggml_n_dims(src0) == 2) && gemv) {
+        const void * src1_wdata      = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
+        int64_t src0_start = (ith * ne01) / nth;
+        int64_t src0_end   = ((ith + 1) * ne01) / nth;
+        src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
+        src0_end   = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
+        if (src0_start >= src0_end) return;
+
+        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        if (gemm && (ne11 > 3)) {
+            gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
+                 (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        }
+        for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
+            gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
+                 (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                 src0_end - src0_start);
+        }
+        return;
+    }
+
    // The first chunk comes from our thread_id, the rest will get auto-assigned.
    int current_chunk = ith;

@@ -12318,6 +12416,8 @@ static void ggml_compute_forward_mul_mat_id(
    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+    int64_t           const matmul_num_cols       = type_traits[type].ncols;
+    ggml_gemv_t       const gemv                  = type_traits[type].gemv;

    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == ggml_type_size(type));
@@ -12403,6 +12503,34 @@ static void ggml_compute_forward_mul_mat_id(
        const int64_t nr0 = ne01; // src0 rows
        const int64_t nr1 = cne1; // src1 rows

+        if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
+            int64_t src0_cur_start = (ith * ne01) / nth;
+            int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
+            src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
+            src0_cur_end   = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
+            if (src0_cur_start >= src0_cur_end) return;
+
+            for (int ir1 = 0; ir1 < nr1; ir1++) {
+                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
+                const int id       = row_mapping.i1; // selected expert index
+
+                const int64_t  i11 = id % ne11;
+                const int64_t  i12 = row_mapping.i2; // row index in src1
+
+                const int64_t  i1 = id;  // selected expert index
+                const int64_t  i2 = i12; // row
+
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                    ? (i11        + i12 * ne11) * row_size
+                    : (i11 * nb11 + i12 * nb12));
+
+                gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
+                     (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
+            }
+            continue;
+        }
+
        // distribute the thread work across the inner or outer loop based on which one is larger

        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
@@ -12704,6 +12832,9 @@ static void ggml_compute_forward_out_prod(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_out_prod_q_f32(params, dst);
            } break;
@@ -12889,6 +13020,9 @@ static void ggml_compute_forward_set(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
        default:
            {
                GGML_ASSERT(false);
@@ -13148,6 +13282,9 @@ static void ggml_compute_forward_get_rows(
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
            {
                ggml_compute_forward_get_rows_q(params, dst);
            } break;
@@ -13734,6 +13871,9 @@ static void ggml_compute_forward_clamp(
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_Q8_K:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
@@ -20457,6 +20597,9 @@ size_t ggml_quantize_chunk(
        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
@@ -21759,8 +21902,6 @@ int ggml_cpu_has_neon(void) {

 int ggml_cpu_has_sve(void) {
 #if defined(__ARM_FEATURE_SVE)
-    // TODO: Currently, SVE 256 bit is only supported.
-    GGML_ASSERT(svcntb() == QK8_0);
    return 1;
 #else
    return 0;
@@ -79,5 +79,4 @@ python -m twine upload dist/*
 ```

 ## TODO
- [ ] Add tests
 - [ ] Include conversion scripts as command line entry points in this package.
@@ -6,7 +6,6 @@ from typing import Any, Callable
 from collections import deque

 import numpy as np
-from numpy._typing import _Shape
 from numpy.typing import DTypeLike


@@ -219,7 +218,7 @@ class LazyNumpyTensor(LazyBase):
    _tensor_type = np.ndarray

    @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
+    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
        # The initial idea was to use np.nan as the fill value,
        # but non-float types like np.int16 can't use that.
        # So zero it is.
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.9.0"
+version = "0.9.1"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -4,7 +4,7 @@ GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.

 ## Background

-[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
+[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.

 ## Basics

@@ -162,6 +162,9 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_4_4      = 33, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_4_8      = 34, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@@ -63,6 +63,7 @@ while read c; do
        src/ggml*.metal \
        src/ggml*.cu \
        src/ggml-cuda/* \
+        src/ggml-sycl/* \
        include/ggml*.h \
        tests/test-opt.cpp \
        tests/test-grad0.cpp \
@@ -113,6 +114,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # src/ggml-quants.c       -> ggml/src/ggml-quants.c
    # src/ggml-quants.h       -> ggml/src/ggml-quants.h
    # src/ggml-rpc.cpp        -> ggml/src/ggml-rpc.cpp
+    # src/ggml-sycl/*         -> ggml/src/ggml-sycl/
    # src/ggml-sycl.cpp       -> ggml/src/ggml-sycl.cpp
    # src/ggml-vulkan.cpp     -> ggml/src/ggml-vulkan.cpp
    #
@@ -153,6 +155,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
@@ -1 +1 @@
-5378ea0d3c2f25bcd330ecb226ad2db454be86d0
+e3b3846976c94163f2b3dd128cc959782653edbb
@@ -18,6 +18,7 @@ cp -rpv ../ggml/src/ggml-metal.metal    ./ggml/src/ggml-metal.metal
 cp -rpv ../ggml/src/ggml-quants.c       ./ggml/src/ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h       ./ggml/src/ggml-quants.h
 cp -rpv ../ggml/src/ggml-rpc.cpp        ./ggml/src/ggml-rpc.cpp
+cp -rpv ../ggml/src/ggml-sycl/*         ./ggml/src/ggml-sycl/
 cp -rpv ../ggml/src/ggml-sycl.cpp       ./ggml/src/ggml-sycl.cpp
 cp -rpv ../ggml/src/ggml-vulkan.cpp     ./ggml/src/ggml-vulkan.cpp

@@ -57,6 +57,12 @@
    #include <io.h>
 #endif

+#if __cplusplus >= 202000L
+    #define LU8(x) (const char*)(u8##x)
+#else
+    #define LU8(x) u8##x
+#endif
+
 #include <algorithm>
 #include <array>
 #include <cassert>
@@ -3782,6 +3788,9 @@ struct llama_model_loader {
                case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
                case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
                case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
+                case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
+                case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
+                case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
                default:
                    {
                        LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -4475,6 +4484,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw";
+        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
+        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
+        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";

        default: return "unknown, may not work";
    }
@@ -13200,6 +13212,8 @@ struct llm_build_context {
                    LLM_NORM_RMS, cb, -1);
            cb(cur, "result_norm", -1);
        } else {
+            GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+
            struct ggml_tensor * embd_enc       = llm_build_inp_embd_enc();
            struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);

@@ -17760,6 +17774,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
                new_type = GGML_TYPE_IQ3_S;
            }
+            else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
+                     new_type == GGML_TYPE_Q4_0_8_8) {
+                new_type = GGML_TYPE_Q4_0;
+            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -18072,6 +18090,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@@ -18382,6 +18403,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                f32_data = (float *) f32_conv_buf.data();
            }

+            int chunk_size_multiplier = 1;
+            if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
+                if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
+                else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
+                if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
+                else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
+            }
+
            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
            fflush(stdout);

@@ -18394,7 +18423,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            const int64_t nrows = tensor->ne[1];

            static const int64_t min_chunk_size = 32 * 512;
-            const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
+                                       chunk_size_multiplier;

            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -21509,12 +21539,12 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|assistant|>";
        }
-    } else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
+    } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "user") {
-                ss << u8"<用户>";
+                ss << LU8("<用户>");
                ss << trim(message->content);
                ss << "<AI>";
            } else {
@@ -21530,7 +21560,7 @@ static int32_t llama_chat_apply_template_internal(
            } else if (role == "user") {
                ss << "User: " << message->content << "\n\n";
            } else if (role == "assistant") {
-                ss << "Assistant: " << message->content << u8"<｜end▁of▁sentence｜>";
+                ss << "Assistant: " << message->content << LU8("<｜end▁of▁sentence｜>");
            }
        }
        if (add_ass) {
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "unicode.h"
 #include "unicode-data.h"

@@ -1266,6 +1266,32 @@ struct test_pool2d : public test_case {
    }
 };

+// GGML_OP_CONV_TRANSPOSE_1D
+struct test_conv_transpose_1d : public test_case {
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+
+    const int s0; // stride
+    const int p0; // padding
+    const int d0; // dilation
+
+    std::string vars() override {
+        return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
+    }
+
+    test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
+                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
+                           int s0 = 1, int p0 = 0, int d0 = 1)
+        : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
+        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
+        ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
+        return out;
+    }
+};
+
 // GGML_OP_IM2COL
 struct test_im2col : public test_case {
    const ggml_type type_input;
@@ -1279,7 +1305,7 @@ struct test_im2col : public test_case {
    // padding
    const int p0;
    const int p1;
-    // dilatation
+    // dilation
    const int d0;
    const int d1;
    // mode
@@ -2098,6 +2124,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));

+    test_cases.emplace_back(new test_conv_transpose_1d());
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
+
+
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
Author	SHA1	Message	Date
Francis Couture-Harpin	ba06b2deb7	tokenize : add --no-parse-special option This should allow more easily explaining how parse_special affects tokenization.	2024-07-10 18:06:25 -04:00
Clint Herron	dd07a123b7	Name Migration: Build the deprecation-warning 'main' binary every time (#8404 ) * Modify the deprecation-warning 'main' binary to build every time, instead of only when a legacy binary is present. This is to help users of tutorials and other instruction sets from knowing what to do when the 'main' binary is missing and they are trying to follow instructions. * Adjusting 'server' name-deprecation binary to build all the time, similar to the 'main' legacy name binary.	2024-07-10 12:35:18 -04:00
AidanBeltonS	f4444d992c	[SYCL] Use multi_ptr to clean up deprecated warnings (#8256 )	2024-07-10 16:10:49 +01:00
Georgi Gerganov	6b2a849d1f	ggml : move sgemm sources to llamafile subfolder (#8394 ) ggml-ci	2024-07-10 15:23:29 +03:00
Dibakar Gope	0f1a39f343	ggml : add AArch64 optimized GEMV and GEMM Q4 kernels (#5780 ) * Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add copyright claim only to ggml-aarch64.cpp and ggml-aarch64.h files * Arm AArch64: minor code refactoring for rebase * Arm AArch64: minor code refactoring for resolving a build issue with cmake * Arm AArch64: minor code refactoring to split the Q4_0_AARC64 type into three separate types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: minor code change for resolving a build issue with server-windows * retrigger checks * Arm AArch64: minor code changes for rebase * Arm AArch64: minor changes to skip the pr#7433 vec_dot code for arm cpus with SVE VL not equal to 256 bits * Arm AArch64: remove stale LLAMA_QKK_64 from CMakeLists.txt and delete build.zig * Arm AArch64: add reference scalar gemm and gemv, and avoid dynamic memory allocations during quantization for Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: add multithreaded quantization support for the new types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: minor code refactoring * Arm AArch64: simplify logic for calling gemm and gemv functions in ggml_compute_forward_mul_mat * Arm AArch64: minimize changes in ggml_compute_forward_mul_mat * Arm AArch64: minor code refactoring, and add reference scalar code to quantize routines for new quant types * Arm AArch64: minor code refactoring * Arm AArch64: minor code refactoring * Arm AArch64: minor code refactoring * rebase on the latest master commit `3fd62a6` and adapt to the new directory structure * Arm AArch64: remove a redundant comment * Arm AArch64: add pragma in ggml-aarch64.c to turn -Woverlength-strings warning off * Arm AArch64: use __aarch64__ check to guard 64-bit neon kernels * Arm AArch64: update docs/build.md README to include compile time flags for buiilding the Q4_0_4_4 quant type	2024-07-10 15:14:51 +03:00
M. Yusuf Sarıgöz	83321c6958	gguf-py rel pipeline (#8410 ) * Upd gguf-py/readme * Bump patch version for release	2024-07-10 15:12:35 +03:00
Borislav Stanimirov	cc61948b1f	llama : C++20 compatibility for u8 strings (#8408 )	2024-07-10 14:45:44 +03:00
Borislav Stanimirov	7a80710d93	msvc : silence codecvt c++17 deprecation warnings (#8395 )	2024-07-10 14:40:53 +03:00
fairydreaming	a8be1e6f59	llama : add assert about missing llama_encode() call (#8400 ) Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>	2024-07-10 14:38:58 +03:00
RunningLeon	e4dd31ff89	py : fix converter for internlm2 (#8321 ) * update internlm2 * remove unused file * fix lint	2024-07-10 14:26:40 +03:00
laik	8f0fad42b9	py : fix extra space in convert_hf_to_gguf.py (#8407 )	2024-07-10 14:19:10 +03:00
Clint Herron	a59f8fdc85	Server: Enable setting default sampling parameters via command-line (#8402 ) * Load server sampling parameters from the server context by default. * Wordsmithing comment	2024-07-09 18:26:40 -04:00
Andy Salerno	fd560fe680	Update README.md to fix broken link to docs (#8399 ) Update the "Performance troubleshooting" doc link to be correct - the file was moved into a dir called 'development'	2024-07-09 14:58:44 -04:00
Clint Herron	e500d6135a	Deprecation warning to assist with migration to new binary names (#8283 ) * Adding a simple program to provide a deprecation warning that can exist to help people notice the binary name change from #7809 and migrate to the new filenames. * Build legacy replacement binaries only if they already exist. Check for their existence every time so that they are not ignored.	2024-07-09 11:54:43 -04:00
Johannes Gäßler	a03e8dd99d	make/cmake: LLAMA_NO_CCACHE -> GGML_NO_CCACHE (#8392 )	2024-07-09 17:11:07 +02:00
Alberto Cabrera Pérez	5b0b8d8cfb	sycl : Reenabled mmvq path for the SYCL Nvidia Backend (#8372 ) * SYCL : Reenabled mmvq path for the SYCL Nvidia Backend * Reduced verbosity of comment	2024-07-09 22:03:15 +08:00
Borislav Stanimirov	9925ca4087	cmake : allow external ggml (#8370 )	2024-07-09 11:38:00 +03:00
daghanerdonmez	9beb2dda03	readme : fix typo [no ci] (#8389 ) Bakus-Naur --> Backus-Naur	2024-07-09 09:16:00 +03:00
compilade	7d0e23d72e	gguf-py : do not use internal numpy types (#7472 )	2024-07-09 01:04:49 -04:00
Georgi Gerganov	7fdb6f73e3	flake.lock: Update (#8342 ) Flake lock file updates: • Updated input 'flake-parts': 'github:hercules-ci/flake-parts/2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8?narHash=sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw%3D' (2024-06-01) → 'github:hercules-ci/flake-parts/9227223f6d922fee3c7b190b2cc238a99527bbb7?narHash=sha256-pQMhCCHyQGRzdfAkdJ4cIWiw%2BJNuWsTX7f0ZYSyz0VY%3D' (2024-07-03) • Updated input 'flake-parts/nixpkgs-lib': 'https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz?narHash=sha256-lIbdfCsf8LMFloheeE6N31%2BBMIeixqyQWbSr2vk79EQ%3D' (2024-06-01) → 'https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz?narHash=sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI%3D' (2024-07-01) • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/b2852eb9365c6de48ffb0dc2c9562591f652242a?narHash=sha256-C8e9S7RzshSdHB7L%2Bv9I51af1gDM5unhJ2xO1ywxNH8%3D' (2024-06-27) → 'github:NixOS/nixpkgs/9f4128e00b0ae8ec65918efeba59db998750ead6?narHash=sha256-rwz8NJZV%2B387rnWpTYcXaRNvzUSnnF9aHONoJIYmiUQ%3D' (2024-07-03) Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>	2024-07-08 15:36:38 -07:00
Alberto Cabrera Pérez	a130eccef4	labeler : updated sycl to match docs and code refactor (#8373 )	2024-07-08 22:35:17 +02:00
b4b4o	c4dd11d1d3	readme : fix web link error [no ci] (#8347 )	2024-07-08 17:19:24 +03:00
Alberto Cabrera Pérez	2ec846d558	sycl : fix powf call in device code (#8368 )	2024-07-08 14:22:41 +01:00
Georgi Gerganov	3f2d538b81	scripts : fix sync for sycl	2024-07-08 13:51:31 +03:00
Georgi Gerganov	2ee44c9a18	sync : ggml ggml-ci	2024-07-08 12:23:00 +03:00
Georgi Gerganov	6847d54c4f	tests : fix whitespace (#0 )	2024-07-08 12:23:00 +03:00
John Balis	fde13b3bb9	feat: cuda implementation for `ggml_conv_transpose_1d` (ggml/854) * conv transpose 1d passing test for 1d input and kernel * working for different input and output channel counts, added test for variable stride * initial draft appears to work with stride other than 1 * working with all old and new conv1d tests * added a test for large tensors * removed use cuda hardcoding * restored test-conv-transpose.c * removed unused arugments, and fixed bug where test failure would cause subsequent tests to fail * fixed accumulator bug * added test to test-backend-ops * fixed mistake * addressed review * fixed includes * removed blank lines * style and warning fixes * return failure when test fails * fix supports_op --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-07-08 12:23:00 +03:00
Kevin Wang	470939d483	common : preallocate sampling token data vector (#8363 ) `emplace_back` repeatedly-called is slower than preallocating the vector to the vocab size and directly inserting the data. Some rudimentary profiling with `chrono` improves the performance of this block of code from ~500us/op to ~40us/op. Overall, this slightly improves the sampling performance which has a more substantial impact for the `examples/lookahead` implementation -- I am able to see a ~10% performance boost in lookahead inference.	2024-07-08 10:26:53 +03:00
Georgi Gerganov	6f0dbf6ab0	infill : assert prefix/suffix tokens + remove old space logic (#8351 )	2024-07-08 09:34:35 +03:00