readme : remove trailing space (#7469 )

ggml : silence UB sanitizer error during iq2_xxs quantization (#0 )
Fix phi3 chat template confusion with zephyr (#7449 )
2026-06-17 02:57:39 +02:00 · 2024-05-23 17:43:18 +03:00 · 2024-05-23 17:25:38 +03:00 · 2024-05-23 16:15:15 +02:00 · 2024-05-23 15:30:13 +03:00 · 2024-05-23 15:29:26 +03:00
25 changed files with 678 additions and 4983 deletions
@@ -62,6 +62,8 @@ server:
 ggml:
    - changed-files:
        - any-glob-to-any-file:
+            - ggml.c
+            - ggml.h
            - ggml-*.c
            - ggml-*.h
            - ggml-cuda/**
@@ -71,3 +73,6 @@ nix:
            - "**/*.nix"
            - .github/workflows/nix-*.yml
            - .devops/nix/nixpkgs-instances.nix
+embedding:
+    - changed-files:
+        - any-glob-to-any-file: examples/embedding/
@@ -124,7 +124,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
-option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
@@ -384,10 +383,6 @@ if (LLAMA_LLAMAFILE)
    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
 endif()

-if (LLAMA_QKK_64)
-    add_compile_definitions(GGML_QKK_64)
-endif()
-
 if (LLAMA_CUBLAS)
    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
    set(LLAMA_CUDA ON)
@@ -389,10 +389,6 @@ else
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif

-ifdef LLAMA_QKK_64
-	MK_CPPFLAGS += -DGGML_QKK_64
-endif
-
 ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -127,6 +127,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

@@ -140,6 +141,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
+- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)

 **HTTP server**

@@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
 }

 function gg_get_model {
-    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
-    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
-    if [[ -s $gguf_3b ]]; then
-        echo -n "$gguf_3b"
-    elif [[ -s $gguf_7b ]]; then
-        echo -n "$gguf_7b"
+    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
+    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
+    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_0 ]]; then
+        echo -n "$gguf_0"
+    elif [[ -s $gguf_1 ]]; then
+        echo -n "$gguf_1"
+    elif [[ -s $gguf_2 ]]; then
+        echo -n "$gguf_2"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
@@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
    gg_printf '```\n'
 }

-# open_llama_3b_v2
-
-function gg_run_open_llama_3b_v2 {
-    cd ${SRC}
-
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
-    path_models="../models-mnt/open-llama/3B-v2"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert.py ${path_models}
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_open_llama_3b_v2 {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'OpenLLaMA 3B-v2:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
 # open_llama_7b_v2
 # requires: GG_BUILD_CUDA

@@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models}
+    python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
 }

+# pythia_1.4b
+
+function gg_run_pythia_1_4b {
+    cd ${SRC}
+
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/pythia/1.4B"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    set +e
+}
+
+function gg_sum_pythia_1_4b {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Pythia 1.4B:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
+# pythia_2_8b
+# requires: GG_BUILD_CUDA
+
+function gg_run_pythia_2_8b {
+    cd ${SRC}
+
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+
+    path_models="../models-mnt/pythia/2.8B"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test="${path_wiki}/wiki.test.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    set +e
+}
+
+function gg_sum_pythia_2_8b {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Pythia 2.8B:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
 # bge-small

 function gg_run_embd_bge_small {
@@ -552,7 +688,7 @@ function gg_run_embd_bge_small {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert-hf-to-gguf.py ${path_models}
+    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -606,9 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
-            test $ret -eq 0 && gg_run open_llama_3b_v2
+            test $ret -eq 0 && gg_run pythia_1_4b
        else
-            test $ret -eq 0 && gg_run open_llama_7b_v2
+            test $ret -eq 0 && gg_run pythia_2_8b
+            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
@@ -673,6 +673,44 @@ class GPTNeoXModel(Model):
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
+            # Map bloom-style qkv_linear to gpt-style qkv_linear
+            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
+            data_torch = torch.cat(
+                (
+                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.weight")
+        elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
+            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
+            data_torch = torch.cat(
+                (
+                    qkv_bias[:, 0, :].reshape((n_embed,)),
+                    qkv_bias[:, 1, :].reshape((n_embed,)),
+                    qkv_bias[:, 2, :].reshape((n_embed,)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.bias")
+
+        tensors.append((self.map_tensor_name(name), data_torch))
+
+        return tensors
+

@Model.register("BloomForCausalLM")
 class BloomModel(Model):
@@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                         set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
        struct ggml_tensor * t16;
        if (enable_flash_attn) {
-            t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
+            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
+            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
        } else {
            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                  set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);           set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
@@ -341,7 +341,8 @@ static struct ggml_tensor * llama_build_train_graphs(
        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                        set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
        struct ggml_tensor * t16;
        if (enable_flash_attn) {
-            t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                        set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
+            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
+            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                        set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
        } else {
            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                 set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);          set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
 // QK = number of values after dequantization
 // QK_K = super-block size

-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
 #define QK_K 256
 #define K_SCALE_SIZE 12
-#endif // GGML_QKK_64

 #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
 // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2

-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
-#endif

 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
 // weight is represented as x = a * q
 // 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_half d;           // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
 typedef struct {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
    ggml_half d;           // super-block scale
 } block_q3_K;
 static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif

 // 4-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d[2];     // super-block scales/mins
-    uint8_t scales[2];  // 4-bit block scales/mins
-    uint8_t qs[QK_K/2]; // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
 typedef struct {
    union {
        struct {
@@ -270,21 +242,11 @@ typedef struct {
    uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif

 // 5-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d;             // super-block scale
-    int8_t  scales[QK_K/16]; // 8-bit block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
 typedef struct {
    union {
        struct {
@@ -298,7 +260,6 @@ typedef struct {
    uint8_t qs[QK_K/2];           // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif

 // 6-bit quantization
 // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

 // 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
 #define IQ3S_N_SCALE QK_K/64
-#endif
 typedef struct {
    ggml_half d;
    uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
 typedef struct {
    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
-    ggml_half d;
-#endif
    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
 } block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif

 // Used by IQ1_M quants
 typedef union {
@@ -406,9 +356,6 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");

-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
 typedef struct {
    ggml_half d;
    uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
    uint8_t  qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif

 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
@@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
    const block_q2_K * x = (const block_q2_K *) vx;

    const int64_t tid = threadIdx.x;
-#if QK_K == 256
    const int64_t n   = tid/32;
    const int64_t l   = tid - 32*n;
    const int64_t is  = 8*n + l/16;
@@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int64_t is = tid/16;  // 0 or 1
-    const int64_t il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }

 template<typename dst_t>
@@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
    const int64_t i = blockIdx.x;
    const block_q3_K * x = (const block_q3_K *) vx;

-#if QK_K == 256
    const int64_t r = threadIdx.x/4;
    const int64_t tid = r/2;
    const int64_t is0 = r%2;
@@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
    const uint8_t * hm = x[i].hmask;

    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int64_t tid = threadIdx.x;
-    const int64_t is  = tid/16;  // 0 or 1
-    const int64_t il  = tid%16;  // 0...15
-    const int64_t im  = il/8;    // 0...1
-    const int64_t in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }

-#if QK_K == 256
 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j < 4) {
        d = q[j] & 63; m = q[j + 4] & 63;
@@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
 }
-#endif

 template<typename dst_t>
 static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
@@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t

    const int64_t i = blockIdx.x;

-#if QK_K == 256
    // assume 32 threads
    const int64_t tid = threadIdx.x;
    const int64_t il  = tid/8;
@@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
        y[l + 0] = d1 * (q[l] & 0xF) - m1;
        y[l +32] = d2 * (q[l] >>  4) - m2;
    }
-#else
-    const int64_t tid = threadIdx.x;
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }

 template<typename dst_t>
@@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t

    const int64_t i = blockIdx.x;

-#if QK_K == 256
    // assume 64 threads - this is very slightly better than the one below
    const int64_t tid = threadIdx.x;
    const int64_t il  = tid/16;   // il is in 0...3
@@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
    hm <<= 1;
    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int64_t tid = threadIdx.x;
-    const uint8_t q = x[i].qs[tid];
-    const int64_t im = tid/8;  // 0...3
-    const int64_t in = tid%8;  // 0...7
-    const int64_t is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }

 template<typename dst_t>
@@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
    const block_q6_K * x = (const block_q6_K *) vx;

    const int64_t i = blockIdx.x;
-#if QK_K == 256

    // assume 64 threads - this is very slightly better than the one below
    const int64_t tid = threadIdx.x;
@@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t ip  = tid/16;         // 0 or 1
-    const int64_t il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }

 template<typename dst_t>
@@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;

    const int64_t tid = threadIdx.x;
-#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }

 template<typename dst_t>
@@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

    const int64_t tid = threadIdx.x;
-#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }

 template<typename dst_t>
@@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
    const block_iq2_s * x = (const block_iq2_s *) vx;

    const int64_t tid = threadIdx.x;
-#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }

 template<typename dst_t>
@@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;

    const int64_t tid = threadIdx.x;
-#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }

 template<typename dst_t>
@@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
    const block_iq3_s * x = (const block_iq3_s *) vx;

    const int64_t tid = threadIdx.x;
-#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }

 template<typename dst_t>
@@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
    const block_iq1_s * x = (const block_iq1_s  *) vx;

    const int64_t tid = threadIdx.x;
-#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }

 template<typename dst_t>
@@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
    const block_iq1_m * x = (const block_iq1_m  *) vx;

    const int64_t tid = threadIdx.x;
-#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
-#else
-    NO_DEVICE_CODE;
-#endif
-
 }

-
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

@@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
    }
-
 }

-#if QK_K != 64
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i   = blockIdx.x;
@@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
    }
 }
-#endif

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
@@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }

 template<typename dst_t>
 static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }

 template<typename dst_t>
@@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k
 template<typename dst_t>
 static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }

 template<typename dst_t>
 static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }

 template<typename dst_t>
@@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
 template<typename dst_t>
 static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
-#else
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
-#endif
 }

 template <typename src_t, typename dst_t>
@@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,

    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

@@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
        tmp += dall * sum1 - dmin * sum2;

    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
@@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,

    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
-
    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

@@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
        tmp += d * sum;

    }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
@@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,

    const block_q4_K * x = (const block_q4_K *)vx + ib0;

-#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;
@@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 #endif

    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
@@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,

    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;
@@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
    }

-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

@@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,

    const block_q6_K * x = (const block_q6_K *)vx + ib0;

-#if QK_K == 256
-
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

@@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,

    }

-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

@@ -826,11 +826,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

-#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
    }

 #pragma unroll
@@ -933,9 +929,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

-#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
    }

 #pragma unroll
@@ -712,7 +712,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

-#ifndef GGML_QKK_64
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    int    v[2];
@@ -754,58 +753,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    }

    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = __low2float(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
-    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
 }

 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

-#ifndef GGML_QKK_64
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    int   vl[2];
@@ -847,48 +799,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    }

    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = __low2half(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
 }

 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -919,7 +829,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(

 static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

 #if QR2_XXS == 8
@@ -960,15 +869,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    }
    return d * (sumi1 + sumi2);
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

    const int ib32 = iqs;
@@ -1002,17 +907,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    GGML_UNUSED(ksigns64);
    NO_DEVICE_CODE;
 #endif
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
 }

 // TODO
 static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

    const int ib32 = iqs;
@@ -1048,16 +948,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    GGML_UNUSED(ksigns64);
    NO_DEVICE_CODE;
 #endif
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
 }

 static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

    const int ib32 = iqs;
@@ -1082,16 +977,12 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #else
    NO_DEVICE_CODE;
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 // TODO: don't use lookup table for signs
 static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

    const int ib32 = iqs;
@@ -1114,14 +1005,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
 #else
    NO_DEVICE_CODE;
 #endif
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

    const int ib32 = iqs;
@@ -1149,14 +1036,10 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const float d = d1q * __low2float (bq8_1[ib32].ds);
    const float m = d1q * __high2float(bq8_1[ib32].ds);
    return d * sumi + m * delta;
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

    const int ib32 = iqs;
@@ -1192,9 +1075,6 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
    const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
    return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    NO_DEVICE_CODE;
-#endif
 }

 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -1250,9 +1130,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
 static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

-#if QK_K == 256
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-
    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

@@ -1270,10 +1148,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
        sumi2 = __dp4a(v2, q8[j+4], sumi2);
    }
    return d * (sumi1 + sumi2);
-
-#else
-    NO_DEVICE_CODE;
-#endif
 #else
    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
 #endif
@@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                // dictionary of preprocessor macros
                NSMutableDictionary * prep = [NSMutableDictionary dictionary];

-#ifdef GGML_QKK_64
-                prep[@"GGML_QKK_64"] = @(1);
-#endif
-
                MTLCompileOptions* options = [MTLCompileOptions new];
                options.preprocessorMacros = prep;

@@ -1773,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
                            }
                            else if (src0t == GGML_TYPE_Q5_K) {
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                    {
                                        nth0 = 4;
                                        nth1 = 16;
-                                    #if QK_K == 64
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
-                                    #else
                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
-                                    #endif
-
                                    } break;
                                default:
                                    {
@@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
                            }
                            else if (src0t == GGML_TYPE_Q5_K) {
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -3386,7 +3386,6 @@ void kernel_mul_mv_q2_K_f32_impl(

    const int step = sizeof(block_q2_K) * nb;

-#if QK_K == 256
    const int ix = tiisg/8;  // 0...3
    const int it = tiisg%8;  // 0...7
    const int iq = it/4;     // 0 or 1
@@ -3438,57 +3437,6 @@ void kernel_mul_mv_q2_K_f32_impl(

        y4 += 4 * QK_K;
    }
-#else
-    const int ix = tiisg/2;  // 0...15
-    const int it = tiisg%2;  // 0...1
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    for (int ib = ix; ib < nb; ib += 16) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
-            yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
-            yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
-            yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
-        }
-
-        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
-                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
-                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
-                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
-                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
-                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
-                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
-                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
-                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
-                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
-                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
-                         dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
-
-            qs += step/2;
-            sc += step;
-            dh += step/2;
-        }
-
-        y4 += 16 * QK_K;
-    }
-#endif

    for (int row = 0; row < N_DST; ++row) {
        all_sum = simd_sum(sumf[row]);
@@ -3526,7 +3474,6 @@ kernel void kernel_mul_mv_q2_K_f32(
    kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }

-#if QK_K == 256
 void kernel_mul_mv_q3_K_f32_impl(
        device const  void * src0,
        device const float * src1,
@@ -3685,84 +3632,6 @@ void kernel_mul_mv_q3_K_f32_impl(
        }
    }
 }
-#else
-void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const int row = 2 * r0 + sgitg;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    const int ix = tiisg/4;
-    const int il = 4 * (tiisg%4);// 0, 4, 8, 12
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    float2 sum = {0.f, 0.f};
-
-    for (int i = ix; i < nb; i += 8) {
-
-        const float d_all = (float)(x[i].d);
-
-        device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
-        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
-        device const uint16_t * s = (device const uint16_t *)(x[i].scales);
-        device const float    * y = yy + i * QK_K + il;
-
-        const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
-        const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
-        const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
-        const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
-
-        for (int l = 0; l < 4; l += 2) {
-            const uint16_t hm = h[l/2] >> iq;
-            sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 :  4))
-                    + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
-                    + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
-                    + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
-            sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
-                    + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
-                    + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
-                    + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
-        }
-
-    }
-    const float sumf = sum[0] + sum[1] * 1.f/256.f;
-
-    const float tot = simd_sum(sumf);
-    if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
-    }
-
-}
-#endif

 [[host_name("kernel_mul_mv_q3_K_f32")]]
 kernel void kernel_mul_mv_q3_K_f32(
@@ -3792,7 +3661,6 @@ kernel void kernel_mul_mv_q3_K_f32(
    kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
 }

-#if QK_K == 256
 void kernel_mul_mv_q4_K_f32_impl(
        device const  void * src0,
        device const float * src1,
@@ -3906,103 +3774,6 @@ void kernel_mul_mv_q4_K_f32_impl(
        }
    }
 }
-#else
-void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int ix = tiisg/4;  // 0...7
-    const int it = tiisg%4;  // 0...3
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[8];
-    float yh[8];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q4_K) * nb / 2;
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    uint16_t sc16[4];
-
-    for (int ib = ix; ib < nb; ib += 8) {
-
-        float2 sumy = {0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
-            yh[i] = y4[i+32]; sumy[1] += yh[i];
-        }
-
-        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            sc16[0] = sc[0] & 0x000f;
-            sc16[1] = sc[0] & 0x0f00;
-            sc16[2] = sc[0] & 0x00f0;
-            sc16[3] = sc[0] & 0xf000;
-
-            float2 acc1 = {0.f, 0.f};
-            float2 acc2 = {0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
-                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
-                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
-                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
-                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
-                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
-
-            qs += step;
-            sc += step;
-            dh += step;
-        }
-
-        y4 += 8 * QK_K;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#endif

 [[host_name("kernel_mul_mv_q4_K_f32")]]
 kernel void kernel_mul_mv_q4_K_f32(
@@ -4070,8 +3841,6 @@ void kernel_mul_mv_q5_K_f32_impl(

    const int step = sizeof(block_q5_K) * nb;

-#if QK_K == 256
-#
    float yl[16], yh[16];

    const uint16_t kmask1 = 0x3f3f;
@@ -4154,54 +3923,6 @@ void kernel_mul_mv_q5_K_f32_impl(
        y1 += 4 * QK_K;

    }
-#else
-    float yl[8], yh[8];
-
-    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
-    const int ix = tiisg%8;
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    device const float * y = yy + ix*QK_K + il;
-
-    for (int i = ix; i < nb; i += 8) {
-
-        for (int l = 0; l < 4; ++l) {
-            yl[l+0] = y[l+ 0];
-            yl[l+4] = y[l+16];
-            yh[l+0] = y[l+32];
-            yh[l+4] = y[l+48];
-        }
-
-        device const half * dh = &x[i].d;
-        device const uint8_t * q = x[i].qs + il;
-        device const uint8_t * h = x[i].qh + in;
-        device const int8_t  * s = x[i].scales;
-
-        for (int row = 0; row < 2; ++row) {
-
-            const float d = dh[0];
-
-            float2 acc = {0.f, 0.f};
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t hl = h[l] >> iq;
-                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
-                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
-                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
-                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
-            }
-            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
-
-            q += step;
-            h += step;
-            s += step;
-            dh += step/2;
-
-        }
-
-        y += 8 * QK_K;
-    }
-#endif

    for (int row = 0; row < 2; ++row) {
        const float tot = simd_sum(sumf[row]);
@@ -4280,7 +4001,6 @@ void kernel_mul_mv_q6_K_f32_impl(

    float sumf = 0;

-#if QK_K == 256
    const int tid  = tiisg/2;
    const int ix   = tiisg%2;
    const int ip   = tid/8;         // 0 or 1
@@ -4316,30 +4036,6 @@ void kernel_mul_mv_q6_K_f32_impl(

    }

-#else
-    const int ix  = tiisg/4;
-    const int il  = 4*(tiisg%4);
-
-    for (int i = ix; i < nb; i += 8) {
-        device const float * y = yy + i * QK_K + il;
-        device const uint8_t * ql = x[i].ql + il;
-        device const uint8_t * qh = x[i].qh + il;
-        device const int8_t  * s  = x[i].scales;
-
-        const float d = x[i].d;
-
-        float4 sums = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < 4; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32);
-            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32);
-        }
-        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
-    }
-
-#endif
-
    const float tot = simd_sum(sumf);
    if (tiisg == 0) {
        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
@@ -5173,9 +4869,7 @@ void kernel_mul_mv_iq1_m_f32_impl(

    device const float * y4 = y + 32 * ix;

-#if QK_K != 64
    iq1m_scale_t scale;
-#endif

    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {

@@ -5196,10 +4890,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
        device const uint16_t * sc = (device const uint16_t *)xr->scales;

        for (int row = 0; row < N_DST; row++) {
-
-#if QK_K != 64
            scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-#endif

            constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
            constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
@@ -5215,14 +4906,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
            }
            const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
            const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
-#if QK_K == 64
-            const float d = (float) *((device const half *)(sc - 1));
-            sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
-                              (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
-#else
+
            sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
                                             (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
-#endif

            sc += nb*sizeof(block_iq1_m)/2;
            qs += nb*sizeof(block_iq1_m);
@@ -5334,7 +5020,6 @@ void kernel_mul_mv_iq4_nl_f32_impl(
    }
 }

-#if QK_K != 64
 void kernel_mul_mv_iq4_xs_f32_impl(
        device const  void * src0,
        device const float * src1,
@@ -5429,7 +5114,6 @@ void kernel_mul_mv_iq4_xs_f32_impl(
        }
    }
 }
-#endif

 [[host_name("kernel_mul_mv_iq1_s_f32")]]
 kernel void kernel_mul_mv_iq1_s_f32(
@@ -5542,11 +5226,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
        uint tiisg[[thread_index_in_simdgroup]],
        uint sgitg[[simdgroup_index_in_threadgroup]]) {

-#if QK_K == 64
-    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#else
    kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#endif
 }

 //============================= templates and their specializations =============================
@@ -5672,10 +5352,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
    float dl, ml;
    uint8_t sc = xb->scales[il];

-#if QK_K == 256
    q = q + 32*(il/8) + 16*(il&1);
    il = (il/2)%4;
-#endif
+
    half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
    uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
    dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
@@ -5691,7 +5370,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
    device const uint8_t * h = (device const uint8_t *)xb->hmask;
    device const int8_t * scales = (device const int8_t *)xb->scales;

-#if QK_K == 256
    q = q + 32 * (il/8) + 16 * (il&1);
    h = h + 16 * (il&1);
    uint8_t m = 1 << (il/2);
@@ -5712,17 +5390,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
    for (int i = 0; i < 16; ++i) {
        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
    }
-#else
-    float    kcoef = il&1 ? 1.f/16.f : 1.f;
-    uint16_t kmask = il&1 ? 0xF0     : 0x0F;
-    float    dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
-    float    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uint8_t  mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    uint8_t  m = 1<<(il*2);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
-    }
-#endif
 }

 static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
@@ -5734,7 +5401,6 @@ template <typename type4x4>
 void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
    device const uchar * q = xb->qs;

-#if QK_K == 256
    short is = (il/4) * 2;
    q = q + (il/4) * 32 + 16 * (il&1);
    il = il & 3;
@@ -5743,16 +5409,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
    const float min = xb->dmin;
    const float dl = d * sc[0];
    const float ml = min * sc[1];
-#else
-    (void) get_scale_min_k4_just2;

-    q = q + 16 * (il&1);
-    device const uint8_t * s = xb->scales;
-    device const half2 * dh = (device const half2 *)xb->d;
-    const float2 d = (float2)dh[0];
-    const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
-    const float ml = il<2 ? d[1] * (s[0]>>4)  : d[1] * (s[1]>>4);
-#endif
    const ushort mask = il<2 ? 0x0F : 0xF0;
    for (int i = 0; i < 16; ++i) {
        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
@@ -5764,7 +5421,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
    device const uint8_t * q  = xb->qs;
    device const uint8_t * qh = xb->qh;

-#if QK_K == 256
    short is = (il/4) * 2;
    q  = q + 32 * (il/4) + 16 * (il&1);
    qh = qh + 16 * (il&1);
@@ -5781,17 +5437,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
    for (int i = 0; i < 16; ++i) {
        reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
    }
-#else
-    q = q + 16 * (il&1);
-    device const int8_t * s = xb->scales;
-    const float dl = xb->d * s[il];
-    uint8_t m = 1<<(il*2);
-    const float  coef = il<2 ? 1.f  : 1.f/16.f;
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
-    }
-#endif
 }

 template <typename type4x4>
@@ -5801,15 +5446,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
    device const uint8_t * qh = (device const uint8_t *)xb->qh;
    device const int8_t * scales = (device const int8_t *)xb->scales;

-#if QK_K == 256
    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
    qh = qh + 32*(il/8) + 16*(il&1);
    float sc = scales[(il%2) + 2 * ((il/2))];
    il = (il/2) & 3;
-#else
-    ql = ql + 16 * (il&1);
-    float sc = scales[il];
-#endif
+
    const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
    const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
    const float       coef = il>1 ? 1.f/16.f          : 1.f;
@@ -5966,20 +5607,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 &
    const int ib32 = il/2;
    il = il%2;
    device const uint16_t * sc = (device const uint16_t *)xb->scales;
-#if QK_K == 64
-    const float d = xb->d;
-#else
+
    iq1m_scale_t scale;
    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
    const float d = scale.f16;
-#endif
+
    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
    device const uint8_t * qh = xb->qh + 2*ib32 + il;
-#if QK_K == 64
-    const float dl  = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
-#else
+
    const float dl  = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
-#endif
    const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
    const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
@@ -6009,9 +5645,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4

 template <typename type4x4>
 void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
-#if QK_K == 64
-    dequantize_iq4_nl(xb, il, reg);
-#else
    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
    const int ib32 = il/2;
    il = il%2;
@@ -6028,7 +5661,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
    }
-#endif
 }

 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
@@ -6533,11 +6165,7 @@ kernel void kernel_mul_mm_id(
        sgitg);
 }

-#if QK_K == 256
 #define QK_NL 16
-#else
-#define QK_NL 4
-#endif

 //
 // get rows
@@ -6577,11 +6205,7 @@ template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_t kernel_get_r
 template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_get_rows_iq1_m")]]   kernel get_rows_t kernel_get_rows<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_t kernel_get_rows<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif

 //
 // matrix-matrix multiplication
@@ -6609,11 +6233,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_m
 template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif

 //
 // indirect matrix-matrix multiplication
@@ -6641,11 +6261,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel
 template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 template [[host_name("kernel_mul_mm_id_iq1_m_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m,   QK_NL, dequantize_iq1_m>;
 template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
 template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif

 //
 // matrix-vector multiplication
@@ -6854,7 +6470,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq3_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
-#if QK_K != 64
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
-#endif

@@ -1,4 +1,4 @@
-#include "ggml.h"
+#include "ggml.h"
 #include "ggml-opencl.h"
 #include "ggml-backend-impl.h"

@@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
    const block_q2_K * x = (const block_q2_K *) vx;

    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
    const int n   = tid/32;
    const int l   = tid - 32*n;
    const int is  = 8*n + l/16;
@@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }

 template<typename dst_t>
@@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
    const int i = item_ct1.get_group(2);
    const block_q3_K * x = (const block_q3_K *) vx;

-#if QK_K == 256
    const int r = item_ct1.get_local_id(2) / 4;
    const int tid = r/2;
    const int is0 = r%2;
@@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
    const uint8_t * hm = x[i].hmask;

    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }

-#if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j < 4) {
        d = q[j] & 63; m = q[j + 4] & 63;
@@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
 }
-#endif

 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri

    const int i = item_ct1.get_group(2);

-#if QK_K == 256
    // assume 32 threads
    const int tid = item_ct1.get_local_id(2);
    const int il  = tid/8;
@@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
        y[l + 0] = d1 * (q[l] & 0xF) - m1;
        y[l +32] = d2 * (q[l] >>  4) - m2;
    }
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }

 template<typename dst_t>
@@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri

    const int i = item_ct1.get_group(2);

-#if QK_K == 256
    // assume 64 threads - this is very slightly better than the one below
    const int tid = item_ct1.get_local_id(2);
    const int il  = tid/16;   // il is in 0...3
@@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
    hm <<= 1;
    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }

 template<typename dst_t>
@@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
    const block_q6_K * x = (const block_q6_K *) vx;

    const int i = item_ct1.get_group(2);
-#if QK_K == 256

    // assume 64 threads - this is very slightly better than the one below
    const int tid = item_ct1.get_local_id(2);
@@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }

 template<typename dst_t>
@@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;

    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }

 template<typename dst_t>
@@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
    const block_iq2_s * x = (const block_iq2_s *) vx;

    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
 #pragma unroll
-    for (int j = 0; j < 8; ++j)
+    for (int j = 0; j < 8; ++j) {
        y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-
-#endif
-
+    }
 }

 template<typename dst_t>
@@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;

    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
    const block_iq3_s * x = (const block_iq3_s *) vx;

    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
    const block_iq1_s * x = (const block_iq1_s  *) vx;

    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
    const block_iq1_m * x = (const block_iq1_m  *) vx;

    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,

    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
    const int tid =
        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
    const int ix =
@@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
        tmp += dall * sum1 - dmin * sum2;

    }
-#else
-    const int tid = item_ct1.get_local_id(2) /
-                    (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
-    const int ix = item_ct1.get_local_id(2) %
-                   (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const sycl::float2 dall =
-            x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x() * sum1 - dall.y() * sum2;
-    }
-
-#endif

    // sum up partial sums and write back result
 #pragma unroll
@@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,

    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
-
    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

@@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
        tmp += d * sum;

    }
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif

    // sum up partial sums and write back result
 #pragma unroll
@@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,

    const block_q4_K * x = (const block_q4_K *)vx + ib0;

-#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;
@@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 #endif

    }
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif

    // sum up partial sums and write back result
 #pragma unroll
@@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,

    float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;
@@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
               dmin * smin;
    }

-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
    // sum up partial sums and write back result
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

    const block_q6_K * x = (const block_q6_K *)vx + ib0;

-#if QK_K == 256
-
    const int tid =
        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
    const int ix =
@@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

    }

-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
    // sum up partial sums and write back result
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
@@ -6857,7 +6586,6 @@ static __dpct_inline__ float
 vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#ifndef GGML_QKK_64
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    int    v[2];
@@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
    }

    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
-    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }

 template <int mmq_y>
@@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,

        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

-#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
    }

 #pragma unroll
@@ -7050,7 +6728,6 @@ static __dpct_inline__ float
 vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#ifndef GGML_QKK_64
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    int   vl[2];
@@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
    }

    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }

 template <int mmq_y>
@@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,

        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

-#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
    }

 #pragma unroll
@@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                     const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
                     const uint8_t *kmask_iq2xs) {
-#if QK_K == 256
    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

 #if QR2_XXS == 8
@@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
    }
    return d * (sumi1 + sumi2);
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float
@@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
                    const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

    const int ib32 = iqs;
@@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
    assert(false);
    return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

    const int ib32 = iqs;
@@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
    }
    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float
@@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
                     const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

    const int ib32 = iqs;
@@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
    assert(false);
    return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                   const uint32_t *iq3s_grid) {
-#if QK_K == 256
    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

    const int ib32 = iqs;
@@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
        (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
        bq8_1[ib32].ds[0];
    return d * sumi;
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                   const uint32_t *iq1s_grid_gpu) {
-#if QK_K == 256
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

    const int ib32 = iqs;
@@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
    const float d = d1q * bq8_1[ib32].ds[0];
    const float m = d1q * bq8_1[ib32].ds[1];
    return d * sumi + m * delta;
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

    const int ib32 = iqs;
@@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
    const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
    return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7720,7 +7322,6 @@ static __dpct_inline__ float
 vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#if QK_K == 256
    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

@@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
        sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
    }
    return d * (sumi1 + sumi2);
-#else
-    assert(false);
-#endif
 }

 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -10203,7 +9801,6 @@ template <typename dst_t>
 static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                     dpct::queue_ptr stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});
@@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                 dequantize_block_q2_K(vx, y, item_ct1);
                             });
    }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>
 static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                     dpct::queue_ptr stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});
@@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                 dequantize_block_q3_K(vx, y, item_ct1);
                             });
    }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#endif
 }

 template <typename dst_t>
@@ -10320,7 +9889,6 @@ template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                     dpct::queue_ptr stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});
@@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                 dequantize_block_q5_K(vx, y, item_ct1);
                             });
    }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>
 static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                     dpct::queue_ptr stream) {
    const int nb = k / QK_K;
-#if QK_K == 256
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});
@@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                 dequantize_block_q6_K(vx, y, item_ct1);
                             });
    }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>
@@ -10529,9 +10068,6 @@ template <typename dst_t>
 static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
    const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
-#else
      {
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});
@@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                      });
            });
      }
-#endif
 }


@@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                        const int nrows_y, const int nrows_dst,
                                        dpct::queue_ptr stream) try {

-#if QK_K == 256
-
    int id;
    SYCL_CHECK(
        CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
            });
        }
    }
-#endif
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -871,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
    },
    [GGML_TYPE_IQ4_XS] = {
        .type_name                = "iq4_xs",
-#if QK_K == 64
-        .blck_size                = QK4_NL,
-#else
        .blck_size                = QK_K,
-#endif
        .type_size                = sizeof(block_iq4_xs),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
        .from_float               = quantize_row_iq4_xs,
        .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_xs_reference,
        .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
-#if QK_K == 64
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-#else
        .vec_dot_type             = GGML_TYPE_Q8_K,
-#endif
        .nrows                    = 1,
    },
    [GGML_TYPE_Q8_K] = {
@@ -2678,9 +2670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "ARGSORT",
    "LEAKY_RELU",

-    "FLASH_ATTN",
    "FLASH_ATTN_EXT",
-    "FLASH_FF",
    "FLASH_ATTN_BACK",
    "SSM_CONV",
    "SSM_SCAN",
@@ -2706,7 +2696,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@@ -2768,9 +2758,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "argsort(x)",
    "leaky_relu(x)",

-    "flash_attn(x)",
    "flash_attn_ext(x)",
-    "flash_ff(x)",
    "flash_attn_back(x)",
    "ssm_conv(x)",
    "ssm_scan(x)",
@@ -2796,7 +2784,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -6956,38 +6944,6 @@ struct ggml_tensor * ggml_top_k(
    return result;
 }

-// ggml_flash_attn
-
-struct ggml_tensor * ggml_flash_attn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * q,
-        struct ggml_tensor  * k,
-        struct ggml_tensor  * v,
-        bool                  masked) {
-    GGML_ASSERT(ggml_can_mul_mat(k, q));
-    // TODO: check if vT can be multiplied by (k*qT)
-
-    bool is_node = false;
-
-    if (q->grad || k->grad || v->grad) {
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
-
-    int32_t t = masked ? 1 : 0;
-    ggml_set_op_params(result, &t, sizeof(t));
-
-    result->op   = GGML_OP_FLASH_ATTN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = q;
-    result->src[1] = k;
-    result->src[2] = v;
-
-    return result;
-}
-
 // ggml_flash_attn_ext

 struct ggml_tensor * ggml_flash_attn_ext(
@@ -7047,38 +7003,6 @@ void ggml_flash_attn_ext_set_prec(
    ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
 }

-// ggml_flash_ff
-
-struct ggml_tensor * ggml_flash_ff(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b0,
-        struct ggml_tensor  * b1,
-        struct ggml_tensor  * c0,
-        struct ggml_tensor  * c1) {
-    GGML_ASSERT(ggml_can_mul_mat(b0, a));
-    // TODO: more checks
-
-    bool is_node = false;
-
-    if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
-
-    result->op   = GGML_OP_FLASH_FF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b0;
-    result->src[2] = b1;
-    result->src[3] = c0;
-    result->src[4] = c1;
-
-    return result;
-}
-
 // ggml_flash_attn_back

 struct ggml_tensor * ggml_flash_attn_back(
@@ -7088,6 +7012,8 @@ struct ggml_tensor * ggml_flash_attn_back(
        struct ggml_tensor  * v,
        struct ggml_tensor  * d,
        bool                  masked) {
+    GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes");
+
    GGML_ASSERT(ggml_can_mul_mat(k, q));
    // TODO: check if vT can be multiplied by (k*qT)

@@ -15717,400 +15643,6 @@ static void ggml_compute_forward_argsort(
    }
 }

-// ggml_compute_forward_flash_attn
-
-static void ggml_compute_forward_flash_attn_f32(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-    const struct ggml_tensor * k = dst->src[1];
-    const struct ggml_tensor * v = dst->src[2];
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(float));
-    GGML_ASSERT(nbv0 == sizeof(float));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
-        for (int64_t ic = 0; ic < masked_begin; ++ic) {
-            // k indices
-            const int ik3 = iq3;
-            const int ik2 = iq2 % nek2;
-            const int ik1 = ic;
-
-            // S indices
-            const int i1 = ik1;
-
-            ggml_vec_dot_f32(neq0,
-                    S + i1, 0,
-                    (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                    (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
-        }
-
-        // scale
-        ggml_vec_scale_f32(masked_begin, S, scale);
-
-        for (int64_t i = masked_begin; i < M; i++) {
-            S[i] = -INFINITY;
-        }
-
-        // softmax
-        // exclude known -INF S[..] values from max and loop
-        // dont forget to set their SW values to zero
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(masked_begin, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                sum = ggml_vec_soft_max_f32(Mup, S, S, max);
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(masked_begin, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < masked_begin; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        for (int64_t ic = 0; ic < nev1; ++ic) {
-            // dst indices
-            const int i1 = iq1;
-            const int i2 = iq2;
-            const int i3 = iq3;
-
-            // v indices
-            const int iv2 = iq2 % nev2;
-            const int iv3 = iq3;
-
-            ggml_vec_dot_f32(masked_begin,
-                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)), 0,
-                    (float *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
-                    S, 0, 1);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn_f16(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-    const struct ggml_tensor * k = dst->src[1];
-    const struct ggml_tensor * v = dst->src[2];
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
-            for (int64_t ic = 0; ic < nek1; ++ic) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2 % nek2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16(neq0,
-                        S + i1, 0,
-                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
-            }
-        } else {
-            for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2 % nek2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16_unroll(neq0, nbk1,
-                        S + i1,
-                        ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-            }
-        }
-
-        // scale
-        ggml_vec_scale_f32(nek1, S, scale);
-
-        if (masked) {
-            for (int64_t i = P; i < M; i++) {
-                if (i > P + iq1) {
-                    S[i] = -INFINITY;
-                }
-            }
-        }
-
-        // softmax
-        // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero.
-        // dont forget to set their S values to zero
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(M, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                sum = ggml_vec_soft_max_f32(Mup, S, S, max);
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(M, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < M; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16).
-        if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
-            for (int64_t ic = 0; ic < nev1; ++ic) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                // v indices
-                const int iv2 = iq2 % nev2;
-                const int iv3 = iq3;
-
-                ggml_vec_dot_f16(nev0,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
-                        S16, 0, 1);
-            }
-        } else {
-            for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                // v indices
-                const int iv2 = iq2 % nev2;
-                const int iv3 = iq3;
-
-                ggml_vec_dot_f16_unroll(nev0, nbv1,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)),
-                        ((char *)             v->data + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S16);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-
-    switch (q->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_attn_f16(params, masked, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_flash_attn_f32(params, masked, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_flash_attn_ext

 static void ggml_compute_forward_flash_attn_ext_f16(
@@ -16344,165 +15876,6 @@ static void ggml_compute_forward_flash_attn_ext(
    }
 }

-// ggml_compute_forward_flash_ff
-
-static void ggml_compute_forward_flash_ff_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0];  // F16
-    const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
-    const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
-    const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
-    const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, nea,  a,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nba,  a,   nb)
-    GGML_TENSOR_LOCALS(int64_t, neb0, b0,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbb0, b0,  nb)
-    GGML_TENSOR_LOCALS(int64_t, neb1, b1,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbb1, b1,  nb)
-    GGML_TENSOR_LOCALS(int64_t, nec0, c0,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbc0, c0,  nb)
-    GGML_TENSOR_LOCALS(int64_t, nec1, c1,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbc1, c1,  nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,   dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,   dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = nea0;
-    //const int64_t N = nea1;
-    const int64_t M = neb01;
-
-    GGML_ASSERT(ne0 == nea0);
-    GGML_ASSERT(ne1 == nea1);
-    GGML_ASSERT(ne2 == nea2);
-
-    GGML_ASSERT(nba0  == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb10 == sizeof(float));
-    GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbc10 == sizeof(float));
-
-    GGML_ASSERT(neb00 == D);
-    GGML_ASSERT(neb01 == M);
-    GGML_ASSERT(neb10 == M);
-    GGML_ASSERT(neb11 == 1);
-
-    GGML_ASSERT(nec00 == M);
-    GGML_ASSERT(nec01 == D);
-    GGML_ASSERT(nec10 == D);
-    GGML_ASSERT(nec11 == 1);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by a rows using ggml_vec_dot_f32
-
-    // total rows in a
-    const int nr = nea1*nea2*nea3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // a indices
-        const int ia3 = ir/(nea2*nea1);
-        const int ia2 = (ir - ia3*nea2*nea1)/nea1;
-        const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
-
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
-
-        for (int64_t ic = 0; ic < neb01; ++ic) {
-            // b0 indices
-            const int ib03 = ia3;
-            const int ib02 = ia2;
-            const int ib01 = ic;
-
-            // S indices
-            const int i1 = ib01;
-
-            ggml_vec_dot_f16(nea0,
-                    S + i1, 0,
-                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
-                    (ggml_fp16_t *) ((char *)  a->data + ( ia1*nba1  +  ia2*nba2  +  ia3*nba3)), 0, 1);
-        }
-
-        ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
-        //ggml_vec_gelu_f32(neb01, S, S);
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        ggml_vec_gelu_f16(neb01, S16, S16);
-
-        {
-            // dst indices
-            const int i1 = ia1;
-            const int i2 = ia2;
-            const int i3 = ia3;
-
-            for (int64_t ic = 0; ic < nec01; ++ic) {
-
-                ggml_vec_dot_f16(neb01,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1   + i2*nb2   + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) c0->data  + (         ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
-                        S16, 0, 1);
-            }
-
-            ggml_vec_add_f32(nec01,
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) c1->data);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_ff(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * b0 = dst->src[1];
-
-    switch (b0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_ff_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false); // TODO
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_flash_attn_back

 static void ggml_compute_forward_flash_attn_back_f32(
@@ -18073,21 +17446,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_leaky_relu(params, tensor);
            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                const int32_t t = ggml_get_op_params_i32(tensor, 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                const bool masked = t != 0;
-                ggml_compute_forward_flash_attn(params, masked, tensor);
-            } break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                ggml_compute_forward_flash_ff(params, tensor);
-            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
                int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -19094,7 +18456,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
-        case GGML_OP_FLASH_ATTN:
        case GGML_OP_FLASH_ATTN_EXT:
            {
                struct ggml_tensor * flash_grad = NULL;
@@ -19148,10 +18509,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                            zero_table);
                }
            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
                GGML_ASSERT(false); // not supported
@@ -19838,15 +19195,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
            {
                n_tasks = n_threads;
            } break;
-        case GGML_OP_FLASH_ATTN:
        case GGML_OP_FLASH_ATTN_EXT:
            {
                n_tasks = n_threads;
            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                n_tasks = n_threads;
-            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
                n_tasks = n_threads;
@@ -20243,40 +19595,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
                } break;
-            case GGML_OP_FLASH_ATTN:
-                {
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    }
-                } break;
            case GGML_OP_FLASH_ATTN_EXT:
                {
                    const int64_t ne00 = node->src[0]->ne[0]; // D

                    cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
                } break;
-            case GGML_OP_FLASH_FF:
-                {
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_BF16) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    }
-                } break;
            case GGML_OP_FLASH_ATTN_BACK:
                {
                    const int64_t    D = node->src[0]->ne[0];
@@ -22117,11 +21441,7 @@ size_t ggml_quantize_chunk(
        case GGML_TYPE_IQ1_S:   result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#if QK_K == 64
-        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#else
        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-#endif
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
@@ -481,9 +481,7 @@ extern "C" {
        GGML_OP_ARGSORT,
        GGML_OP_LEAKY_RELU,

-        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_ATTN_EXT,
-        GGML_OP_FLASH_FF,
        GGML_OP_FLASH_ATTN_BACK,
        GGML_OP_SSM_CONV,
        GGML_OP_SSM_SCAN,
@@ -1761,13 +1759,6 @@ extern "C" {
            struct ggml_tensor  * a,
            int                   k);

-    GGML_API struct ggml_tensor * ggml_flash_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * q,
-            struct ggml_tensor  * k,
-            struct ggml_tensor  * v,
-            bool                  masked);
-
 #define GGML_KQ_MASK_PAD 32

    // q:    [n_embd, n_batch,     n_head,    1]
@@ -1788,6 +1779,7 @@ extern "C" {
            struct ggml_tensor * a,
            enum ggml_prec       prec);

+    // TODO: needs to be adapted to ggml_flash_attn_ext
    GGML_API struct ggml_tensor * ggml_flash_attn_back(
           struct ggml_context * ctx,
           struct ggml_tensor  * q,
@@ -1796,14 +1788,6 @@ extern "C" {
           struct ggml_tensor  * d,
           bool                  masked);

-    GGML_API struct ggml_tensor * ggml_flash_ff(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b0,
-            struct ggml_tensor  * b1,
-            struct ggml_tensor  * c0,
-            struct ggml_tensor  * c1);
-
    GGML_API struct ggml_tensor * ggml_ssm_conv(
            struct ggml_context * ctx,
            struct ggml_tensor  * s,
@@ -905,9 +905,8 @@ class GGUFValueType(IntEnum):
            raise ValueError(f"Unknown type: {type(val)}")


-# Note: Does not support GGML_QKK_64
-QK_K = 256
 # Items here are (block size, type size)
+QK_K = 256
 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32:     (1, 4),
    GGMLQuantizationType.F16:     (1, 2),
@@ -26,13 +26,9 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256

 #ifdef __has_include
    #if __has_include(<unistd.h>)
@@ -1696,17 +1692,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
    MODEL_UNKNOWN,
+    MODEL_14M,
    MODEL_17M,
    MODEL_22M,
    MODEL_33M,
+    MODEL_70M,
    MODEL_109M,
    MODEL_137M,
+    MODEL_160M,
    MODEL_335M,
+    MODEL_410M,
    MODEL_0_5B,
    MODEL_1B,
+    MODEL_1_4B,
    MODEL_2B,
+    MODEL_2_8B,
    MODEL_3B,
    MODEL_4B,
+    MODEL_6_9B,
    MODEL_7B,
    MODEL_8B,
    MODEL_12B,
@@ -1738,6 +1741,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
    bool vocab_only;
    bool rope_finetuned;
+    bool use_par_res;

    uint32_t n_vocab;
    uint32_t n_ctx_train; // context size the model was trained on
@@ -2479,7 +2483,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
           struct llama_kv_cache & cache,
        const struct llama_batch & batch) {
-    const uint32_t n_ctx    = cache.size;
    const uint32_t n_tokens = batch.n_tokens;

    if (cache.recurrent) {
@@ -2530,16 +2533,16 @@ static bool llama_kv_cache_find_slot(
    }
    // otherwise, one cell per token.

-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
        return false;
    }

    uint32_t n_tested = 0;

    while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
            cache.head = 0;
            continue;
        }
@@ -2558,7 +2561,7 @@ static bool llama_kv_cache_find_slot(
            break;
        }

-        if (n_tested >= n_ctx) {
+        if (n_tested >= cache.size) {
            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
            return false;
        }
@@ -3778,17 +3781,24 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
    switch (type) {
+        case MODEL_14M:    return "14M";
        case MODEL_17M:    return "17M";
        case MODEL_22M:    return "22M";
        case MODEL_33M:    return "33M";
+        case MODEL_70M:    return "70M";
        case MODEL_109M:   return "109M";
        case MODEL_137M:   return "137M";
+        case MODEL_160M:   return "160M";
        case MODEL_335M:   return "335M";
+        case MODEL_410M:   return "410M";
        case MODEL_0_5B:   return "0.5B";
        case MODEL_1B:     return "1B";
+        case MODEL_1_4B:   return "1.4B";
        case MODEL_2B:     return "2B";
+        case MODEL_2_8B:   return "2.8B";
        case MODEL_3B:     return "3B";
        case MODEL_4B:     return "4B";
+        case MODEL_6_9B:   return "6.9B";
        case MODEL_7B:     return "7B";
        case MODEL_8B:     return "8B";
        case MODEL_12B:    return "12B";
@@ -4287,6 +4297,52 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+                switch (hparams.n_layer) {
+                    case 6:
+                        switch (hparams.n_ff) {
+                            case 512: model.type = e_model::MODEL_14M; break;
+                            case 2048: model.type = e_model::MODEL_70M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 12:
+                        switch (hparams.n_ff) {
+                            case 3072: model.type = e_model::MODEL_160M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 16:
+                        switch (hparams.n_ff) {
+                            case 8192: model.type = e_model::MODEL_1B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff) {
+                            case 4096: model.type = e_model::MODEL_410M; break;
+                            case 8192: model.type = e_model::MODEL_1_4B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 32:
+                        switch (hparams.n_ff) {
+                            case 10240: model.type = e_model::MODEL_2_8B; break;
+                            case 16384: model.type = e_model::MODEL_6_9B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 36:
+                        switch (hparams.n_ff) {
+                            case 20480: model.type = e_model::MODEL_12B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 44:
+                        switch (hparams.n_ff) {
+                            case 24576: model.type = e_model::MODEL_20B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
        default: (void)0;
    }

@@ -6038,6 +6094,41 @@ static bool llm_load_tensors(
                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
                    }
                } break;
+            case LLM_ARCH_GPTNEOX:
+                {
+                    model.tok_embd   = ml.create_tensor(ctx_input,  tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab});
+                    // output
+                    {
+                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+
+                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
+                    }
+                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@@ -6627,7 +6718,7 @@ static struct ggml_tensor * llm_build_kqv(

        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);

-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
            ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
        }

@@ -6636,7 +6727,7 @@ static struct ggml_tensor * llm_build_kqv(
        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
        cb(kq, "kq", il);

-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -10565,6 +10656,140 @@ struct llm_build_context {

        return gf;
    }
+
+    struct ggml_cgraph * build_gptneox() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // ffn
+            if (hparams.use_par_res) {
+                // attention and ffn are computed in parallel
+                // x = x + attn(ln1(x)) + ffn(ln2(x))
+
+                struct ggml_tensor * attn_out = cur;
+
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, inpL);
+                cb(cur, "ffn_out", il);
+
+                inpL = ggml_add(ctx0, cur, attn_out);
+                cb(inpL, "l_out", il);
+            } else {
+                // attention and ffn are computed sequentially
+                // x = x + attn(ln1(x))
+                // x = x + ffn(ln2(x))
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+                cb(ffn_inp, "ffn_inp", il);
+
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                inpL = ggml_add(ctx0, cur, ffn_inp);
+                cb(inpL, "l_out", il);
+            }
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -10775,6 +11000,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_olmo();
            } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                result = llm.build_gptneox();
+            } break;
        default:
            GGML_ASSERT(false);
    }
@@ -14308,8 +14537,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
        if (qs.model.type == MODEL_70B) {
            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15769,7 +15996,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
-        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
@@ -15805,6 +16031,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_PHI3:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GPTNEOX:
            return LLAMA_ROPE_TYPE_NEOX;

        // all model arches should be listed explicitly here
@@ -17183,6 +17410,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
    ctx->cparams.n_threads_batch = n_threads_batch;
 }

+uint32_t llama_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
    ctx->abort_callback      = abort_callback;
    ctx->abort_callback_data = abort_callback_data;
@@ -17617,6 +17852,15 @@ static int32_t llama_chat_apply_template_internal(
            }
        }
        // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
    } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
        // zephyr template
        for (auto message : chat) {
@@ -17749,15 +17993,6 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
        }
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-        // Phi 3
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
    } else {
        // template not supported
        return -1;
@@ -759,6 +759,12 @@ extern "C" {
    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple token).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
    // Set whether to use causal attention or not
    // If set to true, the model will only attend to the past tokens
    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
@@ -49,8 +49,14 @@ int main(void) {
        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
        // Llama-3
        "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
-        // Phi-3
-        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
+        //Phi-3-mini
+        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+        //Phi-3-small
+        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+        //Phi-3-medium
+        "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+        //Phi-3-vision
+        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
    };
    std::vector<std::string> expected_output = {
        // teknium/OpenHermes-2.5-Mistral-7B
@@ -79,8 +85,14 @@ int main(void) {
        "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
        // Llama 3
        "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
-        // Phi 3
-        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-mini
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-small
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-medium
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-vision
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
    };
    std::vector<char> formatted_chat(1024);
    int32_t res;
@@ -1515,90 +1515,50 @@ int main(int argc, const char ** argv) {
        }

        // flash_attn f32
-        {
-            srand(seed);
-            const int nargs = 3;
+        // TODO: adapt to ggml_flash_attn_ext() changes
+        //{
+        //    srand(seed);
+        //    const int nargs = 3;

-            int64_t ne2[4];
+        //    int64_t ne2[4];

-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
+        //    get_random_dims(ne2, 4);
+        //    int64_t D = ne2[0];
+        //    int64_t N = ne2[1];
+        //    int64_t M = ne2[2] + N;
+        //    int64_t B = ne2[3];

-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int max_nrep = (ndims >= 3) ? 2 : 1;
-                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
-                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
-                        int64_t nek[4] = { D, M, B, ne[3] };
-                        int64_t nev[4] = { M, D, B, ne[3] };
-                        if (ndims == 2) {
-                            neq[2] = 1; neq[3] = 1;
-                            nek[2] = 1; nek[3] = 1;
-                            nev[2] = 1; nev[3] = 1;
-                        } else if (ndims == 3) {
-                            neq[3] = 1;
-                            nek[3] = 1;
-                            nev[3] = 1;
-                        }
-                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                        ggml_set_param(ctx0, x[0]);
-                        ggml_set_param(ctx0, x[1]);
-                        ggml_set_param(ctx0, x[2]);
+        //    for (int masked = 0; masked <= 1; ++masked) {
+        //        for (int ndims = 2; ndims <= 4; ++ndims) {
+        //            int max_nrep = (ndims >= 3) ? 2 : 1;
+        //            for (int nrep = 1; nrep < max_nrep; ++nrep) {
+        //                int64_t neq[4] = { D, N, B*nrep, ne[3] };
+        //                int64_t nek[4] = { D, M, B, ne[3] };
+        //                int64_t nev[4] = { M, D, B, ne[3] };
+        //                if (ndims == 2) {
+        //                    neq[2] = 1; neq[3] = 1;
+        //                    nek[2] = 1; nek[3] = 1;
+        //                    nev[2] = 1; nev[3] = 1;
+        //                } else if (ndims == 3) {
+        //                    neq[3] = 1;
+        //                    nek[3] = 1;
+        //                    nev[3] = 1;
+        //                }
+        //                x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
+        //                x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
+        //                x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
+        //                ggml_set_param(ctx0, x[0]);
+        //                ggml_set_param(ctx0, x[1]);
+        //                ggml_set_param(ctx0, x[2]);

-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+        //                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));

-                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                    }
-                }
-            }
-        }
+        //                check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
+        //            }
+        //        }
+        //    }
+        //}

-        // flash_attn f16, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int64_t neq[4] = { D, N, B, ne[3] };
-                    int64_t nek[4] = { D, M, B, ne[3] };
-                    int64_t nev[4] = { M, D, B, ne[3] };
-                    if (ndims == 2) {
-                        neq[2] = 1; neq[3] = 1;
-                        nek[2] = 1; nek[3] = 1;
-                        nev[2] = 1; nev[3] = 1;
-                    } else if (ndims == 3) {
-                        neq[3] = 1;
-                        nek[3] = 1;
-                        nev[3] = 1;
-                    }
-                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                    ggml_set_param(ctx0, x[0]);
-                    ggml_set_param(ctx0, x[1]);
-                    ggml_set_param(ctx0, x[2]);
-
-                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                }
-            }
-        }
        ggml_free(ctx0);
    }
Author	SHA1	Message	Date
Georgi Gerganov	74f33adf5f	readme : remove trailing space (#7469 )	2024-05-23 17:43:18 +03:00
Georgi Gerganov	1debe72737	ggml : silence UB sanitizer error during iq2_xxs quantization (#0 )	2024-05-23 17:25:38 +03:00
Tristan Druyen	007489e895	Fix phi3 chat template confusion with zephyr (#7449 ) * Fix phi3 template matching vs zephyr * Add regression test for new phi3 chat template * Implement review suggestions * Fix phi3 jinja test templates & match by <\|end\|> * Apply suggestion Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> * Add all phi3 template variants in tests * Remove unneeded message trimming Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> * Fix tests to not expect trimmed messages --------- Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>	2024-05-23 16:15:15 +02:00
Raj Hammeer Singh Hada	8b94e799df	readme : add Bunny in supported models [no ci] (#7469 )	2024-05-23 15:30:13 +03:00
Daniel Bevenius	3015851c5a	llama : add getters for n_threads/n_threads_batch (#7464 ) * llama : add getters for n_threads/n_threads_batch This commit adds two new functions to the llama API. The functions can be used to get the number of threads used for generating a single token and the number of threads used for prompt and batch processing (multiple tokens). The motivation for this is that we want to be able to get the number of threads that the a context is using. The main use case is for a testing/verification that the number of threads is set correctly. Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com> * squash! llama : add getters for n_threads/n_threads_batch Rename the getters to llama_n_threads and llama_n_threads_batch. Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com> --------- Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-05-23 15:29:26 +03:00
Georgi Gerganov	55ac3b7aea	ci : use Pythia models instead of OpenLlama (#7470 ) * ci : start using Pythia models over OpenLlama ggml-ci * ci : disable q2_k ppl tests * ci : use convert-hf-to-gguf.py * ci : update gg_get_model * ci : fix convert outfile name ggml-ci * llama : gptneox arch use F32 attn prec ggml-ci	2024-05-23 15:28:14 +03:00
Victor Nogueira	dacfcebd60	readme : add GPT-NeoX + Pythia to the list of supported models (#7491 )	2024-05-23 15:12:43 +03:00
fairydreaming	9b82476ee9	Add missing inference support for GPTNeoXForCausalLM (Pythia and GPT-NeoX base models) (#7461 ) * convert-hf : add conversion of bloom-style qkv tensor to gpt-style qkv (code borrowed from BloomModel) * llama : add inference support for LLM_ARCH_GPTNEOX * llama : add model types for every Pythia variant and GPT-NeoX Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>	2024-05-23 11:49:53 +02:00
Georgi Gerganov	a61a94e543	llama : rename n_ctx -> cache.size, less confusing (#0 )	2024-05-23 12:38:18 +03:00
Brian	152da28ae5	labeler.yml: add embedding label detector [no ci] (#7482 )	2024-05-23 17:40:43 +10:00
Georgi Gerganov	d48c88cbd5	ggml : remove ggml_flash_attn and ggml_flash_ff (#7463 ) ggml-ci	2024-05-23 10:00:44 +03:00
Georgi Gerganov	e84b71c2c6	ggml : drop support for QK_K=64 (#7473 ) * ggml : drop support for QK_K=64 ggml-ci * opencl : restore QK_K=256 define	2024-05-23 10:00:21 +03:00