mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 10:46:43 +02:00
Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b11a93df41 | |||
| a33e6a0d2a | |||
| 47bb7b48c7 | |||
| c4d7f81786 | |||
| e849078c6e | |||
| 67fd33132f | |||
| 4804215cb8 | |||
| 8a533f0d90 | |||
| 269de86ba0 | |||
| c393733988 | |||
| e3965cf35a | |||
| 8b350356b2 | |||
| bf08e00643 | |||
| f7625019c5 | |||
| abbabc5e51 | |||
| f1a98c5254 | |||
| 7d548a1827 | |||
| 930b178026 | |||
| d52d7819b8 | |||
| 1289408817 |
@@ -669,8 +669,7 @@ jobs:
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
|
||||
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
|
||||
./gradlew build --no-daemon -Pskip-armeabi-v7a
|
||||
./gradlew build --no-daemon
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
|
||||
@@ -6,11 +6,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- test/server-add-ci-test # FIXME remove
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
|
||||
|
||||
jobs:
|
||||
server:
|
||||
@@ -18,45 +17,21 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan]
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug, Release]
|
||||
include:
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
|
||||
image: ubuntu:latest
|
||||
- build: 'avx2'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
image: ubuntu:latest
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
|
||||
image: ubuntu:latest
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON'
|
||||
image: ubuntu:latest
|
||||
experimental: true
|
||||
- build: 'cublas'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON'
|
||||
image: nvidia/cuda:12.3.1-devel-ubuntu22.04
|
||||
arch_not_available: true # require nvidia docker engine
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON'
|
||||
image: ubuntu:latest
|
||||
arch_not_available: true
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
|
||||
image: ubuntu:latest
|
||||
- build: 'kompute'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
|
||||
image: ubuntu:latest
|
||||
arch_not_available: true
|
||||
- build: 'vulkan'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON'
|
||||
image: ubuntu:latest
|
||||
arch_not_available: true
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
exclude:
|
||||
- build_type: Release
|
||||
sanitizer: ADDRESS
|
||||
- build_type: Release
|
||||
sanitizer: THREAD
|
||||
- build_type: Release
|
||||
sanitizer: UNDEFINED
|
||||
|
||||
container:
|
||||
image: ${{ matrix.image }}
|
||||
image: ubuntu:latest
|
||||
ports:
|
||||
- 8888
|
||||
options: --cpus 4
|
||||
@@ -72,40 +47,22 @@ jobs:
|
||||
apt-get update
|
||||
apt-get -y install \
|
||||
build-essential \
|
||||
pkg-config \
|
||||
git \
|
||||
cmake \
|
||||
python3-pip \
|
||||
wget \
|
||||
psmisc
|
||||
|
||||
- name: Download CLBlast
|
||||
id: get_clblast
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
run: |
|
||||
apt install -y libclblast-dev
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
run: |
|
||||
apt-get -y install libopenblas-dev
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
|
||||
run: |
|
||||
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
apt-get update
|
||||
apt-get -y install vulkan-sdk
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }}
|
||||
cmake .. \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
|
||||
|
||||
- name: Tests dependencies
|
||||
@@ -121,7 +78,6 @@ jobs:
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_test
|
||||
continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
PORT=8888 ./tests.sh
|
||||
|
||||
+8
-2
@@ -936,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
# Raspberry Pi 2
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
|
||||
# Android armeabi-v7a
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
else()
|
||||
# Raspberry Pi 2
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
endif()
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Android arm64-v8a
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
|
||||
@@ -597,7 +597,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||
ifndef CUDA_DOCKER_ARCH
|
||||
ifndef CUDA_POWER_ARCH
|
||||
|
||||
@@ -114,6 +114,9 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
|
||||
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
||||
|
||||
**HTTP server**
|
||||
|
||||
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
|
||||
|
||||
**Bindings:**
|
||||
|
||||
@@ -155,6 +158,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
- [semperai/amica](https://github.com/semperai/amica)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
||||
- [Msty](https://msty.app) (proprietary)
|
||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||
|
||||
---
|
||||
|
||||
|
||||
+1
-1
@@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl(
|
||||
// }
|
||||
//}
|
||||
|
||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -447,8 +447,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
|
||||
@@ -21,12 +21,8 @@ android {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Workaround for https://github.com/llvm/llvm-project/issues/65820
|
||||
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
|
||||
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
|
||||
if (project.hasProperty("skip-armeabi-v7a")) {
|
||||
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
|
||||
}
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
|
||||
@@ -548,8 +548,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
@@ -576,9 +576,9 @@ int main(int argc, char ** argv) {
|
||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
|
||||
n_past -= bd;
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
|
||||
const int n_batch = ctx_params.n_batch;
|
||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
@@ -146,10 +146,11 @@ int main(int argc, char ** argv) {
|
||||
const int ib = i/n_batch - 1;
|
||||
const int bd = n_batch_grp*(n_grp - 1);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
|
||||
llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= bd;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
@@ -179,10 +180,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_defrag (ctx);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= n_discard;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
@@ -208,10 +211,12 @@ int main(int argc, char ** argv) {
|
||||
if (n_discard > 0) {
|
||||
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_defrag (ctx);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= n_discard;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,14 +23,16 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
|
||||
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
|
||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
|
||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
||||
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
|
||||
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
|
||||
@@ -292,6 +294,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
|
||||
fprintf(stderr, "\n===============================================================================================\n");
|
||||
fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
|
||||
|
||||
@@ -1,8 +1,20 @@
|
||||
# llama.cpp/example/server
|
||||
# LLaMA.cpp HTTP Server
|
||||
|
||||
This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
|
||||
Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.
|
||||
|
||||
Command line options:
|
||||
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantized models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
* Multimodal (wip)
|
||||
* Monitoring endpoints
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||
@@ -39,9 +51,12 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend the context size through self-extend (default: 1 = disabled); used together with the group attention width `--grp-attn-w`.
|
||||
- `--grp-attn-w`: Set the group attention width to extend the context size through self-extend (default: 512); used together with the group attention factor `--grp-attn-n`.
|
||||
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
||||
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
|
||||
- `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. The slots state may contain user data, including prompts.
|
||||
- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
|
||||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
- `--log-disable`: Output logs to stdout only, default: enabled.
|
||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
|
||||
|
||||
## Build
|
||||
|
||||
@@ -457,6 +472,18 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
]
|
||||
```
|
||||
|
||||
- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
|
||||
|
||||
Available metrics:
|
||||
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
|
||||
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
|
||||
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
|
||||
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
|
||||
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
|
||||
- `llamacpp:kv_cache_tokens`: KV-cache tokens.
|
||||
- `llamacpp:requests_processing`: Number of requests currently being processed.
|
||||
- `llamacpp:requests_deferred`: Number of requests deferred.
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
||||
+346
-62
@@ -43,9 +43,11 @@ struct server_params
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
bool slots_endpoint = true;
|
||||
bool metrics_endpoint = false;
|
||||
};
|
||||
|
||||
bool server_verbose = false;
|
||||
bool server_log_json = true;
|
||||
|
||||
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
||||
{
|
||||
@@ -301,12 +303,76 @@ struct llama_client_slot
|
||||
}
|
||||
|
||||
void print_timings() const {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
|
||||
LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
|
||||
LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
|
||||
char buffer[512];
|
||||
double t_token = t_prompt_processing / num_prompt_tokens_processed;
|
||||
double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
|
||||
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
||||
t_prompt_processing, num_prompt_tokens_processed,
|
||||
t_token, n_tokens_second);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_prompt_processing", t_prompt_processing},
|
||||
{"num_prompt_tokens_processed", num_prompt_tokens_processed},
|
||||
{"t_token", t_token},
|
||||
{"n_tokens_second", n_tokens_second},
|
||||
});
|
||||
|
||||
t_token = t_token_generation / n_decoded;
|
||||
n_tokens_second = 1e3 / t_token_generation * n_decoded;
|
||||
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
||||
t_token_generation, n_decoded,
|
||||
t_token, n_tokens_second);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_token_generation", t_token_generation},
|
||||
{"n_decoded", n_decoded},
|
||||
{"t_token", t_token},
|
||||
{"n_tokens_second", n_tokens_second},
|
||||
});
|
||||
|
||||
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_prompt_processing", t_prompt_processing},
|
||||
{"t_token_generation", t_token_generation},
|
||||
{"t_total", t_prompt_processing + t_token_generation},
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Token and timing counters accumulated from client slots.
// The two `*_total` fields are only ever incremented inside this struct; the
// other four form a "bucket" that reset_bucket() clears back to zero.
struct llama_metrics {
|
||||
// Running sum of slot.num_prompt_tokens_processed over all on_prompt_eval() calls.
uint64_t n_prompt_tokens_processed_total = 0;
|
||||
// Running sum of slot.n_decoded over all on_prediction() calls.
uint64_t n_tokens_predicted_total = 0;
|
||||
|
||||
// Prompt tokens accumulated since the last reset_bucket().
uint64_t n_prompt_tokens_processed = 0;
|
||||
// Sum of slot.t_prompt_processing since the last reset_bucket().
// NOTE(review): print_timings formats the slot field with "%10.2f ms", so it
// looks like a floating-point millisecond value; adding it into a uint64_t
// would drop the fractional part on every call — confirm this is intended.
uint64_t t_prompt_processing = 0;
|
||||
|
||||
// Predicted tokens accumulated since the last reset_bucket().
uint64_t n_tokens_predicted = 0;
|
||||
// Sum of slot.t_token_generation since the last reset_bucket().
// NOTE(review): same unit/truncation question as t_prompt_processing — confirm.
uint64_t t_tokens_generation = 0;
|
||||
|
||||
|
||||
// Adds the slot's prompt token count to both the total and the bucket
// counter, and the slot's prompt processing time to the bucket timer.
void on_prompt_eval(const llama_client_slot &slot) {
|
||||
n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
|
||||
|
||||
n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
|
||||
t_prompt_processing += slot.t_prompt_processing;
|
||||
}
|
||||
|
||||
// Adds the slot's decoded token count to both the total and the bucket
// counter, and the slot's generation time to the bucket timer.
void on_prediction(const llama_client_slot &slot) {
|
||||
n_tokens_predicted_total += slot.n_decoded;
|
||||
|
||||
n_tokens_predicted += slot.n_decoded;
|
||||
t_tokens_generation += slot.t_token_generation;
|
||||
}
|
||||
|
||||
// Zeroes the four bucket fields; the `*_total` counters are left untouched.
void reset_bucket() {
|
||||
n_prompt_tokens_processed = 0;
|
||||
t_prompt_processing = 0;
|
||||
n_tokens_predicted = 0;
|
||||
t_tokens_generation = 0;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -344,6 +410,8 @@ struct llama_server_context
|
||||
llama_server_queue queue_tasks;
|
||||
llama_server_response queue_results;
|
||||
|
||||
llama_metrics metrics;
|
||||
|
||||
~llama_server_context()
|
||||
{
|
||||
if (ctx)
|
||||
@@ -363,7 +431,7 @@ struct llama_server_context
|
||||
params = params_;
|
||||
if (!params.mmproj.empty()) {
|
||||
multimodal = true;
|
||||
LOG_TEE("Multi Modal Mode Enabled");
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
|
||||
@@ -416,7 +484,7 @@ struct llama_server_context
|
||||
|
||||
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
|
||||
|
||||
LOG_TEE("Available slots:\n");
|
||||
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
|
||||
for (int i = 0; i < params.n_parallel; i++)
|
||||
{
|
||||
llama_client_slot slot;
|
||||
@@ -425,7 +493,10 @@ struct llama_server_context
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
slot.n_predict = params.n_predict;
|
||||
|
||||
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
|
||||
LOG_INFO("new slot", {
|
||||
{"slot_id", slot.id},
|
||||
{"n_ctx_slot", slot.n_ctx}
|
||||
});
|
||||
|
||||
const int ga_n = params.grp_attn_n;
|
||||
const int ga_w = params.grp_attn_w;
|
||||
@@ -435,7 +506,12 @@ struct llama_server_context
|
||||
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
|
||||
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
|
||||
|
||||
LOG_INFO("slot self-extend", {
|
||||
{"slot_id", slot.id},
|
||||
{"ga_n", ga_n},
|
||||
{"ga_w", ga_w}
|
||||
});
|
||||
}
|
||||
|
||||
slot.ga_i = 0;
|
||||
@@ -729,10 +805,16 @@ struct llama_server_context
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
|
||||
LOG_ERROR("failed to load image", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - loaded image\n", slot->id);
|
||||
LOG_VERBOSE("image loaded", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
img_sl.request_encode_image = true;
|
||||
slot->images.push_back(img_sl);
|
||||
}
|
||||
@@ -792,7 +874,10 @@ struct llama_server_context
|
||||
|
||||
all_slots_are_idle = false;
|
||||
|
||||
LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
|
||||
LOG_INFO("slot is processing task", {
|
||||
{"slot_id", slot->id},
|
||||
{"task_id", slot->task_id},
|
||||
});
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -817,10 +902,24 @@ struct llama_server_context
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0)
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
if (llama_decode(ctx, batch_view) != 0)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
@@ -1237,6 +1336,10 @@ struct llama_server_context
|
||||
split_multiprompt_task(task_id, task);
|
||||
}
|
||||
} else {
|
||||
// an empty prompt can make slot become buggy
|
||||
if (task.data.contains("prompt") && task.data["prompt"].is_string() && task.data["prompt"].get<std::string>().empty()) {
|
||||
task.data["prompt"] = " "; // add a space so that we have one token
|
||||
}
|
||||
queue_tasks.post(task);
|
||||
}
|
||||
}
|
||||
@@ -1355,7 +1458,7 @@ struct llama_server_context
|
||||
if (slot == nullptr)
|
||||
{
|
||||
// if no slot is available, we defer this task for processing later
|
||||
LOG_VERBOSE("no slot is available", {});
|
||||
LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
|
||||
queue_tasks.defer(task);
|
||||
break;
|
||||
}
|
||||
@@ -1404,7 +1507,7 @@ struct llama_server_context
|
||||
case TASK_TYPE_NEXT_RESPONSE: {
|
||||
// do nothing
|
||||
} break;
|
||||
case TASK_TYPE_SLOTS_DATA: {
|
||||
case TASK_TYPE_METRICS: {
|
||||
json slots_data = json::array();
|
||||
int n_idle_slots = 0;
|
||||
int n_processing_slots = 0;
|
||||
@@ -1431,17 +1534,41 @@ struct llama_server_context
|
||||
}
|
||||
slots_data.push_back(slot_data);
|
||||
}
|
||||
LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
|
||||
LOG_INFO("slot data", {
|
||||
{"task_id", task.id},
|
||||
{"n_idle_slots", n_idle_slots},
|
||||
{"n_processing_slots", n_processing_slots}
|
||||
});
|
||||
LOG_VERBOSE("slot data", {
|
||||
{"task_id", task.id},
|
||||
{"n_idle_slots", n_idle_slots},
|
||||
{"n_processing_slots", n_processing_slots},
|
||||
{"slots", slots_data}
|
||||
});
|
||||
task_result res;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
res.stop = true;
|
||||
res.error = false;
|
||||
res.result_json = {
|
||||
{ "idle", n_idle_slots },
|
||||
{ "processing", n_processing_slots },
|
||||
{ "slots", slots_data }
|
||||
{ "idle", n_idle_slots },
|
||||
{ "processing", n_processing_slots },
|
||||
{ "deferred", queue_tasks.queue_tasks_deferred.size() },
|
||||
|
||||
{ "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
|
||||
{ "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
|
||||
|
||||
{ "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
|
||||
{ "t_prompt_processing", metrics.t_prompt_processing},
|
||||
{ "n_tokens_predicted", metrics.n_tokens_predicted},
|
||||
{ "t_tokens_generation", metrics.t_tokens_generation},
|
||||
|
||||
{ "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
|
||||
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
|
||||
|
||||
{ "slots", slots_data },
|
||||
};
|
||||
metrics.reset_bucket();
|
||||
queue_results.send(res);
|
||||
} break;
|
||||
}
|
||||
@@ -1469,7 +1596,7 @@ struct llama_server_context
|
||||
bool update_slots() {
|
||||
if (system_need_update)
|
||||
{
|
||||
LOG_TEE("updating system prompt\n");
|
||||
LOG_INFO("updating system prompt", {});
|
||||
update_system_prompt();
|
||||
}
|
||||
|
||||
@@ -1479,12 +1606,13 @@ struct llama_server_context
|
||||
{
|
||||
if (system_prompt.empty() && clean_kv_cache)
|
||||
{
|
||||
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
|
||||
LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
|
||||
kv_cache_clear();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
LOG_VERBOSE("posting NEXT_RESPONSE", {});
|
||||
task_server task;
|
||||
task.type = TASK_TYPE_NEXT_RESPONSE;
|
||||
task.target_id = -1;
|
||||
@@ -1498,12 +1626,22 @@ struct llama_server_context
|
||||
{
|
||||
// Shift context
|
||||
const int n_keep = slot.params.n_keep + add_bos_token;
|
||||
const int n_left = system_tokens.size() + slot.n_past - n_keep;
|
||||
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
|
||||
const int n_discard = n_left / 2;
|
||||
|
||||
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
LOG_INFO("slot context shift", {
|
||||
{"slot_id", slot.id},
|
||||
{"task_id", slot.task_id},
|
||||
{"n_keep", n_keep},
|
||||
{"n_left", n_left},
|
||||
{"n_discard", n_discard},
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_past", slot.n_past},
|
||||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()}
|
||||
});
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
@@ -1515,17 +1653,12 @@ struct llama_server_context
|
||||
slot.n_past -= n_discard;
|
||||
|
||||
slot.truncated = true;
|
||||
|
||||
LOG_VERBOSE("context shift", {
|
||||
{ "n_ctx", n_ctx },
|
||||
{ "n_keep", n_keep },
|
||||
{ "n_left", n_left },
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decode any currently ongoing sequences
|
||||
LOG_VERBOSE("decoding ongoing sequences", {});
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
// release the slot
|
||||
@@ -1535,7 +1668,15 @@ struct llama_server_context
|
||||
slot.command = NONE;
|
||||
slot.t_last_used = ggml_time_us();
|
||||
|
||||
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
LOG_INFO("slot released", {
|
||||
{"slot_id", slot.id},
|
||||
{"task_id", slot.task_id},
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_past", slot.n_past},
|
||||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()},
|
||||
{"truncated", slot.truncated}
|
||||
});
|
||||
queue_tasks.notify_slot_changed();
|
||||
|
||||
continue;
|
||||
@@ -1662,6 +1803,14 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
|
||||
// the last token of the cache is not in the KV cache until the next call to llama_decode
|
||||
// (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
|
||||
if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
|
||||
{
|
||||
slot.n_past -= 1;
|
||||
}
|
||||
|
||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
||||
|
||||
if (slot.ga_n != 1)
|
||||
@@ -1683,7 +1832,12 @@ struct llama_server_context
|
||||
slot.ga_i = ga_i;
|
||||
}
|
||||
|
||||
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
||||
LOG_INFO("slot progression", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id },
|
||||
{ "n_past", slot.n_past },
|
||||
{ "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
|
||||
});
|
||||
}
|
||||
|
||||
slot.cache_tokens = prompt_tokens;
|
||||
@@ -1691,7 +1845,10 @@ struct llama_server_context
|
||||
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
||||
{
|
||||
// we have to evaluate at least 1 token to generate logits.
|
||||
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
|
||||
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id }
|
||||
});
|
||||
slot.n_past--;
|
||||
if (slot.ga_i > 0)
|
||||
{
|
||||
@@ -1699,9 +1856,13 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
|
||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
||||
LOG_INFO("kv cache rm [p0, end)", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id },
|
||||
{ "p0", p0 }
|
||||
});
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", slot.n_past},
|
||||
@@ -1736,7 +1897,13 @@ struct llama_server_context
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
{
|
||||
LOG_TEE("failed processing images\n");
|
||||
// Each extra field must be its own {key, value} pair. A flat list such as
// {"slot_id", id, "task_id", id} builds a JSON array instead of an object, and
// server_log() merge_patch()es it into the record: a non-object patch replaces
// the whole record, dropping level/function/line/message.
LOG_ERROR("failed processing images", {
    {"slot_id", slot.id},
    {"task_id", slot.task_id},
});
|
||||
// FIXME @phymbert: to be properly tested
|
||||
// early returning without changing the slot state will block the slot for ever
|
||||
// no one at the moment is checking the return value
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1778,9 +1945,9 @@ struct llama_server_context
|
||||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
|
||||
slot.n_past_se -= bd;
|
||||
|
||||
@@ -1849,6 +2016,7 @@ struct llama_server_context
|
||||
{
|
||||
slot.t_start_genereration = ggml_time_us();
|
||||
slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
|
||||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
@@ -1871,11 +2039,14 @@ struct llama_server_context
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
}
|
||||
|
||||
slot.i_batch = -1;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_VERBOSE("slots updated", {});
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1953,8 +2124,10 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -ctv TYPE, --cache-type-v TYPE\n");
|
||||
printf(" KV cache data type for V (default: f16)\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-format log output format: json or text (default: json)\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
||||
printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
|
||||
printf("\n");
|
||||
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
@@ -2405,6 +2578,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.mmproj = argv[i];
|
||||
}
|
||||
else if (arg == "--log-format")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (std::strcmp(argv[i], "json") == 0)
|
||||
{
|
||||
server_log_json = true;
|
||||
}
|
||||
else if (std::strcmp(argv[i], "text") == 0)
|
||||
{
|
||||
server_log_json = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (arg == "--log-disable")
|
||||
{
|
||||
log_set_target(stdout);
|
||||
@@ -2414,6 +2608,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
{
|
||||
sparams.slots_endpoint = false;
|
||||
}
|
||||
else if (arg == "--metrics")
|
||||
{
|
||||
sparams.metrics_endpoint = true;
|
||||
}
|
||||
else if (arg == "--chat-template")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -2514,32 +2712,40 @@ static json format_partial_response(
|
||||
|
||||
// Builds the JSON object {"tokens": [...]} from a list of token ids.
static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{
    return json {
        {"tokens", tokens}
    };
}
|
||||
|
||||
// Builds the JSON object {"content": "<text>"} from an already detokenized string.
static json format_detokenized_response(std::string content)
{
    return json {
        {"content", content}
    };
}
|
||||
|
||||
|
||||
// HTTP logger callback: emits one INFO record with the request metadata and,
// when verbose logging is enabled, a second record with both bodies.
static void log_server_request(const httplib::Request &req, const httplib::Response &res)
{
    // skip GH copilot requests when using default port
    // NOTE(review): the port is not checked here; these two paths are skipped unconditionally.
    if (req.path == "/v1/health" || req.path == "/v1/completions")
    {
        return;
    }

    LOG_INFO("request", {
        {"remote_addr", req.remote_addr},
        {"remote_port", req.remote_port},
        {"status",      res.status},
        {"method",      req.method},
        {"path",        req.path},
        {"params",      req.params},
    });

    LOG_VERBOSE("request", {
        {"request",  req.body},
        {"response", res.body},
    });
}
|
||||
|
||||
struct token_translator
|
||||
@@ -2621,7 +2827,7 @@ int main(int argc, char **argv)
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_SLOTS_DATA;
|
||||
task.type = TASK_TYPE_METRICS;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
@@ -2668,7 +2874,7 @@ int main(int argc, char **argv)
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_SLOTS_DATA;
|
||||
task.type = TASK_TYPE_METRICS;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
@@ -2683,6 +2889,87 @@ int main(int argc, char **argv)
|
||||
});
|
||||
}
|
||||
|
||||
if (sparams.metrics_endpoint) {
    // GET /metrics: renders the server counters/gauges in the Prometheus text exposition format.
    svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
        // request the metrics snapshot through the task queue
        task_server task;
        task.id = llama.queue_tasks.get_new_id();
        task.type = TASK_TYPE_METRICS;
        task.target_id = -1;

        llama.queue_results.add_waiting_task_id(task.id);
        llama.queue_tasks.post(task);

        // get the result
        task_result result = llama.queue_results.recv(task.id);
        llama.queue_results.remove_waiting_task_id(task.id);

        json data = result.result_json;

        uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
        uint64_t t_prompt_processing       = data["t_prompt_processing"];

        uint64_t n_tokens_predicted  = data["n_tokens_predicted"];
        uint64_t t_tokens_generation = data["t_tokens_generation"];

        int32_t kv_cache_used_cells = data["kv_cache_used_cells"];

        // Average throughputs. The 1e3 factor suggests the t_* values are milliseconds (TODO confirm).
        // Both the token count and the elapsed time must be non-zero: dividing by a zero
        // duration yields inf, which the JSON library serializes as "null" and which is
        // not a valid sample value for a Prometheus scraper.
        const double prompt_tokens_seconds = (n_prompt_tokens_processed && t_prompt_processing)
            ? 1e3 / t_prompt_processing * n_prompt_tokens_processed
            : 0.0;
        const double predicted_tokens_seconds = (n_tokens_predicted && t_tokens_generation)
            ? 1e3 / t_tokens_generation * n_tokens_predicted
            : 0.0;

        // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
        json all_metrics_def = json {
            {"counter", {{
                {"name",  "prompt_tokens_total"},
                {"help",  "Number of prompt tokens processed."},
                {"value", data["n_prompt_tokens_processed_total"]}
            }, {
                {"name",  "tokens_predicted_total"},
                {"help",  "Number of generation tokens processed."},
                {"value", data["n_tokens_predicted_total"]}
            }}},
            {"gauge", {{
                {"name",  "prompt_tokens_seconds"},
                {"help",  "Average prompt throughput in tokens/s."},
                {"value", prompt_tokens_seconds}
            },{
                {"name",  "predicted_tokens_seconds"},
                {"help",  "Average generation throughput in tokens/s."},
                {"value", predicted_tokens_seconds}
            },{
                {"name",  "kv_cache_usage_ratio"},
                {"help",  "KV-cache usage. 1 means 100 percent usage."},
                {"value", 1. * kv_cache_used_cells / params.n_ctx}
            },{
                {"name",  "kv_cache_tokens"},
                {"help",  "KV-cache tokens."},
                {"value", data["kv_cache_tokens_count"]}
            },{
                {"name",  "requests_processing"},
                {"help",  "Number of request processing."},
                {"value", data["processing"]}
            },{
                {"name",  "requests_deferred"},
                {"help",  "Number of request deferred."},
                {"value", data["deferred"]}
            }}}
        };

        // One "# HELP" / "# TYPE" / sample triple per metric, grouped by metric type.
        std::stringstream prometheus;
        for (const auto& el : all_metrics_def.items()) {
            const auto& type        = el.key();
            const auto& metrics_def = el.value();
            for (const auto& metric_def : metrics_def) {
                std::string name = metric_def["name"];
                std::string help = metric_def["help"];
                prometheus << "# HELP llamacpp:" << name << " " << help                << "\n"
                           << "# TYPE llamacpp:" << name << " " << type                << "\n"
                           << "llamacpp:"        << name << " " << metric_def["value"] << "\n";
            }
        }

        res.set_content(prometheus.str(), "text/plain; version=0.0.4");
        res.status = 200; // HTTP OK
    });
}
|
||||
|
||||
svr.set_logger(log_server_request);
|
||||
|
||||
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
|
||||
@@ -2735,9 +3022,6 @@ int main(int argc, char **argv)
|
||||
// Set the base directory for serving static files
|
||||
svr.set_base_dir(sparams.public_path);
|
||||
|
||||
// to make it ctrl+clickable:
|
||||
LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
|
||||
std::unordered_map<std::string, std::string> log_data;
|
||||
log_data["hostname"] = sparams.hostname;
|
||||
log_data["port"] = std::to_string(sparams.port);
|
||||
|
||||
@@ -32,6 +32,7 @@ It's possible to override some scenario steps values with environment variables:
|
||||
- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
|
||||
- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
|
||||
- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
|
||||
- `SERVER_LOG_FORMAT_JSON` -> if set, switches the server logs to JSON format
|
||||
|
||||
### Run @bug, @wip or @wrong_usage annotated scenario
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ def before_scenario(context, scenario):
|
||||
|
||||
|
||||
def after_scenario(context, scenario):
|
||||
if context.server_process is None:
|
||||
return
|
||||
if scenario.status == "failed":
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
|
||||
|
||||
@@ -13,6 +13,7 @@ Feature: llama.cpp server
|
||||
And 1 slots
|
||||
And embeddings extraction
|
||||
And 32 server max tokens to predict
|
||||
And prometheus compatible metrics exposed
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
@@ -25,11 +26,12 @@ Feature: llama.cpp server
|
||||
And <n_predict> max tokens to predict
|
||||
And a completion request with no api error
|
||||
Then <n_predicted> tokens are predicted matching <re_content>
|
||||
And prometheus metrics are exposed
|
||||
|
||||
Examples: Prompts
|
||||
| prompt | n_predict | re_content | n_predicted |
|
||||
| I believe the meaning of life is | 8 | read | 8 |
|
||||
| Write a joke about AI | 64 | (park<or>friends<or>scared)+ | 32 |
|
||||
| prompt | n_predict | re_content | n_predicted |
|
||||
| I believe the meaning of life is | 8 | (read<or>going)+ | 8 |
|
||||
| Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 |
|
||||
|
||||
Scenario Outline: OAI Compatibility
|
||||
Given a model <model>
|
||||
|
||||
@@ -13,6 +13,7 @@ import aiohttp
|
||||
import openai
|
||||
from behave import step
|
||||
from behave.api.async_step import async_run_until_complete
|
||||
from prometheus_client import parser
|
||||
|
||||
|
||||
@step(u"a server listening on {server_fqdn}:{server_port}")
|
||||
@@ -34,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port):
|
||||
context.server_api_key = None
|
||||
context.server_continuous_batching = False
|
||||
context.server_embeddings = False
|
||||
context.server_metrics = False
|
||||
context.server_process = None
|
||||
context.server_seed = None
|
||||
context.user_api_key = None
|
||||
|
||||
@@ -82,6 +85,11 @@ def step_server_embeddings(context):
|
||||
context.server_embeddings = True
|
||||
|
||||
|
||||
@step(u'prometheus compatible metrics exposed')
def step_server_metrics(context):
    """Record on the scenario context that the metrics option was requested."""
    context.server_metrics = True
|
||||
|
||||
|
||||
@step(u"the server is starting")
|
||||
def step_start_server(context):
|
||||
start_server_background(context)
|
||||
@@ -424,6 +432,23 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
|
||||
assert context.options_response.headers[cors_header] == cors_header_value
|
||||
|
||||
|
||||
@step(u'prometheus metrics are exposed')
@async_run_until_complete
async def step_prometheus_metrics_exported(context):
    """Fetch `/metrics` and check that the KV-cache usage gauge is exported.

    Asserts an HTTP 200 status, the expected Content-Type header and that the
    `llamacpp:kv_cache_usage_ratio` metric family has at least one sample.
    """
    async with aiohttp.ClientSession() as session:
        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
            assert metrics_response.status == 200
            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
            metrics_raw = await metrics_response.text()
            metric_exported = False
            families = parser.text_string_to_metric_families(metrics_raw)
            for metric in families:
                if metric.name == "llamacpp:kv_cache_usage_ratio":
                    assert len(metric.samples) > 0
                    metric_exported = True
            assert metric_exported, "No metrics exported"
|
||||
|
||||
|
||||
async def concurrent_requests(context, f_completion, *args, **kwargs):
|
||||
n_prompts = len(context.prompts)
|
||||
if context.debug:
|
||||
@@ -674,6 +699,8 @@ async def wait_for_health_status(context,
|
||||
if context.debug:
|
||||
print(f"Starting checking for health for expected_health_status={expected_health_status}")
|
||||
timeout = 3 # seconds
|
||||
if expected_health_status == 'ok':
|
||||
timeout = 10 # CI slow inference
|
||||
interval = 0.5
|
||||
counter = 0
|
||||
async with aiohttp.ClientSession() as session:
|
||||
@@ -711,7 +738,7 @@ async def wait_for_health_status(context,
|
||||
if n_completions > 0:
|
||||
return
|
||||
|
||||
assert False, 'timeout exceeded'
|
||||
assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
|
||||
|
||||
|
||||
def assert_embeddings(embeddings):
|
||||
@@ -753,6 +780,8 @@ def start_server_background(context):
|
||||
server_args.append('--cont-batching')
|
||||
if context.server_embeddings:
|
||||
server_args.append('--embedding')
|
||||
if context.server_metrics:
|
||||
server_args.append('--metrics')
|
||||
if context.model_alias is not None:
|
||||
server_args.extend(['--alias', context.model_alias])
|
||||
if context.n_ctx is not None:
|
||||
@@ -765,6 +794,8 @@ def start_server_background(context):
|
||||
server_args.extend(['--api-key', context.server_api_key])
|
||||
if context.debug:
|
||||
server_args.append('--verbose')
|
||||
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
|
||||
server_args.extend(['--log-format', "text"])
|
||||
print(f"starting server with: {context.server_path}", *server_args)
|
||||
context.server_process = subprocess.Popen(
|
||||
[str(arg) for arg in [context.server_path, *server_args]],
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
aiohttp~=3.9.3
|
||||
behave~=1.2.6
|
||||
openai~=0.25.0
|
||||
prometheus-client~=0.20.0
|
||||
|
||||
+55
-29
@@ -14,6 +14,7 @@
|
||||
using json = nlohmann::json;
|
||||
|
||||
extern bool server_verbose;
|
||||
extern bool server_log_json;
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
@@ -27,14 +28,14 @@ extern bool server_verbose;
|
||||
{ \
|
||||
if (server_verbose) \
|
||||
{ \
|
||||
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
// Logging helpers: forward a short level tag and the call site (__func__, __LINE__)
// to server_log(). The variadic part is the JSON object of extra fields; pass {} when
// there are none.
#define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// parallel
|
||||
@@ -50,7 +51,7 @@ enum task_type {
|
||||
TASK_TYPE_COMPLETION,
|
||||
TASK_TYPE_CANCEL,
|
||||
TASK_TYPE_NEXT_RESPONSE,
|
||||
TASK_TYPE_SLOTS_DATA
|
||||
TASK_TYPE_METRICS
|
||||
};
|
||||
|
||||
struct task_server {
|
||||
@@ -133,26 +134,48 @@ struct completion_token_output
|
||||
std::string text_to_send;
|
||||
};
|
||||
|
||||
static inline void server_log(const char *level, const char *function, int line,
|
||||
const char *message, const nlohmann::ordered_json &extra)
|
||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
|
||||
{
|
||||
nlohmann::ordered_json log
|
||||
{
|
||||
std::stringstream ss_tid;
|
||||
ss_tid << std::this_thread::get_id();
|
||||
json log = nlohmann::ordered_json{
|
||||
{"tid", ss_tid.str()},
|
||||
{"timestamp", time(nullptr)},
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"message", message},
|
||||
};
|
||||
|
||||
if (!extra.empty())
|
||||
{
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
if (server_log_json) {
|
||||
log.merge_patch(
|
||||
{
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"msg", message},
|
||||
});
|
||||
if (!extra.empty()) {
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
|
||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
|
||||
} else {
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
|
||||
|
||||
if (!extra.empty()) {
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << buf << " |";
|
||||
for (const auto& el : log.items())
|
||||
{
|
||||
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str());
|
||||
ss << buf;
|
||||
}
|
||||
|
||||
const std::string str = ss.str();
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
@@ -234,6 +257,7 @@ struct llama_server_queue {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (task.id == -1) {
|
||||
task.id = id++;
|
||||
LOG_VERBOSE("new task id", {{"new_id", task.id}});
|
||||
}
|
||||
queue_tasks.push_back(std::move(task));
|
||||
condition_tasks.notify_one();
|
||||
@@ -249,7 +273,9 @@ struct llama_server_queue {
|
||||
// Get the next id for creating anew task
|
||||
int get_new_id() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
return id++;
|
||||
int new_id = id++;
|
||||
LOG_VERBOSE("new task id", {{"new_id", new_id}});
|
||||
return new_id;
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
@@ -290,8 +316,7 @@ struct llama_server_queue {
|
||||
void start_loop() {
|
||||
running = true;
|
||||
while (true) {
|
||||
// new task arrived
|
||||
LOG_VERBOSE("have new task", {});
|
||||
LOG_VERBOSE("new task may arrive", {});
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
@@ -303,7 +328,7 @@ struct llama_server_queue {
|
||||
task_server task = queue_tasks.front();
|
||||
queue_tasks.erase(queue_tasks.begin());
|
||||
lock.unlock();
|
||||
LOG_VERBOSE("callback_new_task", {});
|
||||
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
|
||||
callback_new_task(task);
|
||||
}
|
||||
LOG_VERBOSE("callback_all_task_finished", {});
|
||||
@@ -384,11 +409,13 @@ struct llama_server_response {
|
||||
std::condition_variable condition_results;
|
||||
|
||||
void add_waiting_task_id(int task_id) {
|
||||
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.insert(task_id);
|
||||
}
|
||||
|
||||
void remove_waiting_task_id(int task_id) {
|
||||
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.erase(task_id);
|
||||
}
|
||||
@@ -401,7 +428,6 @@ struct llama_server_response {
|
||||
condition_results.wait(lock, [&]{
|
||||
return !queue_results.empty();
|
||||
});
|
||||
LOG_VERBOSE("condition_results unblock", {});
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
@@ -426,22 +452,22 @@ struct llama_server_response {
|
||||
// Send a new result to a waiting task_id
|
||||
void send(task_result result) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
LOG_VERBOSE("send new result", {});
|
||||
LOG_VERBOSE("send new result", {{"task_id", result.id}});
|
||||
for (auto& task_id : waiting_task_ids) {
|
||||
// LOG_TEE("waiting task id %i \n", task_id);
|
||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||
if (result.multitask_id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("callback_update_multitask", {});
|
||||
LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
|
||||
callback_update_multitask(task_id, result.id, result);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (result.id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("queue_results.push_back", {});
|
||||
LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
|
||||
queue_results.push_back(result);
|
||||
condition_results.notify_one();
|
||||
condition_results.notify_all();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
Generated
+3
-3
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1708118438,
|
||||
"narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=",
|
||||
"lastModified": 1708655239,
|
||||
"narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80",
|
||||
"rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
+361
-5
@@ -523,6 +523,17 @@ typedef struct {
|
||||
} block_iq2_xs;
|
||||
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
||||
|
||||
// 2.5625 bpw quants
|
||||
#define QR2_S 8
|
||||
#define QI2_S (QK_K / (4*QR2_S))
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t scales[QK_K/32];
|
||||
} block_iq2_s;
|
||||
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
|
||||
|
||||
#define QR3_XXS 8
|
||||
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
||||
typedef struct {
|
||||
@@ -1689,6 +1700,265 @@ static const __device__ uint64_t iq2xs_grid[512] = {
|
||||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
static const __device__ uint64_t iq2s_grid[1024] = {
|
||||
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
||||
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
||||
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
||||
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
||||
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
||||
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
||||
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
||||
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
||||
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
||||
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
||||
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
||||
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
||||
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
||||
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
||||
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
||||
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
||||
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
||||
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
||||
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
||||
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
||||
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
||||
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
||||
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
||||
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
||||
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
||||
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
||||
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
||||
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
||||
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
||||
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
||||
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
||||
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
||||
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
||||
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
||||
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
||||
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
||||
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
||||
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
||||
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
||||
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
||||
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
||||
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
||||
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
||||
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
||||
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
||||
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
||||
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
||||
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
||||
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
||||
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
||||
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
||||
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
||||
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
||||
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
||||
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
||||
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
||||
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
||||
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
||||
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
||||
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
||||
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
||||
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
||||
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
||||
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
||||
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
||||
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
||||
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
||||
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
||||
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
||||
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
||||
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
||||
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
||||
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
||||
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
||||
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
||||
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
||||
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
||||
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
||||
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
||||
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
||||
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
||||
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
||||
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
||||
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
||||
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
||||
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
||||
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
||||
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
||||
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
||||
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
||||
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
||||
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
||||
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
||||
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
||||
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
||||
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
||||
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
||||
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
||||
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
||||
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
||||
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
||||
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
||||
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
||||
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
||||
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
||||
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
||||
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
||||
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
||||
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
||||
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
||||
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
||||
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
||||
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
||||
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
||||
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
||||
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
||||
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
||||
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
||||
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
||||
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
||||
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
||||
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
||||
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
||||
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
||||
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
||||
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
||||
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
||||
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
||||
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
||||
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
||||
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
||||
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
||||
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
||||
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
||||
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
||||
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
||||
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
||||
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
||||
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
||||
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
||||
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
||||
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
||||
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
||||
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
||||
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
||||
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
||||
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
||||
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
||||
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
||||
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
||||
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
||||
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
||||
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
||||
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
||||
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
||||
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
||||
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
||||
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
||||
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
||||
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
||||
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
||||
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
||||
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
||||
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
||||
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
||||
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
||||
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
||||
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
||||
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
||||
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
||||
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
||||
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
||||
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
||||
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
||||
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
||||
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
||||
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
||||
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
||||
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
||||
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
||||
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
||||
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
||||
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
||||
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
||||
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
||||
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
||||
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
||||
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
||||
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
||||
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
||||
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
||||
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
||||
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
||||
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
||||
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
||||
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
||||
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
||||
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
||||
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
||||
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
||||
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
||||
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
||||
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
||||
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
||||
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
||||
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
||||
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
||||
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
||||
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
||||
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
||||
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
||||
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
||||
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
||||
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
||||
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
||||
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
||||
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
||||
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
||||
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
||||
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
||||
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
||||
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
||||
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
||||
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
||||
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
||||
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
||||
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
||||
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
||||
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
||||
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
||||
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
||||
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
||||
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
||||
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
||||
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
||||
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
||||
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
||||
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
||||
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
||||
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
||||
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
||||
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
||||
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
||||
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
||||
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
||||
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
||||
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
||||
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
||||
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
||||
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
||||
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
||||
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
||||
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
||||
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
||||
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
||||
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
static const __device__ uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
@@ -2037,6 +2307,27 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
|
||||
}
|
||||
|
||||
// Dequantizes IQ2_S super-blocks into dst_t values.
//   vx - array of block_iq2_s; the CUDA block index selects the super-block to expand
//   yy - output buffer; super-block i is written to yy[i*QK_K .. i*QK_K + QK_K)
// Each thread expands one group of 8 weights, so 32 threads cover QK_K == 256 weights.
// Any other QK_K is unsupported and trips assert(false).
template<typename dst_t>
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int block_idx = blockIdx.x;
    const block_iq2_s * blocks = (const block_iq2_s *) vx;

    const int lane = threadIdx.x;
#if QK_K == 256
    const int grp = lane/8; // 0...3: group of 8 weights inside a 32-weight slice
    const int sub = lane%8; // 0...7: 32-weight slice inside the super-block
    const block_iq2_s & blk = blocks[block_idx];

    dst_t * out = yy + block_idx*QK_K + 32*sub + 8*grp;

    // 10-bit grid index: low 8 bits come from qs, bits 8..9 from the matching 2-bit field of qh
    const int grid_idx = blk.qs[4*sub + grp] | ((blk.qh[sub] << (8 - 2*grp)) & 0x300);
    const uint8_t * grid = (const uint8_t *)(iq2s_grid + grid_idx);

    // groups 0,1 use the low nibble of the scale byte, groups 2,3 the high nibble
    const int scale4 = (blk.scales[sub] >> 4*(grp/2)) & 0xf;
    const float d = (float)blk.d * (0.5f + scale4) * 0.25f;

    // sign bytes live in the second half of qs; a set bit negates the corresponding weight
    const uint8_t signs = blk.qs[QK_K/8 + 4*sub + grp];
    for (int j = 0; j < 8; ++j) {
        const float sgn = (signs & kmask_iq2xs[j]) ? -1.f : 1.f;
        out[j] = d * grid[j] * sgn;
    }
#else
    assert(false);
#endif

}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
@@ -4800,6 +5091,54 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
#endif
|
||||
}
|
||||
|
||||
// Partial dot product between one 32-weight slice of an IQ2_S super-block and the
// matching Q8_1 block.
//   vbq   - pointer to the block_iq2_s being read
//   bq8_1 - Q8_1 blocks; entry [iqs] supplies 32 int8 values, and the low half of its
//           `ds` member is used as the Q8_1 scale
//   iqs   - index of the 32-weight slice inside the super-block
// Returns the scaled contribution of that slice. When built without DP4A support or
// with QK_K != 256 it asserts and returns 0.f.
// TODO(review): the original carried a bare "TODO" here with no detail - confirm what was pending.
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A && QK_K == 256 // DP4A is the lowest compute capability for integer intrinsics
    const block_iq2_s * blk = (const block_iq2_s *) vbq;

    const int sub = iqs;
    const int * q8_words = (const int *) bq8_1[sub].qs;   // 8 words = 32 int8 values
    const uint8_t * sign_bytes = blk->qs + QK_K/8 + 4*sub; // sign masks sit in the second half of qs
    const uint8_t packed_scales = blk->scales[sub];
    const uint8_t high_bits = blk->qh[sub];

    int sum_lo = 0; // groups 0 and 1, weighted by the low-nibble scale
    int sum_hi = 0; // groups 2 and 3, weighted by the high-nibble scale
    for (int l = 0; l < 4; ++l) {
        // 10-bit grid index: low 8 bits from qs, bits 8..9 from the matching 2-bit field of qh
        const int grid_idx = blk->qs[4*sub + l] | ((high_bits << (8 - 2*l)) & 0x300);
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + grid_idx);

        // expand each sign bit into a full 0x00/0xff byte lane (4 lanes per nibble)
        const uint32_t mask_lo = __vcmpeq4(((sign_bytes[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        const uint32_t mask_hi = __vcmpeq4(((sign_bytes[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);

        // (v ^ m) - m negates the byte lanes whose mask is 0xff and leaves the others unchanged
        const int vals_lo = __vsub4(grid[0] ^ mask_lo, mask_lo);
        const int vals_hi = __vsub4(grid[1] ^ mask_hi, mask_hi);

        int & acc = l < 2 ? sum_lo : sum_hi;
        acc = __dp4a(vals_lo, q8_words[2*l + 0], acc);
        acc = __dp4a(vals_hi, q8_words[2*l + 1], acc);
    }

    const float d = (float)blk->d * __low2float(bq8_1[sub].ds) * 0.25f;
    return d * ((0.5f + (packed_scales & 0xf)) * sum_lo + (0.5f + (packed_scales >> 4)) * sum_hi);
#else
    (void) ksigns64;
    assert(false);
    return 0.f;
#endif
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
@@ -6996,6 +7335,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
|
||||
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
// Host-side launcher: dequantizes k IQ2_S-encoded values from vx into y on `stream`.
// Launches k / QK_K CUDA blocks of 32 threads each, one block per super-block.
template<typename dst_t>
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int n_superblocks = k / QK_K;
    dequantize_block_iq2_s<<<n_superblocks, 32, 0, stream>>>(vx, y);
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
@@ -7057,6 +7402,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
return dequantize_row_iq2_xxs_cuda;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_cuda;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
return dequantize_row_iq2_s_cuda;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
@@ -7098,6 +7445,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
return dequantize_row_iq2_xxs_cuda;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_cuda;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
return dequantize_row_iq2_s_cuda;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
@@ -8079,8 +8428,8 @@ static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual
|
||||
*actual_size = look_ahead_size;
|
||||
g_cuda_pool_size[device] += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
|
||||
(uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
|
||||
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
|
||||
(uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
|
||||
#endif
|
||||
return ptr;
|
||||
}
|
||||
@@ -8166,7 +8515,7 @@ static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual
|
||||
g_cuda_pool_used[device] += size;
|
||||
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
|
||||
printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
|
||||
#endif
|
||||
|
||||
return ptr;
|
||||
@@ -8176,7 +8525,7 @@ static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
|
||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
|
||||
printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
|
||||
#endif
|
||||
|
||||
g_cuda_pool_used[device] -= size;
|
||||
@@ -8848,6 +9197,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
@@ -8874,6 +9224,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
@@ -8971,6 +9322,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
||||
mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
@@ -11710,7 +12065,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
||||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S) {
|
||||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
|
||||
a_type == GGML_TYPE_IQ2_S) {
|
||||
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
+31
-6
@@ -62,6 +62,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
||||
@@ -87,6 +88,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
||||
@@ -108,6 +110,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
||||
@@ -126,6 +129,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
@@ -144,6 +148,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
||||
@@ -458,6 +463,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
@@ -483,6 +489,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
@@ -504,6 +511,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
@@ -522,6 +530,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
@@ -540,6 +549,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
@@ -1358,6 +1368,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
@@ -1500,6 +1511,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
@@ -1544,9 +1561,9 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
|
||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||
src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S) { // || src0t == GGML_TYPE_Q4_K) {
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||
src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
||||
@@ -1658,6 +1675,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
||||
@@ -1803,6 +1821,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
@@ -1863,9 +1887,9 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
|
||||
}
|
||||
|
||||
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
|
||||
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
|
||||
src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S) { // || src2t == GGML_TYPE_Q4_K) {
|
||||
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
|
||||
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
|
||||
src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
|
||||
@@ -1925,6 +1949,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
||||
|
||||
@@ -2519,6 +2519,14 @@ typedef struct {
|
||||
} block_iq2_xs;
|
||||
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
||||
|
||||
// 2.5625 bpw quants
|
||||
// Storage layout of one IQ2_S quantization block covering QK_K weights.
// Size check, assuming QK_K = 256 and a 2-byte half: 2 + 64 + 8 + 8 = 82 bytes,
// i.e. 82*8/256 = 2.5625 bits per weight. Field order and sizes define the binary
// layout, so they must not be reordered or resized.
typedef struct {
|
||||
half d;  // per-block half-precision value; presumably the block-wide scale (confirm against the dequantize kernel)
|
||||
uint8_t qs[QK_K/4];  // QK_K/4 bytes of packed quant data; exact bit layout is not visible here
|
||||
uint8_t qh[QK_K/32];  // one byte per 32 weights; presumably extra high bits for grid indices (TODO confirm)
|
||||
uint8_t scales[QK_K/32];  // one byte per 32 weights; presumably packed sub-block scales (TODO confirm)
|
||||
} block_iq2_s;
|
||||
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[3*QK_K/8];
|
||||
@@ -3774,6 +3782,265 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
|
||||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
constexpr constant static uint64_t iq2s_grid[1024] = {
|
||||
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
||||
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
||||
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
||||
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
||||
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
||||
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
||||
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
||||
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
||||
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
||||
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
||||
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
||||
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
||||
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
||||
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
||||
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
||||
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
||||
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
||||
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
||||
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
||||
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
||||
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
||||
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
||||
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
||||
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
||||
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
||||
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
||||
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
||||
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
||||
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
||||
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
||||
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
||||
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
||||
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
||||
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
||||
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
||||
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
||||
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
||||
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
||||
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
||||
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
||||
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
||||
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
||||
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
||||
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
||||
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
||||
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
||||
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
||||
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
||||
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
||||
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
||||
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
||||
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
||||
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
||||
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
||||
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
||||
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
||||
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
||||
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
||||
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
||||
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
||||
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
||||
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
||||
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
||||
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
||||
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
||||
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
||||
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
||||
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
||||
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
||||
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
||||
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
||||
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
||||
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
||||
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
||||
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
||||
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
||||
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
||||
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
||||
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
||||
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
||||
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
||||
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
||||
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
||||
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
||||
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
||||
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
||||
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
||||
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
||||
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
||||
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
||||
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
||||
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
||||
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
||||
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
||||
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
||||
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
||||
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
||||
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
||||
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
||||
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
||||
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
||||
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
||||
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
||||
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
||||
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
||||
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
||||
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
||||
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
||||
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
||||
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
||||
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
||||
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
||||
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
||||
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
||||
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
||||
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
||||
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
||||
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
||||
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
||||
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
||||
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
||||
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
||||
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
||||
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
||||
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
||||
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
||||
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
||||
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
||||
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
||||
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
||||
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
||||
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
||||
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
||||
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
||||
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
||||
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
||||
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
||||
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
||||
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
||||
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
||||
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
||||
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
||||
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
||||
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
||||
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
||||
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
||||
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
||||
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
||||
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
||||
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
||||
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
||||
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
||||
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
||||
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
||||
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
||||
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
||||
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
||||
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
||||
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
||||
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
||||
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
||||
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
||||
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
||||
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
||||
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
||||
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
||||
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
||||
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
||||
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
||||
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
||||
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
||||
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
||||
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
||||
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
||||
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
||||
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
||||
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
||||
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
||||
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
||||
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
||||
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
||||
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
||||
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
||||
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
||||
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
||||
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
||||
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
||||
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
||||
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
||||
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
||||
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
||||
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
||||
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
||||
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
||||
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
||||
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
||||
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
||||
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
||||
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
||||
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
||||
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
||||
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
||||
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
||||
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
||||
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
||||
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
||||
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
||||
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
||||
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
||||
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
||||
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
||||
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
||||
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
||||
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
||||
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
||||
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
||||
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
||||
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
||||
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
||||
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
||||
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
||||
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
||||
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
||||
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
||||
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
||||
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
||||
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
||||
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
||||
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
||||
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
||||
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
||||
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
||||
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
||||
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
||||
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
||||
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
||||
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
||||
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
||||
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
||||
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
||||
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
||||
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
||||
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
||||
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
||||
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
||||
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
||||
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
||||
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
||||
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
||||
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
||||
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
||||
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
||||
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
||||
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
||||
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
||||
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
@@ -4572,6 +4839,139 @@ kernel void kernel_mul_mv_iq3_s_f32(
|
||||
kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
// Matrix-vector product of IQ2_S-quantized rows (src0) with a float vector (src1).
//
// Each simdgroup produces N_DST consecutive output rows starting at first_row.
// Every lane walks the 32-value sub-blocks of a row with a stride of 32 lanes,
// accumulates a partial dot product per row, and the partials are combined with
// simd_sum; lane 0 writes the result scaled by 0.25.
//
// shared_values is part of the signature but is not read here: the codebook is
// fetched directly from the constant iq2s_grid table.
void kernel_mul_mv_iq2_s_f32_impl(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne10,
        constant   int64_t & ne12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup int8_t * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    const int n_blocks = ne00/QK_K;              // blocks of QK_K values per src0 row
    const int n_sub    = n_blocks * (QK_K / 32); // 32-value sub-blocks per src0 row

    const int tg_x = tgpig.x;
    const int tg_y = tgpig.y;
    const int tg_z = tgpig.z;

    const int first_row   = (tg_x * N_SIMDGROUP + sgitg) * N_DST;
    const int first_block = first_row * n_blocks;

    const uint i12 = tg_z%ne12;
    const uint i13 = tg_z/ne12;

    const uint src0_offset = (i12/r2)*(n_blocks*ne01) + (i13/r3)*(n_blocks*ne01*ne02);

    device const block_iq2_s * rows = (device const block_iq2_s *) src0 + first_block + src0_offset;
    device const float       * vec  = (device const float       *) src1 + tg_y*ne10 + tg_z*ne00*ne1;

    float y_local[32];
    float acc[N_DST]={0.f};

    for (int ib32 = tiisg; ib32 < n_sub; ib32 += 32) {

        // stage the 32 src1 values that pair with this sub-block
        device const float * y_src = vec + 32 * ib32;
        for (int i = 0; i < 32; ++i) {
            y_local[i] = y_src[i];
        }

        const int ibl = ib32 / (QK_K / 32); // block index within the row
        const int ib  = ib32 % (QK_K / 32); // sub-block index within that block

        for (int row = 0; row < N_DST; row++) {

            // same sub-block position, `row` rows further down
            device const block_iq2_s * blk = rows + ibl + row*n_blocks;

            device const uint8_t * qs    = blk->qs + 4 * ib;
            device const uint8_t * signs = qs + QK_K/8;

            const int     hi_bits = blk->qh[ib];     // two extra index bits per group of 8
            const uint8_t scales  = blk->scales[ib]; // two packed 4-bit scales

            const float db = blk->d;
            const float d1 = db * (0.5f + (scales & 0xf));
            const float d2 = db * (0.5f + (scales >> 4));

            float sum_lo = 0.f; // first 16 values, weighted by d1
            float sum_hi = 0.f; // last 16 values, weighted by d2

            for (int l = 0; l < 2; ++l) {
                constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((hi_bits << (8-2*l)) & 0x300)));
                constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((hi_bits << (4-2*l)) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sum_lo += y_local[8*l + j +  0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
                    sum_hi += y_local[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
                }
            }

            acc[row] += d1 * sum_lo + d2 * sum_hi;
        }
    }

    for (int row = 0; row < N_DST; ++row) {
        const float all_sum = simd_sum(acc[row]);
        if (tiisg == 0) {
            dst[tg_y*ne0 + tg_z*ne0*ne1 + first_row + row] = all_sum * 0.25f;
        }
    }
}
|
||||
|
||||
// Kernel entry point for the IQ2_S x F32 matrix-vector product.
// It only forwards the arguments the implementation uses; the nb* strides and
// ne11 are part of the argument list but are not passed on.
[[host_name("kernel_mul_mv_iq2_s_f32")]]
kernel void kernel_mul_mv_iq2_s_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        threadgroup int8_t * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {

    kernel_mul_mv_iq2_s_f32_impl(
        src0,
        src1,
        dst,
        ne00,
        ne01,
        ne02,
        ne10,
        ne12,
        ne0,
        ne1,
        r2,
        r3,
        shared_values,
        tgpig,
        tiisg,
        sgitg);
}
|
||||
|
||||
void kernel_mul_mv_iq1_s_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -5188,6 +5588,25 @@ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 &
|
||||
}
|
||||
}
|
||||
|
||||
// Dequantize 16 values of an IQ2_S block into a 4x4 register tile.
// il is 0...15 for QK_K = 256: il/2 selects the group of 32 quants and il%2
// selects its first (0) or second (1) run of 16.
template <typename type4x4>
void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
    const int   ib32  = il/2;
    const short sub16 = il%2;
    const int   shift = 4*sub16; // picks the nibble belonging to this run of 16

    device const uint8_t * qs    = xb->qs + 4*ib32 + 2*sub16;
    device const uint8_t * signs = qs + QK_K/8;

    const uint8_t qh = xb->qh[ib32] >> shift;

    const float d  = xb->d;
    const float dl = d * (0.5f + ((xb->scales[ib32] >> shift) & 0xf)) * 0.25f;

    // 10-bit grid index: low 8 bits from qs, top 2 bits from qh
    constant uint8_t * grid_a = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
    constant uint8_t * grid_b = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));

    // rows 0-1 of the tile come from grid_a, rows 2-3 from grid_b
    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 4; ++c) {
            const int i = 4*r + c;
            reg[r+0][c] = dl * grid_a[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
            reg[r+2][c] = dl * grid_b[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
        }
    }
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
@@ -5762,6 +6181,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -5804,6 +6224,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -5858,6 +6279,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -6893,6 +7315,71 @@ kernel void kernel_mul_mv_id_iq3_s_f32(
|
||||
sgitg);
|
||||
}
|
||||
|
||||
// Indirect variant of the IQ2_S x F32 matrix-vector kernel: one of up to eight
// src0 buffers is chosen at run time through the ids buffer.
// tgpig.z is split in two: the quotient by ne12*ne13 selects the row of ids, src1
// and dst to work on; the remainder is forwarded as the z coordinate.
[[host_name("kernel_mul_mv_id_iq2_s_f32")]]
kernel void kernel_mul_mv_id_iq2_s_f32(
        device const    char * ids,
        device const    char * src1,
        device         float * dst,
        constant    uint64_t & nbi1,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
        constant     int64_t & ne02,
        constant    uint64_t & nb00,
        constant    uint64_t & nb01,
        constant    uint64_t & nb02,
        constant     int64_t & ne10,
        constant     int64_t & ne11,
        constant     int64_t & ne12,
        constant     int64_t & ne13,
        constant    uint64_t & nb10,
        constant    uint64_t & nb11,
        constant    uint64_t & nb12,
        constant     int64_t & ne0,
        constant     int64_t & ne1,
        constant    uint64_t & nb1,
        constant        uint & r2,
        constant        uint & r3,
        constant         int & idx,
        device const    char * src00,
        device const    char * src01,
        device const    char * src02,
        device const    char * src03,
        device const    char * src04,
        device const    char * src05,
        device const    char * src06,
        device const    char * src07,
        threadgroup   int8_t * shared_values [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiitg[[thread_index_in_threadgroup]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    device const char * candidates[8] = {src00, src01, src02, src03, src04, src05, src06, src07};

    const int64_t z_outer = tgpig.z/(ne12*ne13);

    tgpig.z = tgpig.z%(ne12*ne13);

    // entry idx of the selected ids row names the src0 buffer to use
    const int32_t chosen = ((device int32_t *) (ids + z_outer*nbi1))[idx];

    kernel_mul_mv_iq2_s_f32_impl(
        candidates[chosen],
        (device const float *) (src1 + z_outer*nb11),
        dst + z_outer*ne0,
        ne00,
        ne01,
        ne02,
        ne10,
        ne12,
        ne0,
        ne1,
        r2,
        r3,
        shared_values,
        tgpig,
        tiisg,
        sgitg);
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq1_s_f32(
|
||||
device const char * ids,
|
||||
|
||||
+796
-12
@@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
@@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3470,6 +3495,265 @@ static const uint64_t iq2xs_grid[512] = {
|
||||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
static const uint64_t iq2s_grid[1024] = {
|
||||
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
||||
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
||||
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
||||
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
||||
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
||||
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
||||
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
||||
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
||||
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
||||
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
||||
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
||||
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
||||
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
||||
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
||||
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
||||
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
||||
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
||||
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
||||
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
||||
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
||||
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
||||
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
||||
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
||||
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
||||
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
||||
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
||||
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
||||
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
||||
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
||||
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
||||
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
||||
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
||||
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
||||
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
||||
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
||||
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
||||
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
||||
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
||||
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
||||
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
||||
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
||||
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
||||
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
||||
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
||||
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
||||
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
||||
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
||||
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
||||
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
||||
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
||||
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
||||
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
||||
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
||||
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
||||
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
||||
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
||||
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
||||
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
||||
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
||||
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
||||
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
||||
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
||||
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
||||
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
||||
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
||||
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
||||
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
||||
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
||||
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
||||
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
||||
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
||||
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
||||
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
||||
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
||||
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
||||
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
||||
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
||||
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
||||
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
||||
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
||||
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
||||
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
||||
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
||||
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
||||
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
||||
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
||||
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
||||
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
||||
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
||||
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
||||
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
||||
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
||||
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
||||
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
||||
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
||||
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
||||
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
||||
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
||||
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
||||
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
||||
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
||||
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
||||
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
||||
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
||||
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
||||
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
||||
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
||||
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
||||
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
||||
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
||||
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
||||
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
||||
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
||||
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
||||
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
||||
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
||||
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
||||
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
||||
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
||||
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
||||
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
||||
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
||||
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
||||
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
||||
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
||||
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
||||
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
||||
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
||||
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
||||
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
||||
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
||||
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
||||
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
||||
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
||||
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
||||
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
||||
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
||||
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
||||
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
||||
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
||||
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
||||
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
||||
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
||||
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
||||
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
||||
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
||||
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
||||
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
||||
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
||||
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
||||
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
||||
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
||||
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
||||
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
||||
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
||||
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
||||
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
||||
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
||||
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
||||
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
||||
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
||||
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
||||
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
||||
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
||||
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
||||
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
||||
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
||||
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
||||
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
||||
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
||||
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
||||
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
||||
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
||||
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
||||
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
||||
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
||||
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
||||
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
||||
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
||||
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
||||
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
||||
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
||||
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
||||
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
||||
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
||||
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
||||
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
||||
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
||||
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
||||
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
||||
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
||||
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
||||
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
||||
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
||||
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
||||
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
||||
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
||||
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
||||
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
||||
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
||||
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
||||
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
||||
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
||||
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
||||
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
||||
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
||||
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
||||
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
||||
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
||||
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
||||
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
||||
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
||||
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
||||
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
||||
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
||||
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
||||
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
||||
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
||||
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
||||
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
||||
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
||||
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
||||
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
||||
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
||||
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
||||
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
||||
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
||||
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
||||
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
||||
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
||||
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
||||
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
||||
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
||||
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
||||
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
||||
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
||||
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
||||
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
||||
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
||||
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
||||
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
||||
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
||||
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
||||
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
||||
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
||||
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
||||
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
||||
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
||||
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
||||
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
||||
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
||||
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
||||
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
||||
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
||||
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
||||
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
static const uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
@@ -3771,6 +4055,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== 2.5625 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
float db[2];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint8_t * signs = qs + QK_K/8;
|
||||
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
|
||||
db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const float dl = db[l/2];
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
y += 8;
|
||||
}
|
||||
qs += 4;
|
||||
signs += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== 3.0625 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
||||
@@ -9305,6 +9621,210 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_iq2_s * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||
};
|
||||
|
||||
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
||||
|
||||
const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
|
||||
const uint8x16_t mask2 = vld1q_u8(k_mask2);
|
||||
const uint8x16_t m1 = vdupq_n_u8(1);
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
|
||||
uint8x16x2_t vs;
|
||||
ggml_int8x16x4_t q2s;
|
||||
ggml_int8x16x4_t q8b;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
const uint8_t * restrict qs = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||
q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
|
||||
vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
|
||||
q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
|
||||
vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
|
||||
q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
|
||||
vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
|
||||
q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
|
||||
vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
|
||||
qs += 8;
|
||||
|
||||
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
||||
vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
||||
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
||||
|
||||
q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
|
||||
q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
|
||||
|
||||
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
||||
vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
||||
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
||||
|
||||
signs += 4;
|
||||
|
||||
q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
|
||||
q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
|
||||
|
||||
const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
|
||||
const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
|
||||
const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
|
||||
|
||||
sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
|
||||
sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4));
|
||||
sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
|
||||
sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4));
|
||||
}
|
||||
sumf += d*(sumi1 + sumi2);
|
||||
}
|
||||
|
||||
*s = 0.125f * sumf;
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||
};
|
||||
|
||||
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||
};
|
||||
|
||||
const __m128i m4 = _mm_set1_epi8(0xf);
|
||||
const __m128i m1 = _mm_set1_epi8(1);
|
||||
|
||||
const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
|
||||
const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
|
||||
|
||||
uint64_t aux64;
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict qs = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
memcpy(&aux64, x[i].scales, 8);
|
||||
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
|
||||
const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
|
||||
|
||||
__m256i sumi1 = _mm256_setzero_si256();
|
||||
__m256i sumi2 = _mm256_setzero_si256();
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
|
||||
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
|
||||
iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
|
||||
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
|
||||
const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
|
||||
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
|
||||
iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
|
||||
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
||||
qs += 8;
|
||||
|
||||
__m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
|
||||
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
||||
const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
|
||||
const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
|
||||
|
||||
aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
|
||||
aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
|
||||
const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
|
||||
const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
|
||||
|
||||
signs += 4;
|
||||
|
||||
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
|
||||
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
|
||||
|
||||
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
|
||||
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
|
||||
sumi1 = _mm256_add_epi32(sumi1, p1);
|
||||
sumi2 = _mm256_add_epi32(sumi2, p2);
|
||||
}
|
||||
|
||||
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
||||
|
||||
}
|
||||
|
||||
*s = 0.125f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint8_t * signs = qs + QK_K/8;
|
||||
|
||||
int bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
||||
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += ls1 * sumi1 + ls2 * sumi2;
|
||||
qs += 4;
|
||||
signs += 4;
|
||||
}
|
||||
|
||||
sumf += d * bsum;
|
||||
}
|
||||
|
||||
*s = 0.125f * sumf;
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
@@ -9488,8 +10008,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
qs += 16;
|
||||
|
||||
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
|
||||
vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
||||
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
||||
|
||||
@@ -9497,8 +10017,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
|
||||
|
||||
vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
|
||||
vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
|
||||
vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
|
||||
vs.val[0] = vceqq_u8(vs.val[0], mask2);
|
||||
vs.val[1] = vceqq_u8(vs.val[1], mask2);
|
||||
|
||||
@@ -9909,22 +10429,25 @@ typedef struct {
|
||||
uint16_t * neighbours;
|
||||
} iq2_entry_t;
|
||||
|
||||
// Lookup tables (grid, map, neighbours) for the grid-based quant types;
// one slot per index returned by iq2_data_index(), all empty until initialised.
static iq2_entry_t iq2_data[4] = {
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
};
|
||||
|
||||
static inline int iq2_data_index(enum ggml_type type) {
|
||||
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
||||
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
||||
return type == GGML_TYPE_IQ2_XXS ? 0 :
|
||||
type == GGML_TYPE_IQ2_XS ? 1 : 2;
|
||||
type == GGML_TYPE_IQ2_XS ? 1 :
|
||||
type == GGML_TYPE_IQ1_S ? 2 : 3;
|
||||
}
|
||||
|
||||
static inline int iq2_grid_size(enum ggml_type type) {
|
||||
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
||||
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
||||
return type == GGML_TYPE_IQ2_XXS ? 256 :
|
||||
type == GGML_TYPE_IQ2_XS ? 512 : 512;
|
||||
type == GGML_TYPE_IQ2_XS ? 512 :
|
||||
type == GGML_TYPE_IQ1_S ? 512 : 1024;
|
||||
}
|
||||
|
||||
static int iq2_compare_func(const void * left, const void * right) {
|
||||
@@ -10025,11 +10548,79 @@ void iq2xs_init_impl(enum ggml_type type) {
|
||||
41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
|
||||
42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
|
||||
};
|
||||
static const uint16_t kgrid_2bit_1024[1024] = {
|
||||
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
||||
73, 80, 82, 85, 88, 97, 100, 102, 105, 128, 130, 133, 136, 145, 148, 160,
|
||||
165, 170, 257, 260, 262, 265, 272, 274, 277, 280, 289, 292, 320, 322, 325, 328,
|
||||
337, 340, 342, 345, 352, 357, 360, 385, 388, 400, 402, 405, 417, 420, 512, 514,
|
||||
517, 520, 529, 532, 544, 554, 577, 580, 582, 585, 592, 597, 640, 645, 650, 660,
|
||||
674, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1062, 1065, 1088, 1090, 1093,
|
||||
1096, 1098, 1105, 1108, 1110, 1113, 1120, 1122, 1125, 1153, 1156, 1158, 1161, 1168, 1173, 1176,
|
||||
1185, 1188, 1280, 1282, 1285, 1288, 1290, 1297, 1300, 1302, 1305, 1312, 1317, 1320, 1345, 1348,
|
||||
1350, 1353, 1360, 1362, 1365, 1368, 1377, 1380, 1408, 1410, 1413, 1416, 1425, 1428, 1440, 1537,
|
||||
1540, 1542, 1545, 1552, 1557, 1600, 1605, 1608, 1617, 1620, 1632, 1665, 1668, 1680, 2048, 2050,
|
||||
2053, 2056, 2065, 2068, 2070, 2073, 2080, 2085, 2090, 2113, 2116, 2118, 2121, 2128, 2130, 2133,
|
||||
2136, 2145, 2148, 2176, 2181, 2196, 2218, 2305, 2308, 2320, 2322, 2325, 2328, 2337, 2368, 2373,
|
||||
2376, 2385, 2388, 2400, 2433, 2448, 2560, 2577, 2580, 2594, 2600, 2602, 2640, 2713, 4097, 4100,
|
||||
4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4134, 4160, 4162, 4165, 4168, 4177, 4180, 4182,
|
||||
4185, 4192, 4194, 4197, 4200, 4225, 4228, 4230, 4240, 4245, 4248, 4257, 4260, 4352, 4354, 4357,
|
||||
4360, 4362, 4369, 4372, 4374, 4377, 4384, 4386, 4389, 4392, 4417, 4420, 4422, 4425, 4432, 4434,
|
||||
4437, 4440, 4449, 4452, 4480, 4482, 4485, 4488, 4497, 4500, 4609, 4612, 4617, 4624, 4629, 4641,
|
||||
4644, 4672, 4677, 4689, 4692, 4737, 4740, 4752, 5120, 5122, 5125, 5128, 5137, 5140, 5142, 5145,
|
||||
5152, 5157, 5160, 5185, 5188, 5190, 5193, 5200, 5202, 5205, 5208, 5217, 5220, 5248, 5250, 5253,
|
||||
5256, 5265, 5268, 5280, 5377, 5380, 5382, 5385, 5392, 5394, 5397, 5400, 5409, 5412, 5440, 5442,
|
||||
5445, 5448, 5457, 5460, 5472, 5505, 5508, 5520, 5632, 5637, 5640, 5649, 5652, 5664, 5697, 5700,
|
||||
5712, 5760, 5802, 6145, 6148, 6150, 6153, 6160, 6165, 6168, 6177, 6208, 6210, 6213, 6216, 6225,
|
||||
6228, 6240, 6273, 6276, 6400, 6402, 6405, 6408, 6417, 6420, 6432, 6465, 6468, 6480, 6505, 6562,
|
||||
6660, 6672, 6720, 6742, 8192, 8194, 8197, 8200, 8209, 8212, 8214, 8217, 8224, 8229, 8234, 8257,
|
||||
8260, 8272, 8274, 8277, 8292, 8320, 8330, 8340, 8362, 8449, 8452, 8464, 8466, 8469, 8481, 8512,
|
||||
8514, 8517, 8529, 8532, 8544, 8577, 8580, 8592, 8704, 8714, 8738, 8744, 8746, 8772, 8784, 8840,
|
||||
8842, 8872, 9217, 9220, 9222, 9225, 9232, 9237, 9240, 9249, 9252, 9280, 9282, 9285, 9288, 9297,
|
||||
9300, 9312, 9345, 9348, 9360, 9472, 9477, 9480, 9489, 9492, 9504, 9537, 9540, 9552, 9574, 9600,
|
||||
9729, 9732, 9744, 9792, 9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
|
||||
10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
|
||||
16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
|
||||
16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
|
||||
16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
|
||||
16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
|
||||
17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
|
||||
17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
|
||||
17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
|
||||
17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
|
||||
18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
|
||||
18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
|
||||
18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
|
||||
20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
|
||||
20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
|
||||
20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
|
||||
21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
|
||||
21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
|
||||
22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
|
||||
22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
|
||||
24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
|
||||
24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
|
||||
25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
|
||||
26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
|
||||
32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
|
||||
33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
|
||||
33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
|
||||
33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
|
||||
34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
|
||||
35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
|
||||
36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
|
||||
37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
|
||||
38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
|
||||
39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
|
||||
41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
|
||||
42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
|
||||
};
|
||||
|
||||
const int kmap_size = 43692;
|
||||
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
||||
//const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
|
||||
const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
|
||||
const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
|
||||
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : kgrid_1bit_512;
|
||||
type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 :
|
||||
type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024;
|
||||
uint64_t * kgrid_q2xs;
|
||||
int * kmap_q2xs;
|
||||
uint16_t * kneighbors_q2xs;
|
||||
@@ -10126,7 +10717,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
||||
}
|
||||
|
||||
void iq2xs_free_impl(enum ggml_type type) {
|
||||
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
|
||||
GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
|
||||
const int gindex = iq2_data_index(type);
|
||||
if (iq2_data[gindex].grid) {
|
||||
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
||||
@@ -11532,3 +12123,196 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
|
||||
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
||||
}
|
||||
|
||||
// =============================== 2.5625 bpw
|
||||
|
||||
static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
||||
|
||||
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
||||
|
||||
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
||||
const int * kmap_q2xs = iq2_data[gindex].map;
|
||||
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
||||
|
||||
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int kMaxQ = 3;
|
||||
|
||||
const int nbl = n/256;
|
||||
|
||||
block_iq2_s * y = vy;
|
||||
|
||||
float scales[QK_K/16];
|
||||
float weight[16];
|
||||
float xval[16];
|
||||
int8_t L[16];
|
||||
int8_t Laux[16];
|
||||
float waux[16];
|
||||
bool is_on_grid[2];
|
||||
bool is_on_grid_aux[2];
|
||||
uint8_t block_signs[2];
|
||||
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) {
|
||||
|
||||
memset(&y[ibl], 0, sizeof(block_iq2_s));
|
||||
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
||||
|
||||
float max_scale = 0;
|
||||
|
||||
const float * xbl = x + QK_K*ibl;
|
||||
float sumx2 = 0;
|
||||
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
||||
float sigma2 = 2*sumx2/QK_K;
|
||||
|
||||
for (int ib = 0; ib < QK_K/16; ++ib) {
|
||||
const float * xb = xbl + 16*ib;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
||||
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
||||
} else {
|
||||
for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
|
||||
}
|
||||
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
uint8_t s = 0;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
||||
else {
|
||||
xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
|
||||
}
|
||||
}
|
||||
block_signs[k] = s;
|
||||
}
|
||||
float max = xval[0];
|
||||
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
||||
if (!max) {
|
||||
scales[ib] = 0;
|
||||
continue;
|
||||
}
|
||||
float best = 0;
|
||||
float scale = max/(2*kMaxQ-1);
|
||||
is_on_grid[0] = is_on_grid[1] = true;
|
||||
for (int is = -9; is <= 9; ++is) {
|
||||
float id = (2*kMaxQ-1+is*0.1f)/max;
|
||||
float this_scale = 1/id;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
||||
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
||||
}
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
||||
int grid_index = kmap_q2xs[u];
|
||||
is_on_grid_aux[k] = true;
|
||||
if (grid_index < 0) {
|
||||
is_on_grid_aux[k] = false;
|
||||
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
||||
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
||||
}
|
||||
}
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
float w = weight[i];
|
||||
float q = 2*Laux[i] + 1;
|
||||
sumqx += w*xval[i]*q;
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
||||
scale = sumqx/sumq2; best = scale*sumqx;
|
||||
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
||||
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
||||
}
|
||||
}
|
||||
int n_not_ongrid = 0;
|
||||
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
||||
if (n_not_ongrid > 0 && scale > 0) {
|
||||
float id = 1/scale;
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
if (is_on_grid[k]) continue;
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
||||
l = MAX(0, MIN(kMaxQ-1, l));
|
||||
u |= (l << 2*i);
|
||||
L[8*k + i] = l;
|
||||
}
|
||||
int grid_index = kmap_q2xs[u];
|
||||
if (grid_index < 0) {
|
||||
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
||||
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
||||
}
|
||||
}
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
float w = weight[i];
|
||||
float q = 2*L[i] + 1;
|
||||
sumqx += w*xval[i]*q;
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0) scale = sumqx/sumq2;
|
||||
}
|
||||
if (scale < 0) {
|
||||
scale = -scale;
|
||||
for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
|
||||
}
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
||||
int grid_index = kmap_q2xs[u];
|
||||
if (grid_index < 0) {
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
const int i8 = 2*ib + k;
|
||||
y[ibl].qs[i8] = grid_index & 255;
|
||||
y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
|
||||
y[ibl].qs[QK_K/8 + i8] = block_signs[k];
|
||||
}
|
||||
GGML_ASSERT(scale >= 0);
|
||||
scales[ib] = scale;
|
||||
max_scale = MAX(max_scale, scale);
|
||||
}
|
||||
|
||||
if (!max_scale) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float d = max_scale/31;
|
||||
y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
|
||||
float id = 1/d;
|
||||
for (int ib = 0; ib < QK_K/16; ++ib) {
|
||||
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
||||
l = MAX(0, MIN(15, l));
|
||||
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
||||
else y[ibl].scales[ib/2] |= (l << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int nblock = n_per_row/QK_K;
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
|
||||
src += n_per_row;
|
||||
qrow += nblock*sizeof(block_iq2_s);
|
||||
}
|
||||
return nrow * nblock * sizeof(block_iq2_s);
|
||||
}
|
||||
|
||||
// Quantizes one row of k floats (k % QK_K == 0) into IQ2_S blocks at y,
// without importance weights or histogram.
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
    assert(k % QK_K == 0);
    const int n_rows = 1;
    (void) quantize_iq2_s(x, y, n_rows, k, NULL, NULL);
}
|
||||
|
||||
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq2_s * restrict y = vy;
|
||||
quantize_row_iq2_s_reference(x, y, k);
|
||||
}
|
||||
|
||||
@@ -182,6 +182,15 @@ typedef struct {
|
||||
} block_iq2_xs;
|
||||
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
||||
|
||||
// 2.5625 bpw quants
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t scales[QK_K/32];
|
||||
} block_iq2_s;
|
||||
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
|
||||
|
||||
// (Almost) "true" 3-bit quantization.
|
||||
// Due to the need to use blocks as per ggml design, it ends up using
|
||||
// 3.0625 bpw because of the 16-bit scale for each block of 256.
|
||||
@@ -242,6 +251,7 @@ void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGM
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
@@ -259,6 +269,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
@@ -276,6 +287,7 @@ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRI
|
||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
@@ -295,6 +307,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
@@ -305,6 +318,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
//
|
||||
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
+166
-82
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
|
||||
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
|
||||
}
|
||||
|
||||
static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
|
||||
const sycl::nd_item<3> &item_ct1, float *buf) {
|
||||
|
||||
template <bool vals_smem, int ncols_template, int block_size_template>
|
||||
static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
|
||||
const int nrows_y, const float scale, const float max_bias, const float m0,
|
||||
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
|
||||
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int rowx = item_ct1.get_group(2);
|
||||
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
|
||||
|
||||
const int block_size = item_ct1.get_local_range(2);
|
||||
const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
|
||||
|
||||
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
|
||||
float slope = 0.0f;
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const uint32_t h = rowx/nrows_y; // head index
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slope = sycl::pow(base, float(exp));
|
||||
}
|
||||
|
||||
float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
|
||||
float max_val = -INFINITY;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
|
||||
|
||||
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
||||
|
||||
vals[col] = val;
|
||||
max_val = sycl::max(max_val, val);
|
||||
}
|
||||
|
||||
// find the max value in the block
|
||||
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
if (warp_id == 0) {
|
||||
buf[lane_id] = -INFINITY;
|
||||
}
|
||||
/*
|
||||
DPCT1118:12: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf[warp_id] = max_val;
|
||||
}
|
||||
/*
|
||||
DPCT1118:13: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
max_val = buf[lane_id];
|
||||
max_val = warp_reduce_max(max_val, item_ct1);
|
||||
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
|
||||
float tmp = 0.f;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
const float val =
|
||||
sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
|
||||
#pragma unroll
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
const float val = sycl::native::exp(vals[col] - max_val);
|
||||
tmp += val;
|
||||
dst[ix] = val;
|
||||
vals[col] = val;
|
||||
}
|
||||
|
||||
// find the sum of exps in the block
|
||||
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
if (warp_id == 0) {
|
||||
buf[lane_id] = 0.f;
|
||||
}
|
||||
/*
|
||||
DPCT1118:14: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:15: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
tmp = buf[lane_id];
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
const float inv_tmp = 1.f / tmp;
|
||||
const float inv_sum = 1.f / tmp;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int i = rowx*ncols + col;
|
||||
dst[i] *= inv_tmp;
|
||||
#pragma unroll
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int idst = rowx*ncols + col;
|
||||
dst[idst] = vals[col] * inv_sum;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10867,35 +10869,96 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
|
||||
});
|
||||
}
|
||||
|
||||
static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
|
||||
const int ncols_x, const int nrows_x,
|
||||
const int nrows_y, const float scale,
|
||||
// Enqueues one launch of the soft_max_f32 kernel on `stream`.
//
// Template parameters (vals_smem, ncols_template, block_size_template) are
// forwarded unchanged to soft_max_f32, so each distinct combination selects a
// separate kernel instantiation.
//
// Parameters:
//   x, mask, pos, dst - pointers passed straight through to the kernel
//   ncols_par, nrows_y, scale, max_bias, m0, m1, n_head_log2
//                     - scalar kernel arguments, passed through unchanged
//   block_nums        - number of work-groups per dimension
//   block_dims        - work-group size per dimension
//   n_local_scratch   - number of float elements of work-group local memory
//                       handed to the kernel as its `buf` argument
//   stream            - queue the command group is submitted to
//
// The event returned by submit() is discarded and nothing here waits on it.
template <bool vals_smem, int ncols_template, int block_size_template>
|
||||
static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
|
||||
const int nrows_y, const float scale, const float max_bias, const float m0,
|
||||
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
|
||||
const size_t n_local_scratch, dpct::queue_ptr stream) {
|
||||
// The command-group lambda captures by reference.
// NOTE(review): presumably safe because submit() runs the command group
// function before returning - confirm against the SYCL runtime in use.
stream->submit([&](sycl::handler &cgh) {
|
||||
// Local scratch buffer of n_local_scratch floats for the kernel.
sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
// Global range is block_nums * block_dims; local (work-group) range is block_dims.
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
// The kernel lambda captures by value and requires a sub-group size of 32.
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
|
||||
nrows_y, scale, max_bias, m0,
|
||||
m1, n_head_log2, item_ct1,
|
||||
local_buf_acc.get_pointer());
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
|
||||
float * dst, const int ncols_x, const int nrows_x,
|
||||
const int nrows_y, const float scale, const float max_bias,
|
||||
dpct::queue_ptr stream) {
|
||||
int nth = WARP_SIZE;
|
||||
while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const sycl::range<3> block_dims(1, 1, nth);
|
||||
const sycl::range<3> block_nums(1, 1, nrows_x);
|
||||
/*
|
||||
DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
|
||||
limit. To get the device limit, query info::device::max_work_group_size.
|
||||
Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
/*
|
||||
DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
|
||||
replaced with a value. Modify the code to use the original expression,
|
||||
provided in comments, if it is correct.
|
||||
*/
|
||||
sycl::local_accessor<float, 1> buf_acc_ct1(
|
||||
sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
|
||||
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
|
||||
static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
|
||||
buf_acc_ct1.get_pointer());
|
||||
});
|
||||
});
|
||||
const uint32_t n_head_kv = nrows_x/nrows_y;
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
|
||||
if (n_local_scratch*sizeof(float) < local_mem_size) {
|
||||
switch (ncols_x) {
|
||||
case 32:
|
||||
soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 64:
|
||||
soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 128:
|
||||
soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 256:
|
||||
soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 512:
|
||||
soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 1024:
|
||||
soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 2048:
|
||||
soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 4096:
|
||||
soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
default:
|
||||
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, WARP_SIZE, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -12435,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows_x = ggml_nrows(src0);
|
||||
const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
|
||||
const int64_t nrows_y = src0->ne[1];
|
||||
|
||||
float scale = 1.0f;
|
||||
memcpy(&scale, dst->op_params, sizeof(float));
|
||||
float max_bias = 0.0f;
|
||||
|
||||
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
||||
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
||||
|
||||
(void) dst;
|
||||
// positions tensor
|
||||
float * src2_dd = nullptr;
|
||||
sycl_pool_alloc<float> src2_f;
|
||||
|
||||
ggml_tensor * src2 = dst->src[2];
|
||||
const bool use_src2 = src2 != nullptr;
|
||||
|
||||
if (use_src2) {
|
||||
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
if (src2_on_device) {
|
||||
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
||||
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
||||
} else {
|
||||
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
|
||||
}
|
||||
}
|
||||
|
||||
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
|
||||
nrows_x, nrows_y, scale, max_bias, main_stream);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
|
||||
@@ -690,6 +690,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ2_S] = {
|
||||
.type_name = "iq2_s",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_iq2_s),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
|
||||
.from_float = quantize_row_iq2_s,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
|
||||
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ1_S] = {
|
||||
.type_name = "iq1_s",
|
||||
.blck_size = QK_K,
|
||||
@@ -2317,6 +2329,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||
case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
|
||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||
}
|
||||
@@ -7752,6 +7765,7 @@ static void ggml_compute_forward_add(
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
ggml_compute_forward_add_q_f32(params, dst);
|
||||
} break;
|
||||
@@ -8032,6 +8046,7 @@ static void ggml_compute_forward_add1(
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
ggml_compute_forward_add1_q_f32(params, dst);
|
||||
} break;
|
||||
@@ -8157,6 +8172,7 @@ static void ggml_compute_forward_acc(
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@@ -11056,6 +11072,7 @@ static void ggml_compute_forward_out_prod(
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
ggml_compute_forward_out_prod_q_f32(params, dst);
|
||||
} break;
|
||||
@@ -11245,6 +11262,7 @@ static void ggml_compute_forward_set(
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@@ -11448,6 +11466,7 @@ static void ggml_compute_forward_get_rows(
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
ggml_compute_forward_get_rows_q(params, dst);
|
||||
} break;
|
||||
@@ -12149,6 +12168,7 @@ static void ggml_compute_forward_alibi(
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
@@ -12233,6 +12253,7 @@ static void ggml_compute_forward_clamp(
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
@@ -19482,6 +19503,7 @@ void ggml_quantize_init(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break;
|
||||
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
||||
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
|
||||
@@ -19768,6 +19790,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||
result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
GGML_ASSERT(start % n_per_row == 0);
|
||||
size_t start_row = start / n_per_row;
|
||||
size_t row_size = ggml_row_size(type, n_per_row);
|
||||
result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
|
||||
@@ -351,6 +351,7 @@ extern "C" {
|
||||
GGML_TYPE_IQ1_S = 19,
|
||||
GGML_TYPE_IQ4_NL = 20,
|
||||
GGML_TYPE_IQ3_S = 21,
|
||||
GGML_TYPE_IQ2_S = 22,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
@@ -391,6 +392,7 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
||||
@@ -64,6 +64,15 @@ extern "C" {
|
||||
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
||||
// RoPE variant identifier; this is the return type of llama_rope_type(model).
// The explicit values are non-contiguous (-1, 0, 2, 4).
// NOTE(review): presumably they mirror the mode values ggml_rope expects, per
// the synchronization note preceding this enum - confirm before changing any.
enum llama_rope_type {
|
||||
// NOTE(review): presumably means "no RoPE applied" - confirm.
LLAMA_ROPE_TYPE_NONE = -1,
|
||||
LLAMA_ROPE_TYPE_NORM = 0,
|
||||
LLAMA_ROPE_TYPE_NEOX = 2,
|
||||
LLAMA_ROPE_TYPE_GLM = 4,
|
||||
};
|
||||
|
||||
enum llama_token_type {
|
||||
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
||||
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
||||
@@ -98,12 +107,14 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
@@ -360,6 +371,7 @@ extern "C" {
|
||||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
@@ -514,10 +526,12 @@ extern "C" {
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_shift(
|
||||
LLAMA_API void llama_kv_cache_seq_add(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
@@ -525,7 +539,9 @@ extern "C" {
|
||||
llama_pos delta);
|
||||
|
||||
// Integer division of the positions by factor of `d > 1`
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_div(
|
||||
@@ -535,6 +551,20 @@ extern "C" {
|
||||
llama_pos p1,
|
||||
int d);
|
||||
|
||||
// Returns the largest position present in the KV cache for the specified sequence
|
||||
LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Defragment the KV cache
|
||||
// This will be applied:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
|
||||
|
||||
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
||||
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// State / sessions
|
||||
//
|
||||
|
||||
@@ -1916,7 +1916,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q6_K,
|
||||
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
|
||||
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
||||
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S,
|
||||
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S,
|
||||
};
|
||||
|
||||
@@ -150,6 +150,7 @@ int main(int argc, char * argv[]) {
|
||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||
const float max_quantization_error =
|
||||
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
@@ -168,7 +169,8 @@ int main(int argc, char * argv[]) {
|
||||
|
||||
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
||||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
||||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
: MAX_DOT_PRODUCT_ERROR;
|
||||
failed = !(vec_dot_error < max_allowed_error);
|
||||
num_failed += failed;
|
||||
|
||||
@@ -404,7 +404,8 @@ static std::unordered_map<uint32_t, int> codepoint_type_map() {
|
||||
|
||||
static int codepoint_type(uint32_t cp) {
|
||||
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
||||
return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
|
||||
const auto it = codepoint_types.find(cp);
|
||||
return it == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
|
||||
}
|
||||
|
||||
static int codepoint_type(const std::string & utf8) {
|
||||
|
||||
Reference in New Issue
Block a user