mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 10:46:43 +02:00
Compare commits
39 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1f30b7a9f1 | |||
| 9d533a77d0 | |||
| cbbd1efa06 | |||
| b11a93df41 | |||
| a33e6a0d2a | |||
| 47bb7b48c7 | |||
| c4d7f81786 | |||
| e849078c6e | |||
| 67fd33132f | |||
| 4804215cb8 | |||
| 8a533f0d90 | |||
| 269de86ba0 | |||
| c393733988 | |||
| e3965cf35a | |||
| 8b350356b2 | |||
| bf08e00643 | |||
| f7625019c5 | |||
| abbabc5e51 | |||
| f1a98c5254 | |||
| 7d548a1827 | |||
| 930b178026 | |||
| d52d7819b8 | |||
| 1289408817 | |||
| ab336a9d5e | |||
| 69917dfa55 | |||
| 9e359a4f47 | |||
| 4c4cb30736 | |||
| 525213d2f5 | |||
| fd43d66f46 | |||
| 54fbcd2ce6 | |||
| 15499eb942 | |||
| 96633eeca1 | |||
| 847eedbdb2 | |||
| 7e4f339c40 | |||
| 334f76fa38 | |||
| efd56b1c21 | |||
| 201294ae17 | |||
| 5a9e2f60ba | |||
| 373ee3fbba |
@@ -0,0 +1,37 @@
|
||||
{
|
||||
lib,
|
||||
dockerTools,
|
||||
buildEnv,
|
||||
llama-cpp,
|
||||
interactive ? true,
|
||||
coreutils,
|
||||
}:
|
||||
|
||||
# A tar that can be fed into `docker load`:
|
||||
#
|
||||
# $ nix build .#llamaPackages.docker
|
||||
# $ docker load < result
|
||||
|
||||
# For details and variations cf.
|
||||
# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
|
||||
# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
|
||||
# - https://nixery.dev/
|
||||
|
||||
# Approximate (compressed) sizes, at the time of writing, are:
|
||||
#
|
||||
# .#llamaPackages.docker: 125M;
|
||||
# .#llamaPackagesCuda.docker: 537M;
|
||||
# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
|
||||
|
||||
dockerTools.buildLayeredImage {
|
||||
name = llama-cpp.pname;
|
||||
tag = "latest";
|
||||
|
||||
contents =
|
||||
[ llama-cpp ]
|
||||
++ lib.optionals interactive [
|
||||
coreutils
|
||||
dockerTools.binSh
|
||||
dockerTools.caCertificates
|
||||
];
|
||||
}
|
||||
@@ -12,5 +12,8 @@ lib.makeScope newScope (
|
||||
self: {
|
||||
inherit llamaVersion;
|
||||
llama-cpp = self.callPackage ./package.nix { };
|
||||
docker = self.callPackage ./docker.nix { };
|
||||
docker-min = self.callPackage ./docker.nix { interactive = false; };
|
||||
sif = self.callPackage ./sif.nix { };
|
||||
}
|
||||
)
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
{
|
||||
lib,
|
||||
singularity-tools,
|
||||
llama-cpp,
|
||||
bashInteractive,
|
||||
interactive ? false,
|
||||
}:
|
||||
|
||||
let
|
||||
optionalInt = cond: x: if cond then x else 0;
|
||||
in
|
||||
singularity-tools.buildImage rec {
|
||||
inherit (llama-cpp) name;
|
||||
contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
|
||||
|
||||
# These are excessive (but safe) for most variants. Building singularity
|
||||
# images requires superuser privileges, so we build them inside a VM in a
|
||||
# writable image of pre-determined size.
|
||||
#
|
||||
# ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
|
||||
#
|
||||
# Expected image sizes:
|
||||
# - cpu/blas: 150M,
|
||||
# - cuda, all gencodes: 560M,
|
||||
diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
|
||||
memSize = diskSize;
|
||||
}
|
||||
@@ -7,3 +7,5 @@ assignees: ''
|
||||
---
|
||||
|
||||
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
|
||||
|
||||
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
|
||||
|
||||
@@ -669,8 +669,7 @@ jobs:
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
|
||||
# Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
|
||||
./gradlew build --no-daemon -Pskip-armeabi-v7a
|
||||
./gradlew build --no-daemon
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
# Server build and tests
|
||||
name: Server
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug, Release]
|
||||
include:
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
exclude:
|
||||
- build_type: Release
|
||||
sanitizer: ADDRESS
|
||||
- build_type: Release
|
||||
sanitizer: THREAD
|
||||
- build_type: Release
|
||||
sanitizer: UNDEFINED
|
||||
|
||||
container:
|
||||
image: ubuntu:latest
|
||||
ports:
|
||||
- 8888
|
||||
options: --cpus 4
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get -y install \
|
||||
build-essential \
|
||||
git \
|
||||
cmake \
|
||||
python3-pip \
|
||||
wget \
|
||||
psmisc
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
|
||||
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
pip install -r examples/server/tests/requirements.txt
|
||||
|
||||
- name: Download models
|
||||
id: download_models
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_test
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
PORT=8888 ./tests.sh
|
||||
+8
-2
@@ -936,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
# Raspberry Pi 2
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
|
||||
# Android armeabi-v7a
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
else()
|
||||
# Raspberry Pi 2
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
endif()
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Android arm64-v8a
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
|
||||
@@ -381,8 +381,13 @@ ifdef LLAMA_BLIS
|
||||
endif # LLAMA_BLIS
|
||||
|
||||
ifdef LLAMA_CUBLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
|
||||
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
|
||||
ifneq ('', '$(wildcard /opt/cuda)')
|
||||
CUDA_PATH ?= /opt/cuda
|
||||
else
|
||||
CUDA_PATH ?= /usr/local/cuda
|
||||
endif
|
||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
||||
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
||||
OBJS += ggml-cuda.o
|
||||
MK_NVCCFLAGS += -use_fast_math
|
||||
ifdef LLAMA_FATAL_WARNINGS
|
||||
@@ -597,7 +602,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||
ifndef CUDA_DOCKER_ARCH
|
||||
ifndef CUDA_POWER_ARCH
|
||||
|
||||
@@ -114,6 +114,9 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
|
||||
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
||||
|
||||
**HTTP server**
|
||||
|
||||
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
|
||||
|
||||
**Bindings:**
|
||||
|
||||
@@ -155,6 +158,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
- [semperai/amica](https://github.com/semperai/amica)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
||||
- [Msty](https://msty.app) (proprietary)
|
||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||
|
||||
---
|
||||
|
||||
|
||||
+18
-9
@@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
||||
else { invalid_param = true; break; }
|
||||
} else if (arg == "--rope-scale") {
|
||||
if (++i >= argc) {
|
||||
@@ -335,6 +335,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.yarn_beta_slow = std::stof(argv[i]);
|
||||
} else if (arg == "--defrag-thold" || arg == "-dt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.defrag_thold = std::stof(argv[i]);
|
||||
} else if (arg == "--samplers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -630,11 +636,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
std::string arg_next = argv[i];
|
||||
if (arg_next == "none") {
|
||||
params.split_mode = LLAMA_SPLIT_NONE;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
||||
} else if (arg_next == "layer") {
|
||||
params.split_mode = LLAMA_SPLIT_LAYER;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (arg_next == "row") {
|
||||
params.split_mode = LLAMA_SPLIT_ROW;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
@@ -837,15 +843,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_INT;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
@@ -1004,6 +1010,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" -dt N, --defrag-thold N\n");
|
||||
printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
|
||||
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
|
||||
@@ -1285,6 +1293,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
||||
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
||||
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
||||
cparams.defrag_thold = params.defrag_thold;
|
||||
cparams.offload_kqv = !params.no_kv_offload;
|
||||
|
||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||
|
||||
+3
-2
@@ -61,7 +61,7 @@ struct gpt_params {
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
|
||||
llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
@@ -75,7 +75,8 @@ struct gpt_params {
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
// // sampling parameters
|
||||
|
||||
+1
-1
@@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl(
|
||||
// }
|
||||
//}
|
||||
|
||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+5
-5
@@ -31,7 +31,7 @@ struct train_state * init_train_state() {
|
||||
|
||||
state->opt = new struct ggml_opt_context;
|
||||
state->opt->ctx = NULL;
|
||||
state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
state->opt->loss_after = 0.0f;
|
||||
|
||||
@@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
|
||||
std::string opt_type;
|
||||
GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
|
||||
if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
|
||||
opt->params.type = GGML_OPT_ADAM;
|
||||
opt->params.type = GGML_OPT_TYPE_ADAM;
|
||||
|
||||
GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
|
||||
GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
|
||||
@@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
|
||||
copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
|
||||
copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
|
||||
} else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
|
||||
opt->params.type = GGML_OPT_LBFGS;
|
||||
opt->params.type = GGML_OPT_TYPE_LBFGS;
|
||||
|
||||
GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
|
||||
GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
|
||||
@@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
|
||||
gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
|
||||
|
||||
switch (opt->params.type) {
|
||||
case GGML_OPT_ADAM:
|
||||
case GGML_OPT_TYPE_ADAM:
|
||||
{
|
||||
gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
|
||||
gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best);
|
||||
@@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
|
||||
gguf_add_tensor(fctx, opt->adam.pf);
|
||||
}
|
||||
} break;
|
||||
case GGML_OPT_LBFGS:
|
||||
case GGML_OPT_TYPE_LBFGS:
|
||||
{
|
||||
gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
|
||||
gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
|
||||
|
||||
+68
-10
@@ -192,7 +192,7 @@ class Model:
|
||||
return RefactModel
|
||||
if model_architecture == "PersimmonForCausalLM":
|
||||
return PersimmonModel
|
||||
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||
if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||
return StableLMModel
|
||||
if model_architecture == "QWenLMHeadModel":
|
||||
return QwenModel
|
||||
@@ -218,6 +218,8 @@ class Model:
|
||||
return BertModel
|
||||
if model_architecture == "NomicBertModel":
|
||||
return NomicBertModel
|
||||
if model_architecture == "GemmaForCausalLM":
|
||||
return GemmaModel
|
||||
return Model
|
||||
|
||||
def _is_model_safetensors(self) -> bool:
|
||||
@@ -251,7 +253,7 @@ class Model:
|
||||
return gguf.MODEL_ARCH.REFACT
|
||||
if arch == "PersimmonForCausalLM":
|
||||
return gguf.MODEL_ARCH.PERSIMMON
|
||||
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||
if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
|
||||
return gguf.MODEL_ARCH.STABLELM
|
||||
if arch == "QWenLMHeadModel":
|
||||
return gguf.MODEL_ARCH.QWEN
|
||||
@@ -277,6 +279,8 @@ class Model:
|
||||
return gguf.MODEL_ARCH.BERT
|
||||
if arch == "NomicBertModel":
|
||||
return gguf.MODEL_ARCH.NOMIC_BERT
|
||||
if arch == "GemmaForCausalLM":
|
||||
return gguf.MODEL_ARCH.GEMMA
|
||||
|
||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||
|
||||
@@ -618,11 +622,6 @@ class MPTModel(Model):
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
# note: MPT output is tied to (same as) wte in original model;
|
||||
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
|
||||
if new_name == "token_embd.weight":
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
|
||||
class OrionModel(Model):
|
||||
def set_vocab(self):
|
||||
@@ -655,6 +654,8 @@ class OrionModel(Model):
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(head_count)
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
# note: config provides rms norm but it is actually layer norm
|
||||
# ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
|
||||
|
||||
def write_tensors(self):
|
||||
@@ -1031,7 +1032,6 @@ class PersimmonModel(Model):
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
@@ -1074,10 +1074,11 @@ class StableLMModel(Model):
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
||||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
|
||||
|
||||
|
||||
class MixtralModel(Model):
|
||||
@@ -1785,6 +1786,63 @@ class NomicBertModel(BertModel):
|
||||
yield name, data
|
||||
|
||||
|
||||
class GemmaModel(Model):
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_key_length(hparams["head_dim"])
|
||||
self.gguf_writer.add_value_length(hparams["head_dim"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
|
||||
for name, data_torch in self.get_tensors():
|
||||
# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
|
||||
if name.endswith("norm.weight"):
|
||||
data_torch = data_torch + 1
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
@@ -1547,7 +1547,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
float error_before_opt = ggml_get_f32_1d(e, 0);
|
||||
|
||||
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
|
||||
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
|
||||
opt_params_lbfgs.print_forward_graph = false;
|
||||
opt_params_lbfgs.print_backward_graph = false;
|
||||
opt_params_lbfgs.lbfgs.n_iter = 16;
|
||||
|
||||
@@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) {
|
||||
lora.hparams.n_rank_output = n_rank_output;
|
||||
|
||||
// set opt params from command line
|
||||
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
opt->params.print_forward_graph = false;
|
||||
opt->params.print_backward_graph = false;
|
||||
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
|
||||
@@ -447,8 +447,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
|
||||
@@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) {
|
||||
|
||||
static const char * split_mode_str(llama_split_mode mode) {
|
||||
switch (mode) {
|
||||
case LLAMA_SPLIT_NONE: return "none";
|
||||
case LLAMA_SPLIT_LAYER: return "layer";
|
||||
case LLAMA_SPLIT_ROW: return "row";
|
||||
case LLAMA_SPLIT_MODE_NONE: return "none";
|
||||
case LLAMA_SPLIT_MODE_LAYER: return "layer";
|
||||
case LLAMA_SPLIT_MODE_ROW: return "row";
|
||||
default: GGML_ASSERT(!"invalid split mode");
|
||||
}
|
||||
}
|
||||
@@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {get_num_physical_cores()},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* split_mode */ {LLAMA_SPLIT_LAYER},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* mul_mat_q */ {true},
|
||||
@@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
for (const auto & m : p) {
|
||||
llama_split_mode mode;
|
||||
if (m == "none") {
|
||||
mode = LLAMA_SPLIT_NONE;
|
||||
mode = LLAMA_SPLIT_MODE_NONE;
|
||||
} else if (m == "layer") {
|
||||
mode = LLAMA_SPLIT_LAYER;
|
||||
mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (m == "row") {
|
||||
mode = LLAMA_SPLIT_ROW;
|
||||
mode = LLAMA_SPLIT_MODE_ROW;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
|
||||
@@ -21,12 +21,8 @@ android {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Workaround for https://github.com/llvm/llvm-project/issues/65820
|
||||
// affecting armeabi-v7a. Skip armeabi-v7a when invoked with
|
||||
// -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
|
||||
if (project.hasProperty("skip-armeabi-v7a")) {
|
||||
abiFilters += listOf("arm64-v8a", "x86_64", "x86")
|
||||
}
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
|
||||
@@ -152,7 +152,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
|
||||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
||||
if (newline_tmp->backend != GGML_BACKEND_CPU) {
|
||||
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
|
||||
if (newline_tmp->buffer == NULL) {
|
||||
printf("newline_tmp tensor buffer is NULL\n");
|
||||
}
|
||||
|
||||
@@ -548,8 +548,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
@@ -576,9 +576,9 @@ int main(int argc, char ** argv) {
|
||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
|
||||
|
||||
n_past -= bd;
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
|
||||
const int n_batch = ctx_params.n_batch;
|
||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
@@ -146,10 +146,11 @@ int main(int argc, char ** argv) {
|
||||
const int ib = i/n_batch - 1;
|
||||
const int bd = n_batch_grp*(n_grp - 1);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
|
||||
llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= bd;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
@@ -179,10 +180,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
//llama_kv_cache_defrag (ctx);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= n_discard;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
@@ -208,10 +211,12 @@ int main(int argc, char ** argv) {
|
||||
if (n_discard > 0) {
|
||||
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
//llama_kv_cache_defrag (ctx);
|
||||
llama_kv_cache_update (ctx);
|
||||
|
||||
n_past -= n_discard;
|
||||
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,12 +23,16 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
|
||||
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
|
||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
|
||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
||||
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
|
||||
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
|
||||
@@ -290,6 +294,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
|
||||
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
|
||||
fprintf(stderr, "\n===============================================================================================\n");
|
||||
fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
|
||||
|
||||
@@ -1,8 +1,20 @@
|
||||
# llama.cpp/example/server
|
||||
# LLaMA.cpp HTTP Server
|
||||
|
||||
This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
|
||||
Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.
|
||||
|
||||
Command line options:
|
||||
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantum models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
* Multimodal (wip)
|
||||
* Monitoring endpoints
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||
@@ -39,9 +51,12 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
||||
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
|
||||
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
|
||||
- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
|
||||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
- `--log-disable`: Output logs to stdout only, default: enabled.
|
||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
|
||||
|
||||
## Build
|
||||
|
||||
@@ -98,6 +113,12 @@ curl --request POST \
|
||||
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
|
||||
```
|
||||
|
||||
## Advanced testing
|
||||
|
||||
We implemented a [server test framework](./tests/README.md) using human-readable scenario.
|
||||
|
||||
*Before submitting an issue, please try to reproduce it with this format.*
|
||||
|
||||
## Node JS Test
|
||||
|
||||
You need to have [Node.js](https://nodejs.org/en) installed.
|
||||
@@ -451,6 +472,18 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
]
|
||||
```
|
||||
|
||||
- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
|
||||
|
||||
Available metrics:
|
||||
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
|
||||
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
|
||||
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
|
||||
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
|
||||
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
|
||||
- `llamacpp:kv_cache_tokens`: KV-cache tokens.
|
||||
- `llamacpp:requests_processing`: Number of request processing.
|
||||
- `llamacpp:requests_deferred`: Number of request deferred.
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
||||
+384
-90
@@ -43,9 +43,11 @@ struct server_params
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
bool slots_endpoint = true;
|
||||
bool metrics_endpoint = false;
|
||||
};
|
||||
|
||||
bool server_verbose = false;
|
||||
bool server_log_json = true;
|
||||
|
||||
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
||||
{
|
||||
@@ -301,12 +303,76 @@ struct llama_client_slot
|
||||
}
|
||||
|
||||
void print_timings() const {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
|
||||
LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
|
||||
LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
|
||||
char buffer[512];
|
||||
double t_token = t_prompt_processing / num_prompt_tokens_processed;
|
||||
double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
|
||||
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
||||
t_prompt_processing, num_prompt_tokens_processed,
|
||||
t_token, n_tokens_second);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_prompt_processing", t_prompt_processing},
|
||||
{"num_prompt_tokens_processed", num_prompt_tokens_processed},
|
||||
{"t_token", t_token},
|
||||
{"n_tokens_second", n_tokens_second},
|
||||
});
|
||||
|
||||
t_token = t_token_generation / n_decoded;
|
||||
n_tokens_second = 1e3 / t_token_generation * n_decoded;
|
||||
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
||||
t_token_generation, n_decoded,
|
||||
t_token, n_tokens_second);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_token_generation", t_token_generation},
|
||||
{"n_decoded", n_decoded},
|
||||
{"t_token", t_token},
|
||||
{"n_tokens_second", n_tokens_second},
|
||||
});
|
||||
|
||||
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
||||
LOG_INFO(buffer, {
|
||||
{"slot_id", id},
|
||||
{"task_id", task_id},
|
||||
{"t_prompt_processing", t_prompt_processing},
|
||||
{"t_token_generation", t_token_generation},
|
||||
{"t_total", t_prompt_processing + t_token_generation},
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_metrics {
|
||||
uint64_t n_prompt_tokens_processed_total = 0;
|
||||
uint64_t n_tokens_predicted_total = 0;
|
||||
|
||||
uint64_t n_prompt_tokens_processed = 0;
|
||||
uint64_t t_prompt_processing = 0;
|
||||
|
||||
uint64_t n_tokens_predicted = 0;
|
||||
uint64_t t_tokens_generation = 0;
|
||||
|
||||
|
||||
void on_prompt_eval(const llama_client_slot &slot) {
|
||||
n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
|
||||
|
||||
n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
|
||||
t_prompt_processing += slot.t_prompt_processing;
|
||||
}
|
||||
|
||||
void on_prediction(const llama_client_slot &slot) {
|
||||
n_tokens_predicted_total += slot.n_decoded;
|
||||
|
||||
n_tokens_predicted += slot.n_decoded;
|
||||
t_tokens_generation += slot.t_token_generation;
|
||||
}
|
||||
|
||||
void reset_bucket() {
|
||||
n_prompt_tokens_processed = 0;
|
||||
t_prompt_processing = 0;
|
||||
n_tokens_predicted = 0;
|
||||
t_tokens_generation = 0;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -344,6 +410,8 @@ struct llama_server_context
|
||||
llama_server_queue queue_tasks;
|
||||
llama_server_response queue_results;
|
||||
|
||||
llama_metrics metrics;
|
||||
|
||||
~llama_server_context()
|
||||
{
|
||||
if (ctx)
|
||||
@@ -363,7 +431,7 @@ struct llama_server_context
|
||||
params = params_;
|
||||
if (!params.mmproj.empty()) {
|
||||
multimodal = true;
|
||||
LOG_TEE("Multi Modal Mode Enabled");
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
|
||||
@@ -416,7 +484,7 @@ struct llama_server_context
|
||||
|
||||
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
|
||||
|
||||
LOG_TEE("Available slots:\n");
|
||||
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
|
||||
for (int i = 0; i < params.n_parallel; i++)
|
||||
{
|
||||
llama_client_slot slot;
|
||||
@@ -425,7 +493,10 @@ struct llama_server_context
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
slot.n_predict = params.n_predict;
|
||||
|
||||
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
|
||||
LOG_INFO("new slot", {
|
||||
{"slot_id", slot.id},
|
||||
{"n_ctx_slot", slot.n_ctx}
|
||||
});
|
||||
|
||||
const int ga_n = params.grp_attn_n;
|
||||
const int ga_w = params.grp_attn_w;
|
||||
@@ -435,7 +506,12 @@ struct llama_server_context
|
||||
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
|
||||
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
|
||||
|
||||
LOG_INFO("slot self-extend", {
|
||||
{"slot_id", slot.id},
|
||||
{"ga_n", ga_n},
|
||||
{"ga_w", ga_w}
|
||||
});
|
||||
}
|
||||
|
||||
slot.ga_i = 0;
|
||||
@@ -729,10 +805,16 @@ struct llama_server_context
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
|
||||
LOG_ERROR("failed to load image", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - loaded image\n", slot->id);
|
||||
LOG_VERBOSE("image loaded", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
img_sl.request_encode_image = true;
|
||||
slot->images.push_back(img_sl);
|
||||
}
|
||||
@@ -792,7 +874,10 @@ struct llama_server_context
|
||||
|
||||
all_slots_are_idle = false;
|
||||
|
||||
LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
|
||||
LOG_INFO("slot is processing task", {
|
||||
{"slot_id", slot->id},
|
||||
{"task_id", slot->task_id},
|
||||
});
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -817,10 +902,24 @@ struct llama_server_context
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0)
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
if (llama_decode(ctx, batch_view) != 0)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// assign the system KV cache to all parallel sequences
|
||||
@@ -1237,6 +1336,10 @@ struct llama_server_context
|
||||
split_multiprompt_task(task_id, task);
|
||||
}
|
||||
} else {
|
||||
// an empty prompt can make slot become buggy
|
||||
if (task.data.contains("prompt") && task.data["prompt"].is_string() && task.data["prompt"].get<std::string>().empty()) {
|
||||
task.data["prompt"] = " "; // add a space so that we have one token
|
||||
}
|
||||
queue_tasks.post(task);
|
||||
}
|
||||
}
|
||||
@@ -1355,7 +1458,7 @@ struct llama_server_context
|
||||
if (slot == nullptr)
|
||||
{
|
||||
// if no slot is available, we defer this task for processing later
|
||||
LOG_VERBOSE("no slot is available", {});
|
||||
LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
|
||||
queue_tasks.defer(task);
|
||||
break;
|
||||
}
|
||||
@@ -1404,17 +1507,12 @@ struct llama_server_context
|
||||
case TASK_TYPE_NEXT_RESPONSE: {
|
||||
// do nothing
|
||||
} break;
|
||||
case TASK_TYPE_SLOTS_DATA: {
|
||||
case TASK_TYPE_METRICS: {
|
||||
json slots_data = json::array();
|
||||
int n_idle_slots = 0;
|
||||
int n_processing_slots = 0;
|
||||
|
||||
for (llama_client_slot &slot: slots) {
|
||||
if (slot.available()) {
|
||||
n_idle_slots++;
|
||||
} else {
|
||||
n_processing_slots++;
|
||||
}
|
||||
json slot_data = get_formated_generation(slot);
|
||||
slot_data["id"] = slot.id;
|
||||
slot_data["task_id"] = slot.task_id;
|
||||
@@ -1429,19 +1527,48 @@ struct llama_server_context
|
||||
{"stopped_limit", slot.stopped_limit},
|
||||
{"stopping_word", slot.stopping_word},
|
||||
};
|
||||
if (slot_data["state"] == IDLE) {
|
||||
n_idle_slots++;
|
||||
} else {
|
||||
n_processing_slots++;
|
||||
}
|
||||
slots_data.push_back(slot_data);
|
||||
}
|
||||
LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
|
||||
LOG_INFO("slot data", {
|
||||
{"task_id", task.id},
|
||||
{"n_idle_slots", n_idle_slots},
|
||||
{"n_processing_slots", n_processing_slots}
|
||||
});
|
||||
LOG_VERBOSE("slot data", {
|
||||
{"task_id", task.id},
|
||||
{"n_idle_slots", n_idle_slots},
|
||||
{"n_processing_slots", n_processing_slots},
|
||||
{"slots", slots_data}
|
||||
});
|
||||
task_result res;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
res.stop = true;
|
||||
res.error = false;
|
||||
res.result_json = {
|
||||
{ "idle", n_idle_slots },
|
||||
{ "processing", n_processing_slots },
|
||||
{ "slots", slots_data }
|
||||
{ "idle", n_idle_slots },
|
||||
{ "processing", n_processing_slots },
|
||||
{ "deferred", queue_tasks.queue_tasks_deferred.size() },
|
||||
|
||||
{ "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
|
||||
{ "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
|
||||
|
||||
{ "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
|
||||
{ "t_prompt_processing", metrics.t_prompt_processing},
|
||||
{ "n_tokens_predicted", metrics.n_tokens_predicted},
|
||||
{ "t_tokens_generation", metrics.t_tokens_generation},
|
||||
|
||||
{ "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
|
||||
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
|
||||
|
||||
{ "slots", slots_data },
|
||||
};
|
||||
metrics.reset_bucket();
|
||||
queue_results.send(res);
|
||||
} break;
|
||||
}
|
||||
@@ -1469,7 +1596,7 @@ struct llama_server_context
|
||||
bool update_slots() {
|
||||
if (system_need_update)
|
||||
{
|
||||
LOG_TEE("updating system prompt\n");
|
||||
LOG_INFO("updating system prompt", {});
|
||||
update_system_prompt();
|
||||
}
|
||||
|
||||
@@ -1479,12 +1606,13 @@ struct llama_server_context
|
||||
{
|
||||
if (system_prompt.empty() && clean_kv_cache)
|
||||
{
|
||||
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
|
||||
LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
|
||||
kv_cache_clear();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
LOG_VERBOSE("posting NEXT_RESPONSE", {});
|
||||
task_server task;
|
||||
task.type = TASK_TYPE_NEXT_RESPONSE;
|
||||
task.target_id = -1;
|
||||
@@ -1498,12 +1626,22 @@ struct llama_server_context
|
||||
{
|
||||
// Shift context
|
||||
const int n_keep = slot.params.n_keep + add_bos_token;
|
||||
const int n_left = system_tokens.size() + slot.n_past - n_keep;
|
||||
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
|
||||
const int n_discard = n_left / 2;
|
||||
|
||||
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
LOG_INFO("slot context shift", {
|
||||
{"slot_id", slot.id},
|
||||
{"task_id", slot.task_id},
|
||||
{"n_keep", n_keep},
|
||||
{"n_left", n_left},
|
||||
{"n_discard", n_discard},
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_past", slot.n_past},
|
||||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()}
|
||||
});
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
@@ -1515,17 +1653,12 @@ struct llama_server_context
|
||||
slot.n_past -= n_discard;
|
||||
|
||||
slot.truncated = true;
|
||||
|
||||
LOG_VERBOSE("context shift", {
|
||||
{ "n_ctx", n_ctx },
|
||||
{ "n_keep", n_keep },
|
||||
{ "n_left", n_left },
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decode any currently ongoing sequences
|
||||
LOG_VERBOSE("decoding ongoing sequences", {});
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
// release the slot
|
||||
@@ -1535,7 +1668,15 @@ struct llama_server_context
|
||||
slot.command = NONE;
|
||||
slot.t_last_used = ggml_time_us();
|
||||
|
||||
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
LOG_INFO("slot released", {
|
||||
{"slot_id", slot.id},
|
||||
{"task_id", slot.task_id},
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_past", slot.n_past},
|
||||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()},
|
||||
{"truncated", slot.truncated}
|
||||
});
|
||||
queue_tasks.notify_slot_changed();
|
||||
|
||||
continue;
|
||||
@@ -1662,6 +1803,14 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
|
||||
// the last token of the cache is not in the KV cache until the next call to llama_decode
|
||||
// (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
|
||||
if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
|
||||
{
|
||||
slot.n_past -= 1;
|
||||
}
|
||||
|
||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
||||
|
||||
if (slot.ga_n != 1)
|
||||
@@ -1683,7 +1832,12 @@ struct llama_server_context
|
||||
slot.ga_i = ga_i;
|
||||
}
|
||||
|
||||
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
||||
LOG_INFO("slot progression", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id },
|
||||
{ "n_past", slot.n_past },
|
||||
{ "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
|
||||
});
|
||||
}
|
||||
|
||||
slot.cache_tokens = prompt_tokens;
|
||||
@@ -1691,7 +1845,10 @@ struct llama_server_context
|
||||
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
||||
{
|
||||
// we have to evaluate at least 1 token to generate logits.
|
||||
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
|
||||
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id }
|
||||
});
|
||||
slot.n_past--;
|
||||
if (slot.ga_i > 0)
|
||||
{
|
||||
@@ -1699,9 +1856,13 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
|
||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
||||
LOG_INFO("kv cache rm [p0, end)", {
|
||||
{ "slot_id", slot.id },
|
||||
{ "task_id", slot.task_id },
|
||||
{ "p0", p0 }
|
||||
});
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", slot.n_past},
|
||||
@@ -1736,7 +1897,13 @@ struct llama_server_context
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
{
|
||||
LOG_TEE("failed processing images\n");
|
||||
LOG_ERROR("failed processing images", {
|
||||
"slot_id", slot.id,
|
||||
"task_id", slot.task_id,
|
||||
});
|
||||
// FIXME @phymbert: to be properly tested
|
||||
// early returning without changing the slot state will block the slot for ever
|
||||
// no one at the moment is checking the return value
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1778,9 +1945,9 @@ struct llama_server_context
|
||||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
|
||||
|
||||
slot.n_past_se -= bd;
|
||||
|
||||
@@ -1836,7 +2003,7 @@ struct llama_server_context
|
||||
send_embedding(slot);
|
||||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
return true;
|
||||
continue;
|
||||
}
|
||||
|
||||
completion_token_output result;
|
||||
@@ -1849,6 +2016,7 @@ struct llama_server_context
|
||||
{
|
||||
slot.t_start_genereration = ggml_time_us();
|
||||
slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
|
||||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
@@ -1871,11 +2039,14 @@ struct llama_server_context
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
}
|
||||
|
||||
slot.i_batch = -1;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_VERBOSE("slots updated", {});
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1948,9 +2119,15 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" -ctk TYPE, --cache-type-k TYPE\n");
|
||||
printf(" KV cache data type for K (default: f16)\n");
|
||||
printf(" -ctv TYPE, --cache-type-v TYPE\n");
|
||||
printf(" KV cache data type for V (default: f16)\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-format log output format: json or text (default: json)\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
||||
printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
|
||||
printf("\n");
|
||||
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
@@ -2082,9 +2259,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
else if (arg == "--rope-freq-base")
|
||||
@@ -2208,15 +2385,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
std::string arg_next = argv[i];
|
||||
if (arg_next == "none")
|
||||
{
|
||||
params.split_mode = LLAMA_SPLIT_NONE;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
||||
}
|
||||
else if (arg_next == "layer")
|
||||
{
|
||||
params.split_mode = LLAMA_SPLIT_LAYER;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
}
|
||||
else if (arg_next == "row")
|
||||
{
|
||||
params.split_mode = LLAMA_SPLIT_ROW;
|
||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||
}
|
||||
else {
|
||||
invalid_param = true;
|
||||
@@ -2386,6 +2563,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
);
|
||||
llama.process_system_prompt_data(json::parse(systm_content));
|
||||
}
|
||||
else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||
params.cache_type_k = argv[++i];
|
||||
}
|
||||
else if (arg == "-ctv" || arg == "--cache-type-v") {
|
||||
params.cache_type_v = argv[++i];
|
||||
}
|
||||
else if(arg == "--mmproj")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -2395,6 +2578,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.mmproj = argv[i];
|
||||
}
|
||||
else if (arg == "--log-format")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (std::strcmp(argv[i], "json") == 0)
|
||||
{
|
||||
server_log_json = true;
|
||||
}
|
||||
else if (std::strcmp(argv[i], "text") == 0)
|
||||
{
|
||||
server_log_json = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (arg == "--log-disable")
|
||||
{
|
||||
log_set_target(stdout);
|
||||
@@ -2404,6 +2608,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
{
|
||||
sparams.slots_endpoint = false;
|
||||
}
|
||||
else if (arg == "--metrics")
|
||||
{
|
||||
sparams.metrics_endpoint = true;
|
||||
}
|
||||
else if (arg == "--chat-template")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -2437,15 +2645,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_INT;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
@@ -2504,32 +2712,40 @@ static json format_partial_response(
|
||||
|
||||
static json format_tokenizer_response(const std::vector<llama_token> &tokens)
|
||||
{
|
||||
return json{
|
||||
{"tokens", tokens}};
|
||||
return json {
|
||||
{"tokens", tokens}
|
||||
};
|
||||
}
|
||||
|
||||
static json format_detokenized_response(std::string content)
|
||||
{
|
||||
return json{
|
||||
{"content", content}};
|
||||
return json {
|
||||
{"content", content}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
static void log_server_request(const httplib::Request &req, const httplib::Response &res)
|
||||
{
|
||||
// skip GH copilot requests when using default port
|
||||
if (req.path == "/v1/health" || req.path == "/v1/completions")
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
LOG_INFO("request", {
|
||||
{"remote_addr", req.remote_addr},
|
||||
{"remote_port", req.remote_port},
|
||||
{"status", res.status},
|
||||
{"method", req.method},
|
||||
{"path", req.path},
|
||||
{"params", req.params},
|
||||
});
|
||||
{"remote_addr", req.remote_addr},
|
||||
{"remote_port", req.remote_port},
|
||||
{"status", res.status},
|
||||
{"method", req.method},
|
||||
{"path", req.path},
|
||||
{"params", req.params},
|
||||
});
|
||||
|
||||
LOG_VERBOSE("request", {
|
||||
{"request", req.body},
|
||||
{"response", res.body},
|
||||
});
|
||||
{"request", req.body},
|
||||
{"response", res.body},
|
||||
});
|
||||
}
|
||||
|
||||
struct token_translator
|
||||
@@ -2611,7 +2827,7 @@ int main(int argc, char **argv)
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_SLOTS_DATA;
|
||||
task.type = TASK_TYPE_METRICS;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
@@ -2658,7 +2874,7 @@ int main(int argc, char **argv)
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_SLOTS_DATA;
|
||||
task.type = TASK_TYPE_METRICS;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
@@ -2673,6 +2889,87 @@ int main(int argc, char **argv)
|
||||
});
|
||||
}
|
||||
|
||||
if (sparams.metrics_endpoint) {
|
||||
svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_METRICS;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
llama.queue_tasks.post(task);
|
||||
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task.id);
|
||||
llama.queue_results.remove_waiting_task_id(task.id);
|
||||
|
||||
json data = result.result_json;
|
||||
|
||||
uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
|
||||
uint64_t t_prompt_processing = data["t_prompt_processing"];
|
||||
|
||||
uint64_t n_tokens_predicted = data["n_tokens_predicted"];
|
||||
uint64_t t_tokens_generation = data["t_tokens_generation"];
|
||||
|
||||
int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
|
||||
|
||||
// metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
|
||||
json all_metrics_def = json {
|
||||
{"counter", {{
|
||||
{"name", "prompt_tokens_total"},
|
||||
{"help", "Number of prompt tokens processed."},
|
||||
{"value", data["n_prompt_tokens_processed_total"]}
|
||||
}, {
|
||||
{"name", "tokens_predicted_total"},
|
||||
{"help", "Number of generation tokens processed."},
|
||||
{"value", data["n_tokens_predicted_total"]}
|
||||
}}},
|
||||
{"gauge", {{
|
||||
{"name", "prompt_tokens_seconds"},
|
||||
{"help", "Average prompt throughput in tokens/s."},
|
||||
{"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
|
||||
},{
|
||||
{"name", "predicted_tokens_seconds"},
|
||||
{"help", "Average generation throughput in tokens/s."},
|
||||
{"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
|
||||
},{
|
||||
{"name", "kv_cache_usage_ratio"},
|
||||
{"help", "KV-cache usage. 1 means 100 percent usage."},
|
||||
{"value", 1. * kv_cache_used_cells / params.n_ctx}
|
||||
},{
|
||||
{"name", "kv_cache_tokens"},
|
||||
{"help", "KV-cache tokens."},
|
||||
{"value", data["kv_cache_tokens_count"]}
|
||||
},{
|
||||
{"name", "requests_processing"},
|
||||
{"help", "Number of request processing."},
|
||||
{"value", data["processing"]}
|
||||
},{
|
||||
{"name", "requests_deferred"},
|
||||
{"help", "Number of request deferred."},
|
||||
{"value", data["deferred"]}
|
||||
}}}
|
||||
};
|
||||
|
||||
std::stringstream prometheus;
|
||||
for (const auto& el : all_metrics_def.items()) {
|
||||
const auto& type = el.key();
|
||||
const auto& metrics_def = el.value();
|
||||
for (const auto& metric_def : metrics_def) {
|
||||
std::string name = metric_def["name"];
|
||||
std::string help = metric_def["help"];
|
||||
prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
|
||||
<< "# TYPE llamacpp:" << name << " " << type << "\n"
|
||||
<< "llamacpp:" << name << " " << metric_def["value"] << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
res.set_content(prometheus.str(), "text/plain; version=0.0.4");
|
||||
res.status = 200; // HTTP OK
|
||||
});
|
||||
}
|
||||
|
||||
svr.set_logger(log_server_request);
|
||||
|
||||
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
|
||||
@@ -2725,9 +3022,6 @@ int main(int argc, char **argv)
|
||||
// Set the base directory for serving static files
|
||||
svr.set_base_dir(sparams.public_path);
|
||||
|
||||
// to make it ctrl+clickable:
|
||||
LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
|
||||
std::unordered_map<std::string, std::string> log_data;
|
||||
log_data["hostname"] = sparams.hostname;
|
||||
log_data["port"] = std::to_string(sparams.port);
|
||||
@@ -2738,19 +3032,6 @@ int main(int argc, char **argv)
|
||||
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
|
||||
}
|
||||
|
||||
LOG_INFO("HTTP server listening", log_data);
|
||||
// run the HTTP server in a thread - see comment below
|
||||
std::thread t([&]()
|
||||
{
|
||||
if (!svr.listen_after_bind())
|
||||
{
|
||||
state.store(SERVER_STATE_ERROR);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
});
|
||||
|
||||
// load the model
|
||||
if (!llama.load_model(params))
|
||||
{
|
||||
@@ -3218,6 +3499,19 @@ int main(int argc, char **argv)
|
||||
}*/
|
||||
//);
|
||||
|
||||
LOG_INFO("HTTP server listening", log_data);
|
||||
// run the HTTP server in a thread - see comment below
|
||||
std::thread t([&]()
|
||||
{
|
||||
if (!svr.listen_after_bind())
|
||||
{
|
||||
state.store(SERVER_STATE_ERROR);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
});
|
||||
|
||||
llama.queue_tasks.on_new_task(std::bind(
|
||||
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
|
||||
llama.queue_tasks.on_finish_multitask(std::bind(
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
# Server tests
|
||||
|
||||
Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/):
|
||||
* [issues.feature](./features/issues.feature) Pending issues scenario
|
||||
* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
|
||||
* [security.feature](./features/security.feature) Security, CORS and API Key
|
||||
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
|
||||
|
||||
Tests target GitHub workflows job runners with 4 vCPU.
|
||||
|
||||
Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client.
|
||||
|
||||
Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`.
|
||||
|
||||
### Install dependencies
|
||||
`pip install -r requirements.txt`
|
||||
|
||||
### Run tests
|
||||
1. Build the server
|
||||
```shell
|
||||
cd ../../..
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ../
|
||||
cmake --build . --target server
|
||||
```
|
||||
2. download required models:
|
||||
1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
|
||||
3. Start the test: `./tests.sh`
|
||||
|
||||
It's possible to override some scenario steps values with environment variables:
|
||||
- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
|
||||
- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
|
||||
- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
|
||||
- `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format
|
||||
|
||||
### Run @bug, @wip or @wrong_usage annotated scenario
|
||||
|
||||
Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
|
||||
- `@bug` annotation aims to link a scenario with a GitHub issue.
|
||||
- `@wrong_usage` are meant to show user issue that are actually an expected behavior
|
||||
- `@wip` to focus on a scenario working in progress
|
||||
|
||||
To run a scenario annotated with `@bug`, start:
|
||||
`DEBUG=ON ./tests.sh --no-skipped --tags bug`
|
||||
|
||||
After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
|
||||
@@ -0,0 +1,69 @@
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
from contextlib import closing
|
||||
from signal import SIGKILL
|
||||
|
||||
|
||||
def before_scenario(context, scenario):
|
||||
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
|
||||
port = 8080
|
||||
if 'PORT' in os.environ:
|
||||
port = int(os.environ['PORT'])
|
||||
if is_server_listening("localhost", port):
|
||||
assert False, "Server already started"
|
||||
|
||||
|
||||
def after_scenario(context, scenario):
|
||||
if context.server_process is None:
|
||||
return
|
||||
if scenario.status == "failed":
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
|
||||
if os.path.isfile('llama.log'):
|
||||
with closing(open('llama.log', 'r')) as f:
|
||||
for line in f:
|
||||
print(line)
|
||||
if not is_server_listening(context.server_fqdn, context.server_port):
|
||||
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
|
||||
|
||||
if not pid_exists(context.server_process.pid):
|
||||
assert False, f"Server not running pid={context.server_process.pid} ..."
|
||||
|
||||
print(f"stopping server pid={context.server_process.pid} ...")
|
||||
context.server_process.kill()
|
||||
# Wait few for socket to free up
|
||||
time.sleep(0.05)
|
||||
|
||||
attempts = 0
|
||||
while is_server_listening(context.server_fqdn, context.server_port):
|
||||
print(f"stopping server pid={context.server_process.pid} ...")
|
||||
os.kill(context.server_process.pid, SIGKILL)
|
||||
time.sleep(0.1)
|
||||
attempts += 1
|
||||
if attempts > 5:
|
||||
print(f"Server dangling exits, killing all {context.server_path} ...")
|
||||
process = subprocess.run(['killall', '-9', context.server_path],
|
||||
stderr=subprocess.PIPE,
|
||||
universal_newlines=True)
|
||||
print(process)
|
||||
|
||||
|
||||
def is_server_listening(server_fqdn, server_port):
|
||||
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
|
||||
result = sock.connect_ex((server_fqdn, server_port))
|
||||
return result == 0
|
||||
|
||||
|
||||
def pid_exists(pid):
|
||||
"""Check whether pid exists in the current process table."""
|
||||
import errno
|
||||
if pid < 0:
|
||||
return False
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except OSError as e:
|
||||
return e.errno == errno.EPERM
|
||||
else:
|
||||
return True
|
||||
@@ -0,0 +1,4 @@
|
||||
# List of ongoing issues
|
||||
@bug
|
||||
Feature: Issues
|
||||
# No confirmed issue at the moment
|
||||
@@ -0,0 +1,123 @@
|
||||
@llama.cpp
|
||||
Feature: Parallel
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file stories260K.gguf
|
||||
And a model alias tinyllama-2
|
||||
And 42 as server seed
|
||||
And 64 KV cache size
|
||||
And 2 slots
|
||||
And embeddings extraction
|
||||
And continuous batching
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario Outline: Multi users completion
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another very long music lyrics.
|
||||
"""
|
||||
And <n_predict> max tokens to predict
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
Then all prompts are predicted with <n_predict> tokens
|
||||
Examples:
|
||||
| n_predict |
|
||||
| 128 |
|
||||
|
||||
Scenario Outline: Multi users OAI completions compatibility
|
||||
Given a system prompt You are a writer.
|
||||
And a model tinyllama-2
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long book.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another a poem.
|
||||
"""
|
||||
And <n_predict> max tokens to predict
|
||||
And streaming is <streaming>
|
||||
Given concurrent OAI completions requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted with <n_predict> tokens
|
||||
Examples:
|
||||
| streaming | n_predict |
|
||||
| disabled | 128 |
|
||||
| enabled | 64 |
|
||||
|
||||
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another very long music lyrics.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long joke.
|
||||
"""
|
||||
And 128 max tokens to predict
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted
|
||||
|
||||
Scenario: Multi users embeddings
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long story about AI.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another very long music lyrics.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write a very long joke.
|
||||
"""
|
||||
Given concurrent embedding requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all embeddings are generated
|
||||
|
||||
Scenario: Multi users OAI compatibility embeddings
|
||||
Given a prompt:
|
||||
"""
|
||||
In which country Paris is located ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Is Madrid the capital of Spain ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is the biggest US city ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is the capital of Bulgaria ?
|
||||
"""
|
||||
And a model tinyllama-2
|
||||
Given concurrent OAI embedding requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all embeddings are generated
|
||||
@@ -0,0 +1,50 @@
|
||||
@llama.cpp
|
||||
Feature: Security
|
||||
|
||||
Background: Server startup with an api key defined
|
||||
Given a server listening on localhost:8080
|
||||
And a model file stories260K.gguf
|
||||
And a server api key llama.cpp
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario Outline: Completion with some user api key
|
||||
Given a prompt test
|
||||
And a user api key <api_key>
|
||||
And 4 max tokens to predict
|
||||
And a completion request with <api_error> api error
|
||||
|
||||
Examples: Prompts
|
||||
| api_key | api_error |
|
||||
| llama.cpp | no |
|
||||
| llama.cpp | no |
|
||||
| hackeme | raised |
|
||||
| | raised |
|
||||
|
||||
Scenario Outline: OAI Compatibility
|
||||
Given a system prompt test
|
||||
And a user prompt test
|
||||
And a model test
|
||||
And 2 max tokens to predict
|
||||
And streaming is disabled
|
||||
And a user api key <api_key>
|
||||
Given an OAI compatible chat completions request with <api_error> api error
|
||||
|
||||
Examples: Prompts
|
||||
| api_key | api_error |
|
||||
| llama.cpp | no |
|
||||
| llama.cpp | no |
|
||||
| hackme | raised |
|
||||
|
||||
|
||||
Scenario Outline: CORS Options
|
||||
When an OPTIONS request is sent from <origin>
|
||||
Then CORS header <cors_header> is set to <cors_header_value>
|
||||
|
||||
Examples: Headers
|
||||
| origin | cors_header | cors_header_value |
|
||||
| localhost | Access-Control-Allow-Origin | localhost |
|
||||
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
|
||||
| origin | Access-Control-Allow-Credentials | true |
|
||||
| web.mydomain.fr | Access-Control-Allow-Methods | POST |
|
||||
| web.mydomain.fr | Access-Control-Allow-Headers | * |
|
||||
@@ -0,0 +1,84 @@
|
||||
@llama.cpp
|
||||
Feature: llama.cpp server
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file stories260K.gguf
|
||||
And a model alias tinyllama-2
|
||||
And 42 as server seed
|
||||
# KV Cache corresponds to the total amount of tokens
|
||||
# that can be stored across all independent sequences: #4130
|
||||
# see --ctx-size and #5568
|
||||
And 32 KV cache size
|
||||
And 1 slots
|
||||
And embeddings extraction
|
||||
And 32 server max tokens to predict
|
||||
And prometheus compatible metrics exposed
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
|
||||
Scenario: Health
|
||||
Then the server is ready
|
||||
And all slots are idle
|
||||
|
||||
Scenario Outline: Completion
|
||||
Given a prompt <prompt>
|
||||
And <n_predict> max tokens to predict
|
||||
And a completion request with no api error
|
||||
Then <n_predicted> tokens are predicted matching <re_content>
|
||||
And prometheus metrics are exposed
|
||||
|
||||
Examples: Prompts
|
||||
| prompt | n_predict | re_content | n_predicted |
|
||||
| I believe the meaning of life is | 8 | (read<or>going)+ | 8 |
|
||||
| Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 |
|
||||
|
||||
Scenario Outline: OAI Compatibility
|
||||
Given a model <model>
|
||||
And a system prompt <system_prompt>
|
||||
And a user prompt <user_prompt>
|
||||
And <max_tokens> max tokens to predict
|
||||
And streaming is <enable_streaming>
|
||||
Given an OAI compatible chat completions request with no api error
|
||||
Then <n_predicted> tokens are predicted matching <re_content>
|
||||
|
||||
Examples: Prompts
|
||||
| model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
|
||||
| llama-2 | Book | What is the best book | 8 | (Mom<or>what)+ | 8 | disabled |
|
||||
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks<or>happy<or>bird)+ | 32 | enabled |
|
||||
|
||||
Scenario: Embedding
|
||||
When embeddings are computed for:
|
||||
"""
|
||||
What is the capital of Bulgaria ?
|
||||
"""
|
||||
Then embeddings are generated
|
||||
|
||||
Scenario: OAI Embeddings compatibility
|
||||
Given a model tinyllama-2
|
||||
When an OAI compatible embeddings computation request for:
|
||||
"""
|
||||
What is the capital of Spain ?
|
||||
"""
|
||||
Then embeddings are generated
|
||||
|
||||
Scenario: OAI Embeddings compatibility with multiple inputs
|
||||
Given a model tinyllama-2
|
||||
Given a prompt:
|
||||
"""
|
||||
In which country Paris is located ?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Is Madrid the capital of Spain ?
|
||||
"""
|
||||
When an OAI compatible embeddings computation request for multiple inputs
|
||||
Then embeddings are generated
|
||||
|
||||
|
||||
Scenario: Tokenize / Detokenize
|
||||
When tokenizing:
|
||||
"""
|
||||
What is the capital of France ?
|
||||
"""
|
||||
Then tokens can be detokenize
|
||||
@@ -0,0 +1,803 @@
|
||||
import asyncio
|
||||
import collections
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
from contextlib import closing
|
||||
from re import RegexFlag
|
||||
|
||||
import aiohttp
|
||||
import openai
|
||||
from behave import step
|
||||
from behave.api.async_step import async_run_until_complete
|
||||
from prometheus_client import parser
|
||||
|
||||
|
||||
@step(u"a server listening on {server_fqdn}:{server_port}")
|
||||
def step_server_config(context, server_fqdn, server_port):
|
||||
context.server_fqdn = server_fqdn
|
||||
context.server_port = int(server_port)
|
||||
if 'PORT' in os.environ:
|
||||
context.server_port = int(os.environ['PORT'])
|
||||
print(f"$PORT set, overriding server port with to {context.server_port}")
|
||||
|
||||
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
|
||||
|
||||
context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
|
||||
context.model_alias = None
|
||||
context.n_ctx = None
|
||||
context.n_predict = None
|
||||
context.n_server_predict = None
|
||||
context.n_slots = None
|
||||
context.server_api_key = None
|
||||
context.server_continuous_batching = False
|
||||
context.server_embeddings = False
|
||||
context.server_metrics = False
|
||||
context.server_process = None
|
||||
context.server_seed = None
|
||||
context.user_api_key = None
|
||||
|
||||
context.tasks_result = []
|
||||
context.concurrent_tasks = []
|
||||
context.prompts = []
|
||||
|
||||
|
||||
@step(u'a model file {model_file}')
|
||||
def step_model_file(context, model_file):
|
||||
context.model_file = model_file
|
||||
|
||||
|
||||
@step(u'a model alias {model_alias}')
|
||||
def step_model_alias(context, model_alias):
|
||||
context.model_alias = model_alias
|
||||
|
||||
|
||||
@step(u'{seed} as server seed')
|
||||
def step_seed(context, seed):
|
||||
context.server_seed = int(seed)
|
||||
|
||||
|
||||
@step(u'{n_ctx} KV cache size')
|
||||
def step_n_ctx(context, n_ctx):
|
||||
context.n_ctx = int(n_ctx)
|
||||
|
||||
|
||||
@step(u'{n_slots} slots')
|
||||
def step_n_slots(context, n_slots):
|
||||
context.n_slots = int(n_slots)
|
||||
|
||||
|
||||
@step(u'{n_predict} server max tokens to predict')
|
||||
def step_server_n_predict(context, n_predict):
|
||||
context.n_server_predict = int(n_predict)
|
||||
|
||||
|
||||
@step(u'continuous batching')
|
||||
def step_server_continuous_batching(context):
|
||||
context.server_continuous_batching = True
|
||||
|
||||
|
||||
@step(u'embeddings extraction')
|
||||
def step_server_embeddings(context):
|
||||
context.server_embeddings = True
|
||||
|
||||
|
||||
@step(u'prometheus compatible metrics exposed')
|
||||
def step_server_metrics(context):
|
||||
context.server_metrics = True
|
||||
|
||||
|
||||
@step(u"the server is starting")
|
||||
def step_start_server(context):
|
||||
start_server_background(context)
|
||||
attempts = 0
|
||||
while True:
|
||||
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
|
||||
result = sock.connect_ex((context.server_fqdn, context.server_port))
|
||||
if result == 0:
|
||||
print("\x1b[33;46mserver started!\x1b[0m")
|
||||
return
|
||||
attempts += 1
|
||||
if attempts > 20:
|
||||
assert False, "server not started"
|
||||
print(f"waiting for server to start, connect error code = {result}...")
|
||||
time.sleep(0.1)
|
||||
|
||||
|
||||
@step(u"the server is {expecting_status}")
|
||||
@async_run_until_complete
|
||||
async def step_wait_for_the_server_to_be_started(context, expecting_status):
|
||||
match expecting_status:
|
||||
case 'healthy':
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok')
|
||||
|
||||
case 'ready' | 'idle':
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
||||
params={'fail_on_no_slot': 0, 'include_slots': 0},
|
||||
slots_idle=context.n_slots,
|
||||
slots_processing=0,
|
||||
expected_slots=[{'id': slot_id, 'state': 0}
|
||||
for slot_id in range(context.n_slots)])
|
||||
case 'busy':
|
||||
await wait_for_health_status(context, context.base_url, 503,
|
||||
'no slot available',
|
||||
params={'fail_on_no_slot': 0, 'include_slots': 0},
|
||||
slots_idle=0,
|
||||
slots_processing=context.n_slots,
|
||||
expected_slots=[{'id': slot_id, 'state': 1}
|
||||
for slot_id in range(context.n_slots)])
|
||||
case _:
|
||||
assert False, "unknown status"
|
||||
|
||||
|
||||
@step(u'all slots are {expected_slot_status_string}')
|
||||
@async_run_until_complete
|
||||
async def step_all_slots_status(context, expected_slot_status_string):
|
||||
match expected_slot_status_string:
|
||||
case 'idle':
|
||||
expected_slot_status = 0
|
||||
case 'busy':
|
||||
expected_slot_status = 1
|
||||
case _:
|
||||
assert False, "unknown status"
|
||||
|
||||
expected_slots = [{'id': slot_id, 'state': expected_slot_status}
|
||||
for slot_id in range(context.n_slots)]
|
||||
await request_slots_status(context, expected_slots)
|
||||
|
||||
|
||||
@step(u'a completion request with {api_error} api error')
|
||||
@async_run_until_complete
|
||||
async def step_request_completion(context, api_error):
|
||||
expect_api_error = api_error == 'raised'
|
||||
completion = await request_completion(context.prompts.pop(),
|
||||
context.base_url,
|
||||
debug=context.debug,
|
||||
n_predict=context.n_predict,
|
||||
server_seed=context.server_seed,
|
||||
expect_api_error=expect_api_error,
|
||||
user_api_key=context.user_api_key)
|
||||
context.tasks_result.append(completion)
|
||||
if context.debug:
|
||||
print(f"Completion response: {completion}")
|
||||
if expect_api_error:
|
||||
assert completion == 401, f"completion must be an 401 status code: {completion}"
|
||||
|
||||
|
||||
@step(u'{predicted_n} tokens are predicted matching {re_content}')
|
||||
def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
|
||||
assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content)
|
||||
|
||||
|
||||
@step(u'{predicted_n} tokens are predicted')
|
||||
def step_n_tokens_predicted(context, predicted_n):
|
||||
assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n))
|
||||
|
||||
|
||||
@step(u'a user prompt {user_prompt}')
|
||||
def step_user_prompt(context, user_prompt):
|
||||
context.prompts.append(user_prompt)
|
||||
|
||||
|
||||
@step(u'a system prompt {system_prompt}')
|
||||
def step_system_prompt(context, system_prompt):
|
||||
context.system_prompt = system_prompt
|
||||
|
||||
|
||||
@step(u'a model {model}')
|
||||
def step_model(context, model):
|
||||
context.model = model
|
||||
|
||||
|
||||
@step(u'{max_tokens} max tokens to predict')
|
||||
def step_max_tokens(context, max_tokens):
|
||||
context.n_predict = int(max_tokens)
|
||||
|
||||
|
||||
@step(u'streaming is {enable_streaming}')
|
||||
def step_streaming(context, enable_streaming):
|
||||
context.enable_streaming = enable_streaming == 'enabled'
|
||||
|
||||
|
||||
@step(u'a user api key {user_api_key}')
|
||||
def step_user_api_key(context, user_api_key):
|
||||
context.user_api_key = user_api_key
|
||||
|
||||
|
||||
@step(u'no user api key')
|
||||
def step_no_user_api_key(context):
|
||||
context.user_api_key = None
|
||||
|
||||
|
||||
@step(u'a user api key ')
|
||||
def step_no_user_api_key_space(context):
|
||||
context.user_api_key = None
|
||||
|
||||
|
||||
@step(u'a server api key {server_api_key}')
|
||||
def step_server_api_key(context, server_api_key):
|
||||
context.server_api_key = server_api_key
|
||||
|
||||
|
||||
@step(u'an OAI compatible chat completions request with {api_error} api error')
|
||||
@async_run_until_complete
|
||||
async def step_oai_chat_completions(context, api_error):
|
||||
if context.debug:
|
||||
print(f"Submitting OAI compatible completions request...")
|
||||
expect_api_error = api_error == 'raised'
|
||||
completion = await oai_chat_completions(context.prompts.pop(),
|
||||
context.system_prompt,
|
||||
context.base_url,
|
||||
False,
|
||||
model=context.model if hasattr(context, 'model') else None,
|
||||
|
||||
n_predict=context.n_predict
|
||||
if hasattr(context, 'n_predict') else None,
|
||||
|
||||
enable_streaming=context.enable_streaming
|
||||
if hasattr(context, 'enable_streaming') else None,
|
||||
|
||||
server_seed=context.server_seed
|
||||
if hasattr(context, 'server_seed') else None,
|
||||
|
||||
user_api_key=context.user_api_key
|
||||
if hasattr(context, 'user_api_key') else None,
|
||||
|
||||
expect_api_error=expect_api_error)
|
||||
context.tasks_result.append(completion)
|
||||
if context.debug:
|
||||
print(f"Completion response: {completion}")
|
||||
if expect_api_error:
|
||||
assert completion == 401, f"completion must be an 401 status code: {completion}"
|
||||
|
||||
if context.debug:
|
||||
print(f"Completion response: {completion}")
|
||||
|
||||
|
||||
@step(u'a prompt')
|
||||
def step_a_prompt(context):
|
||||
context.prompts.append(context.text)
|
||||
|
||||
|
||||
@step(u'a prompt {prompt}')
|
||||
def step_a_prompt_prompt(context, prompt):
|
||||
context.prompts.append(prompt)
|
||||
|
||||
|
||||
@step(u'concurrent completion requests')
|
||||
@async_run_until_complete()
|
||||
async def step_concurrent_completion_requests(context):
|
||||
await concurrent_requests(context,
|
||||
request_completion,
|
||||
# prompt is inserted automatically
|
||||
context.base_url,
|
||||
debug=context.debug,
|
||||
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
|
||||
server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
|
||||
user_api_key=context.user_api_key if hasattr(context,
|
||||
'user_api_key') else None)
|
||||
|
||||
|
||||
@step(u'concurrent OAI completions requests')
|
||||
@async_run_until_complete
|
||||
async def step_oai_chat_completions(context):
|
||||
await concurrent_requests(context, oai_chat_completions,
|
||||
# user_prompt is inserted automatically
|
||||
context.system_prompt,
|
||||
context.base_url,
|
||||
True, # async_client
|
||||
model=context.model
|
||||
if hasattr(context, 'model') else None,
|
||||
n_predict=context.n_predict
|
||||
if hasattr(context, 'n_predict') else None,
|
||||
enable_streaming=context.enable_streaming
|
||||
if hasattr(context, 'enable_streaming') else None,
|
||||
server_seed=context.server_seed
|
||||
if hasattr(context, 'server_seed') else None,
|
||||
user_api_key=context.user_api_key
|
||||
if hasattr(context, 'user_api_key') else None)
|
||||
|
||||
|
||||
@step(u'all prompts are predicted')
|
||||
@async_run_until_complete
|
||||
async def step_all_prompts_are_predicted(context):
|
||||
await all_prompts_are_predicted(context)
|
||||
|
||||
|
||||
@step(u'all prompts are predicted with {n_predict} tokens')
|
||||
@async_run_until_complete
|
||||
async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict):
|
||||
expected_predicted_n = int(n_predict)
|
||||
await all_prompts_are_predicted(context, expected_predicted_n)
|
||||
|
||||
|
||||
async def all_prompts_are_predicted(context, expected_predicted_n=None):
|
||||
n_completions = await gather_tasks_results(context)
|
||||
assert n_completions > 0
|
||||
for i in range(n_completions):
|
||||
assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n)
|
||||
assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
|
||||
|
||||
|
||||
@step(u'embeddings are computed for')
|
||||
@async_run_until_complete
|
||||
async def step_compute_embedding(context):
|
||||
context.embeddings = await request_embedding(context.text, base_url=context.base_url)
|
||||
|
||||
|
||||
@step(u'embeddings are generated')
|
||||
def step_assert_embeddings(context):
|
||||
if len(context.prompts) == 0:
|
||||
assert_embeddings(context.embeddings)
|
||||
else:
|
||||
assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
|
||||
f"context.prompts={context.prompts}\n"
|
||||
f"context.embeddings={context.embeddings}")
|
||||
for embedding in context.embeddings:
|
||||
context.prompts.pop()
|
||||
assert_embeddings(embedding)
|
||||
|
||||
|
||||
@step(u'an OAI compatible embeddings computation request for')
|
||||
@async_run_until_complete
|
||||
async def step_oai_compute_embeddings(context):
|
||||
context.embeddings = await request_oai_embeddings(context.text,
|
||||
base_url=context.base_url,
|
||||
user_api_key=context.user_api_key,
|
||||
model=context.model)
|
||||
|
||||
|
||||
@step(u'an OAI compatible embeddings computation request for multiple inputs')
|
||||
@async_run_until_complete
|
||||
async def step_oai_compute_embeddings_multiple_inputs(context):
|
||||
context.embeddings = await request_oai_embeddings(context.prompts,
|
||||
base_url=context.base_url,
|
||||
user_api_key=context.user_api_key,
|
||||
model=context.model)
|
||||
|
||||
|
||||
@step(u'concurrent embedding requests')
|
||||
@async_run_until_complete()
|
||||
async def step_concurrent_embedding_requests(context):
|
||||
await concurrent_requests(context,
|
||||
request_embedding,
|
||||
# prompt is inserted automatically
|
||||
base_url=context.base_url)
|
||||
|
||||
|
||||
@step(u'concurrent OAI embedding requests')
|
||||
@async_run_until_complete()
|
||||
async def step_concurrent_oai_embedding_requests(context):
|
||||
await concurrent_requests(context,
|
||||
request_oai_embeddings,
|
||||
# prompt is inserted automatically
|
||||
base_url=context.base_url,
|
||||
async_client=True,
|
||||
model=context.model)
|
||||
|
||||
|
||||
@step(u'all embeddings are generated')
|
||||
@async_run_until_complete()
|
||||
async def all_embeddings_are_generated(context):
|
||||
n_embedding_requests = await gather_tasks_results(context)
|
||||
assert n_embedding_requests > 0
|
||||
for i in range(n_embedding_requests):
|
||||
assert_embeddings(context.tasks_result.pop())
|
||||
|
||||
|
||||
@step(u'tokenizing')
|
||||
@async_run_until_complete
|
||||
async def step_tokenize(context):
|
||||
context.tokenized_text = context.text
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{context.base_url}/tokenize',
|
||||
json={
|
||||
"content": context.tokenized_text,
|
||||
}) as response:
|
||||
assert response.status == 200
|
||||
tokenize_json = await response.json()
|
||||
context.tokens = tokenize_json['tokens']
|
||||
|
||||
|
||||
@step(u'tokens can be detokenize')
|
||||
@async_run_until_complete
|
||||
async def step_detokenize(context):
|
||||
assert len(context.tokens) > 0
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{context.base_url}/detokenize',
|
||||
json={
|
||||
"tokens": context.tokens,
|
||||
}) as response:
|
||||
assert response.status == 200
|
||||
detokenize_json = await response.json()
|
||||
# SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15
|
||||
assert context.tokenized_text == detokenize_json['content'].strip()
|
||||
|
||||
|
||||
@step(u'an OPTIONS request is sent from {origin}')
|
||||
@async_run_until_complete
|
||||
async def step_options_request(context, origin):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.options(f'{context.base_url}/v1/chat/completions',
|
||||
headers={"Origin": origin}) as response:
|
||||
assert response.status == 200
|
||||
context.options_response = response
|
||||
|
||||
|
||||
@step(u'CORS header {cors_header} is set to {cors_header_value}')
|
||||
def step_check_options_header_value(context, cors_header, cors_header_value):
|
||||
assert context.options_response.headers[cors_header] == cors_header_value
|
||||
|
||||
|
||||
@step(u'prometheus metrics are exposed')
|
||||
@async_run_until_complete
|
||||
async def step_prometheus_metrics_exported(context):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with await session.get(f'{context.base_url}/metrics') as metrics_response:
|
||||
assert metrics_response.status == 200
|
||||
assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
|
||||
metrics_raw = await metrics_response.text()
|
||||
metric_exported = False
|
||||
for metric in parser.text_string_to_metric_families(metrics_raw):
|
||||
match metric.name:
|
||||
case "llamacpp:kv_cache_usage_ratio":
|
||||
assert len(metric.samples) > 0
|
||||
metric_exported = True
|
||||
assert metric_exported, "No metrics exported"
|
||||
|
||||
|
||||
async def concurrent_requests(context, f_completion, *args, **kwargs):
|
||||
n_prompts = len(context.prompts)
|
||||
if context.debug:
|
||||
print(f"starting {n_prompts} concurrent completion requests...")
|
||||
assert n_prompts > 0
|
||||
for prompt_no in range(n_prompts):
|
||||
shifted_args = [context.prompts.pop(), *args]
|
||||
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
|
||||
async def request_completion(prompt,
|
||||
base_url,
|
||||
debug=False,
|
||||
n_predict=None,
|
||||
server_seed=None,
|
||||
expect_api_error=None,
|
||||
user_api_key=None):
|
||||
if debug:
|
||||
print(f"Sending completion request: {prompt}")
|
||||
origin = "my.super.domain"
|
||||
headers = {
|
||||
'Origin': origin
|
||||
}
|
||||
if user_api_key is not None:
|
||||
if debug:
|
||||
print(f"Set user_api_key: {user_api_key}")
|
||||
headers['Authorization'] = f'Bearer {user_api_key}'
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{base_url}/completion',
|
||||
json={
|
||||
"prompt": prompt,
|
||||
"n_predict": int(n_predict) if n_predict is not None else -1,
|
||||
"seed": server_seed if server_seed is not None else 42
|
||||
},
|
||||
headers=headers) as response:
|
||||
if expect_api_error is None or not expect_api_error:
|
||||
assert response.status == 200
|
||||
assert response.headers['Access-Control-Allow-Origin'] == origin
|
||||
return await response.json()
|
||||
else:
|
||||
return response.status
|
||||
|
||||
|
||||
async def oai_chat_completions(user_prompt,
|
||||
system_prompt,
|
||||
base_url,
|
||||
async_client,
|
||||
debug=False,
|
||||
model=None,
|
||||
n_predict=None,
|
||||
enable_streaming=None,
|
||||
server_seed=None,
|
||||
user_api_key=None,
|
||||
expect_api_error=None):
|
||||
if debug:
|
||||
print(f"Sending OAI Chat completions request: {user_prompt}")
|
||||
# openai client always expects an api key
|
||||
user_api_key = user_api_key if user_api_key is not None else 'nope'
|
||||
seed = server_seed if server_seed is not None else 42
|
||||
enable_streaming = enable_streaming if enable_streaming is not None else False
|
||||
payload = {
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": system_prompt,
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": user_prompt,
|
||||
}
|
||||
],
|
||||
"model": model,
|
||||
"max_tokens": n_predict,
|
||||
"stream": enable_streaming,
|
||||
"seed": seed
|
||||
}
|
||||
completion_response = {
|
||||
'content': '',
|
||||
'timings': {
|
||||
'predicted_n': 0
|
||||
}
|
||||
}
|
||||
if async_client:
|
||||
origin = 'llama.cpp'
|
||||
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{base_url}/v1/chat/completions',
|
||||
json=payload,
|
||||
headers=headers) as response:
|
||||
if enable_streaming:
|
||||
assert response.status == 200
|
||||
assert response.headers['Access-Control-Allow-Origin'] == origin
|
||||
assert response.headers['Content-Type'] == "text/event-stream"
|
||||
event_received = True
|
||||
while event_received:
|
||||
event_received = False
|
||||
async for line_in_bytes in response.content:
|
||||
line = line_in_bytes.decode('utf8')
|
||||
line = line.rstrip('\n').rstrip('\r')
|
||||
if line == '':
|
||||
continue
|
||||
event_data = line.split(': ', 1)
|
||||
assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
|
||||
chunk_raw = event_data[1]
|
||||
|
||||
chunk = json.loads(chunk_raw)
|
||||
assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
|
||||
delta = chunk['choices'][0]['delta']
|
||||
if 'content' in delta:
|
||||
completion_response['content'] += delta['content']
|
||||
completion_response['timings']['predicted_n'] += 1
|
||||
else:
|
||||
if expect_api_error is None or not expect_api_error:
|
||||
assert response.status == 200
|
||||
assert response.headers['Access-Control-Allow-Origin'] == origin
|
||||
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
|
||||
chat_completion_raw = await response.json()
|
||||
completion_response = {
|
||||
'content': chat_completion_raw['choices'][0]['message'],
|
||||
'timings': {
|
||||
'predicted_n': chat_completion_raw['usage']['completion_tokens']
|
||||
}
|
||||
}
|
||||
else:
|
||||
return response.status
|
||||
else:
|
||||
try:
|
||||
openai.api_key = user_api_key
|
||||
openai.api_base = f'{base_url}/v1/chat'
|
||||
chat_completion = openai.Completion.create(
|
||||
messages=payload['messages'],
|
||||
model=model,
|
||||
max_tokens=n_predict,
|
||||
stream=enable_streaming,
|
||||
seed=seed
|
||||
)
|
||||
except openai.error.APIError as e:
|
||||
if expect_api_error is not None and expect_api_error:
|
||||
return 401
|
||||
else:
|
||||
assert False, f'error raised: {e}'
|
||||
|
||||
if enable_streaming:
|
||||
for chunk in chat_completion:
|
||||
assert len(chunk.choices) == 1
|
||||
delta = chunk.choices[0].delta
|
||||
if 'content' in delta:
|
||||
completion_response['content'] += delta['content']
|
||||
completion_response['timings']['predicted_n'] += 1
|
||||
else:
|
||||
assert len(chat_completion.choices) == 1
|
||||
completion_response = {
|
||||
'content': chat_completion.choices[0].message.content,
|
||||
'timings': {
|
||||
'predicted_n': chat_completion.usage.completion_tokens
|
||||
}
|
||||
}
|
||||
if debug:
|
||||
print("OAI response formatted to llama.cpp:", completion_response)
|
||||
return completion_response
|
||||
|
||||
|
||||
async def request_embedding(content, base_url=None):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{base_url}/embedding',
|
||||
json={
|
||||
"content": content,
|
||||
}) as response:
|
||||
assert response.status == 200
|
||||
response_json = await response.json()
|
||||
return response_json['embedding']
|
||||
|
||||
|
||||
async def request_oai_embeddings(input,
|
||||
base_url=None, user_api_key=None,
|
||||
model=None, async_client=False):
|
||||
# openai client always expects an api_key
|
||||
user_api_key = user_api_key if user_api_key is not None else 'nope'
|
||||
if async_client:
|
||||
origin = 'llama.cpp'
|
||||
if user_api_key is not None:
|
||||
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(f'{base_url}/v1/embeddings',
|
||||
json={
|
||||
"input": input,
|
||||
"model": model,
|
||||
},
|
||||
headers=headers) as response:
|
||||
assert response.status == 200, f"received status code not expected: {response.status}"
|
||||
assert response.headers['Access-Control-Allow-Origin'] == origin
|
||||
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
|
||||
response_json = await response.json()
|
||||
assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
|
||||
assert response_json['object'] == 'list'
|
||||
return response_json['data']
|
||||
else:
|
||||
openai.api_key = user_api_key
|
||||
openai.api_base = f'{base_url}/v1'
|
||||
oai_embeddings = openai.Embedding.create(
|
||||
model=model,
|
||||
input=input,
|
||||
)
|
||||
|
||||
if isinstance(input, collections.abc.Sequence):
|
||||
embeddings = []
|
||||
for an_oai_embeddings in oai_embeddings.data:
|
||||
embeddings.append(an_oai_embeddings.embedding)
|
||||
else:
|
||||
embeddings = oai_embeddings.data.embedding
|
||||
return embeddings
|
||||
|
||||
|
||||
def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
|
||||
content = completion_response['content']
|
||||
n_predicted = completion_response['timings']['predicted_n']
|
||||
assert len(content) > 0, "no token predicted"
|
||||
if expected_predicted_n is not None:
|
||||
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
|
||||
f' {n_predicted} <> {expected_predicted_n}')
|
||||
if re_content is not None:
|
||||
re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
|
||||
assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
|
||||
f'invalid tokens predicted:'
|
||||
f' ```\n{content}\n``` do not match /{re_content}/')
|
||||
|
||||
|
||||
async def gather_tasks_results(context):
|
||||
n_tasks = len(context.concurrent_tasks)
|
||||
if context.debug:
|
||||
print(f"Waiting for all {n_tasks} tasks results...")
|
||||
for task_no in range(n_tasks):
|
||||
context.tasks_result.append(await context.concurrent_tasks.pop())
|
||||
n_completions = len(context.tasks_result)
|
||||
return n_completions
|
||||
|
||||
|
||||
async def wait_for_health_status(context,
|
||||
base_url,
|
||||
expected_http_status_code,
|
||||
expected_health_status,
|
||||
params=None,
|
||||
slots_idle=None,
|
||||
slots_processing=None,
|
||||
expected_slots=None):
|
||||
if context.debug:
|
||||
print(f"Starting checking for health for expected_health_status={expected_health_status}")
|
||||
timeout = 3 # seconds
|
||||
if expected_health_status == 'ok':
|
||||
timeout = 10 # CI slow inference
|
||||
interval = 0.5
|
||||
counter = 0
|
||||
async with aiohttp.ClientSession() as session:
|
||||
while True:
|
||||
async with await session.get(f'{base_url}/health', params=params) as health_response:
|
||||
status_code = health_response.status
|
||||
health = await health_response.json()
|
||||
if context.debug:
|
||||
print(f"HEALTH - response for expected health status='{expected_health_status}' on "
|
||||
f"'{base_url}/health'?{params} is {health}")
|
||||
if (status_code == expected_http_status_code
|
||||
and health['status'] == expected_health_status
|
||||
and (slots_idle is None or health['slots_idle'] == slots_idle)
|
||||
and (slots_processing is None or health['slots_processing'] == slots_processing)):
|
||||
if expected_slots is not None:
|
||||
assert_slots_status(health['slots'], expected_slots)
|
||||
return
|
||||
if (status_code == expected_http_status_code
|
||||
and health['status'] == expected_health_status
|
||||
and (slots_idle is None or health['slots_idle'] == slots_idle)
|
||||
and (slots_processing is None or health['slots_processing'] == slots_processing)):
|
||||
if expected_slots is not None:
|
||||
assert_slots_status(health['slots'], expected_slots)
|
||||
return
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
counter += interval
|
||||
if counter >= timeout:
|
||||
# Sometimes health requests are triggered after completions are predicted
|
||||
if expected_http_status_code == 503:
|
||||
if len(context.tasks_result) == 0:
|
||||
print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
|
||||
" busy health check missed, probably too fast inference\x1b[0m")
|
||||
n_completions = await gather_tasks_results(context)
|
||||
if n_completions > 0:
|
||||
return
|
||||
|
||||
assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
|
||||
|
||||
|
||||
def assert_embeddings(embeddings):
|
||||
assert len(embeddings) > 0
|
||||
embeddings_computed = False
|
||||
for emb in embeddings:
|
||||
if emb != 0:
|
||||
embeddings_computed = True
|
||||
assert embeddings_computed, f"Embeddings: {embeddings}"
|
||||
|
||||
|
||||
async def request_slots_status(context, expected_slots):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with await session.get(f'{context.base_url}/slots') as slots_response:
|
||||
assert slots_response.status == 200
|
||||
slots = await slots_response.json()
|
||||
assert_slots_status(slots, expected_slots)
|
||||
|
||||
|
||||
def assert_slots_status(slots, expected_slots):
|
||||
assert len(slots) == len(expected_slots)
|
||||
for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
|
||||
for key in expected:
|
||||
assert expected[key] == slot[key], (f"invalid slot {slot_id}"
|
||||
f" expected[{key}] != slot[{key}]"
|
||||
f" = {expected[key]} != {slot[key]}")
|
||||
|
||||
|
||||
def start_server_background(context):
|
||||
context.server_path = '../../../build/bin/server'
|
||||
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
|
||||
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
|
||||
server_args = [
|
||||
'--host', context.server_fqdn,
|
||||
'--port', context.server_port,
|
||||
'--model', context.model_file
|
||||
]
|
||||
if context.server_continuous_batching:
|
||||
server_args.append('--cont-batching')
|
||||
if context.server_embeddings:
|
||||
server_args.append('--embedding')
|
||||
if context.server_metrics:
|
||||
server_args.append('--metrics')
|
||||
if context.model_alias is not None:
|
||||
server_args.extend(['--alias', context.model_alias])
|
||||
if context.n_ctx is not None:
|
||||
server_args.extend(['--ctx-size', context.n_ctx])
|
||||
if context.n_slots is not None:
|
||||
server_args.extend(['--parallel', context.n_slots])
|
||||
if context.n_server_predict is not None:
|
||||
server_args.extend(['--n-predict', context.n_server_predict])
|
||||
if context.server_api_key is not None:
|
||||
server_args.extend(['--api-key', context.server_api_key])
|
||||
if context.debug:
|
||||
server_args.append('--verbose')
|
||||
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
|
||||
server_args.extend(['--log-format', "text"])
|
||||
print(f"starting server with: {context.server_path}", *server_args)
|
||||
context.server_process = subprocess.Popen(
|
||||
[str(arg) for arg in [context.server_path, *server_args]],
|
||||
close_fds=True)
|
||||
print(f"server pid={context.server_process.pid}")
|
||||
@@ -0,0 +1,21 @@
|
||||
# run with ./test.sh --tags wrong_usage
|
||||
@wrong_usage
|
||||
Feature: Wrong usage of llama.cpp server
|
||||
|
||||
#3969 The user must always set --n-predict option
|
||||
# to cap the number of tokens any completion request can generate
|
||||
# or pass n_predict/max_tokens in the request.
|
||||
Scenario: Infinite loop
|
||||
Given a server listening on localhost:8080
|
||||
And a model file stories260K.gguf
|
||||
# Uncomment below to fix the issue
|
||||
#And 64 server max tokens to predict
|
||||
Then the server is starting
|
||||
Given a prompt:
|
||||
"""
|
||||
Go to: infinite loop
|
||||
"""
|
||||
# Uncomment below to fix the issue
|
||||
#And 128 max tokens to predict
|
||||
Given concurrent completion requests
|
||||
Then all prompts are predicted
|
||||
@@ -0,0 +1,4 @@
|
||||
aiohttp~=3.9.3
|
||||
behave~=1.2.6
|
||||
openai~=0.25.0
|
||||
prometheus-client~=0.20.0
|
||||
Executable
+12
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -eu
|
||||
|
||||
if [ $# -lt 1 ]
|
||||
then
|
||||
# Start @llama.cpp scenario
|
||||
behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp
|
||||
else
|
||||
behave "$@"
|
||||
fi
|
||||
|
||||
+55
-29
@@ -14,6 +14,7 @@
|
||||
using json = nlohmann::json;
|
||||
|
||||
extern bool server_verbose;
|
||||
extern bool server_log_json;
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
@@ -27,14 +28,14 @@ extern bool server_verbose;
|
||||
{ \
|
||||
if (server_verbose) \
|
||||
{ \
|
||||
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// parallel
|
||||
@@ -50,7 +51,7 @@ enum task_type {
|
||||
TASK_TYPE_COMPLETION,
|
||||
TASK_TYPE_CANCEL,
|
||||
TASK_TYPE_NEXT_RESPONSE,
|
||||
TASK_TYPE_SLOTS_DATA
|
||||
TASK_TYPE_METRICS
|
||||
};
|
||||
|
||||
struct task_server {
|
||||
@@ -133,26 +134,48 @@ struct completion_token_output
|
||||
std::string text_to_send;
|
||||
};
|
||||
|
||||
static inline void server_log(const char *level, const char *function, int line,
|
||||
const char *message, const nlohmann::ordered_json &extra)
|
||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
|
||||
{
|
||||
nlohmann::ordered_json log
|
||||
{
|
||||
std::stringstream ss_tid;
|
||||
ss_tid << std::this_thread::get_id();
|
||||
json log = nlohmann::ordered_json{
|
||||
{"tid", ss_tid.str()},
|
||||
{"timestamp", time(nullptr)},
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"message", message},
|
||||
};
|
||||
|
||||
if (!extra.empty())
|
||||
{
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
if (server_log_json) {
|
||||
log.merge_patch(
|
||||
{
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"msg", message},
|
||||
});
|
||||
if (!extra.empty()) {
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
|
||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
|
||||
} else {
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
|
||||
|
||||
if (!extra.empty()) {
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << buf << " |";
|
||||
for (const auto& el : log.items())
|
||||
{
|
||||
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str());
|
||||
ss << buf;
|
||||
}
|
||||
|
||||
const std::string str = ss.str();
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
@@ -234,6 +257,7 @@ struct llama_server_queue {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (task.id == -1) {
|
||||
task.id = id++;
|
||||
LOG_VERBOSE("new task id", {{"new_id", task.id}});
|
||||
}
|
||||
queue_tasks.push_back(std::move(task));
|
||||
condition_tasks.notify_one();
|
||||
@@ -249,7 +273,9 @@ struct llama_server_queue {
|
||||
// Get the next id for creating anew task
|
||||
int get_new_id() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
return id++;
|
||||
int new_id = id++;
|
||||
LOG_VERBOSE("new task id", {{"new_id", new_id}});
|
||||
return new_id;
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
@@ -290,8 +316,7 @@ struct llama_server_queue {
|
||||
void start_loop() {
|
||||
running = true;
|
||||
while (true) {
|
||||
// new task arrived
|
||||
LOG_VERBOSE("have new task", {});
|
||||
LOG_VERBOSE("new task may arrive", {});
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
@@ -303,7 +328,7 @@ struct llama_server_queue {
|
||||
task_server task = queue_tasks.front();
|
||||
queue_tasks.erase(queue_tasks.begin());
|
||||
lock.unlock();
|
||||
LOG_VERBOSE("callback_new_task", {});
|
||||
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
|
||||
callback_new_task(task);
|
||||
}
|
||||
LOG_VERBOSE("callback_all_task_finished", {});
|
||||
@@ -384,11 +409,13 @@ struct llama_server_response {
|
||||
std::condition_variable condition_results;
|
||||
|
||||
void add_waiting_task_id(int task_id) {
|
||||
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.insert(task_id);
|
||||
}
|
||||
|
||||
void remove_waiting_task_id(int task_id) {
|
||||
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.erase(task_id);
|
||||
}
|
||||
@@ -401,7 +428,6 @@ struct llama_server_response {
|
||||
condition_results.wait(lock, [&]{
|
||||
return !queue_results.empty();
|
||||
});
|
||||
LOG_VERBOSE("condition_results unblock", {});
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
@@ -426,22 +452,22 @@ struct llama_server_response {
|
||||
// Send a new result to a waiting task_id
|
||||
void send(task_result result) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
LOG_VERBOSE("send new result", {});
|
||||
LOG_VERBOSE("send new result", {{"task_id", result.id}});
|
||||
for (auto& task_id : waiting_task_ids) {
|
||||
// LOG_TEE("waiting task id %i \n", task_id);
|
||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||
if (result.multitask_id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("callback_update_multitask", {});
|
||||
LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
|
||||
callback_update_multitask(task_id, result.id, result);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (result.id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("queue_results.push_back", {});
|
||||
LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
|
||||
queue_results.push_back(result);
|
||||
condition_results.notify_one();
|
||||
condition_results.notify_all();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -960,7 +960,7 @@ int main(int argc, char ** argv) {
|
||||
struct ggml_opt_context * opt = train->opt;
|
||||
|
||||
// set opt params from command line
|
||||
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
opt->params.print_forward_graph = false;
|
||||
opt->params.print_backward_graph = false;
|
||||
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
|
||||
Generated
+3
-3
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1708118438,
|
||||
"narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=",
|
||||
"lastModified": 1708655239,
|
||||
"narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80",
|
||||
"rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
+603
-79
File diff suppressed because it is too large
Load Diff
+20
-7
@@ -53,11 +53,23 @@ extern "C" {
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ((float) (x))
|
||||
#define GGML_FP32_TO_FP16(x) (x)
|
||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
__fp16 tmp;
|
||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||
return (float)tmp;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
__fp16 tmp = f;
|
||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
@@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
|
||||
|
||||
#if !defined(GGML_FP16_TO_FP32)
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
@@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_FP32_TO_FP16)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
#define GGML_HASHTABLE_FULL ((size_t)-1)
|
||||
|
||||
+62
-12
@@ -61,6 +61,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
||||
@@ -85,6 +87,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
||||
@@ -105,6 +109,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
||||
@@ -122,6 +128,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
@@ -139,6 +147,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
||||
@@ -452,6 +462,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
@@ -476,6 +488,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
@@ -496,6 +510,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
@@ -513,6 +529,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
@@ -530,6 +548,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
@@ -1347,6 +1367,8 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
@@ -1483,6 +1505,18 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
@@ -1527,9 +1561,9 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
|
||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||
src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S) { // || src0t == GGML_TYPE_Q4_K) {
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||
src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
||||
@@ -1537,8 +1571,8 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ3_XXS) {
|
||||
const int mem_size = 256*4+128;
|
||||
else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
|
||||
const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@@ -1640,6 +1674,8 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
||||
@@ -1779,6 +1815,18 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
@@ -1839,9 +1887,9 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
|
||||
}
|
||||
|
||||
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
|
||||
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
|
||||
src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S) { // || src2t == GGML_TYPE_Q4_K) {
|
||||
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
|
||||
src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
|
||||
src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
|
||||
@@ -1849,8 +1897,8 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ3_XXS) {
|
||||
const int mem_size = 256*4+128;
|
||||
else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) {
|
||||
const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@@ -1900,6 +1948,8 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
||||
@@ -2237,8 +2287,8 @@ static bool ggml_metal_graph_compute(
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
switch (order) {
|
||||
case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
|
||||
case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
|
||||
case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
|
||||
case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
|
||||
default: GGML_ASSERT(false);
|
||||
};
|
||||
|
||||
|
||||
@@ -2519,12 +2519,34 @@ typedef struct {
|
||||
} block_iq2_xs;
|
||||
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
||||
|
||||
// 2.5625 bpw quants
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t scales[QK_K/32];
|
||||
} block_iq2_s;
|
||||
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[3*QK_K/8];
|
||||
} block_iq3_xxs;
|
||||
// 98 bytes / block for QK_K = 256, so 3.0625 bpw
|
||||
|
||||
// 3.4375 bpw
|
||||
#if QK_K == 64
|
||||
#define IQ3S_N_SCALE 2
|
||||
#else
|
||||
#define IQ3S_N_SCALE QK_K/64
|
||||
#endif
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t signs[QK_K/8];
|
||||
uint8_t scales[IQ3S_N_SCALE];
|
||||
} block_iq3_s;
|
||||
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/8];
|
||||
@@ -3760,6 +3782,265 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
|
||||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
constexpr constant static uint64_t iq2s_grid[1024] = {
|
||||
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
||||
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
||||
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
||||
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
||||
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
||||
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
|
||||
0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
|
||||
0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
|
||||
0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
|
||||
0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
|
||||
0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
|
||||
0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
|
||||
0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
|
||||
0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
|
||||
0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
|
||||
0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
|
||||
0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
|
||||
0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
|
||||
0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
|
||||
0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
|
||||
0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
|
||||
0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
|
||||
0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
|
||||
0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
|
||||
0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
|
||||
0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
|
||||
0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
|
||||
0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
|
||||
0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
|
||||
0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
|
||||
0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
|
||||
0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
|
||||
0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
|
||||
0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
|
||||
0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
|
||||
0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
|
||||
0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
|
||||
0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
|
||||
0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
|
||||
0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
|
||||
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
|
||||
0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
|
||||
0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
|
||||
0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
|
||||
0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
|
||||
0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
|
||||
0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
|
||||
0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
|
||||
0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
|
||||
0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
|
||||
0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
|
||||
0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
|
||||
0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
|
||||
0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
|
||||
0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
|
||||
0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
|
||||
0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
|
||||
0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
|
||||
0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
|
||||
0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
|
||||
0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
|
||||
0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
|
||||
0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
|
||||
0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
|
||||
0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
|
||||
0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
|
||||
0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
|
||||
0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
|
||||
0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
|
||||
0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
|
||||
0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
|
||||
0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
|
||||
0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
|
||||
0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
|
||||
0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
|
||||
0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
|
||||
0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
|
||||
0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
|
||||
0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
|
||||
0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
|
||||
0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
|
||||
0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
|
||||
0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
|
||||
0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
|
||||
0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
|
||||
0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
|
||||
0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
|
||||
0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
|
||||
0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
|
||||
0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
|
||||
0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
|
||||
0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
|
||||
0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
|
||||
0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
|
||||
0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
|
||||
0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
|
||||
0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
|
||||
0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
|
||||
0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
|
||||
0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
|
||||
0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
|
||||
0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
|
||||
0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
|
||||
0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
|
||||
0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
|
||||
0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
|
||||
0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
|
||||
0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
|
||||
0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
|
||||
0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
|
||||
0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
|
||||
0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
|
||||
0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
|
||||
0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
|
||||
0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
|
||||
0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
|
||||
0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
|
||||
0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
|
||||
0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
|
||||
0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
|
||||
0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
|
||||
0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
|
||||
0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
|
||||
0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
|
||||
0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
|
||||
0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
|
||||
0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
|
||||
0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
|
||||
0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
|
||||
0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
|
||||
0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
|
||||
0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
|
||||
0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
|
||||
0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
|
||||
0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
|
||||
0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
|
||||
0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
|
||||
0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
|
||||
0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
|
||||
0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
|
||||
0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
|
||||
0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
|
||||
0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
|
||||
0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
|
||||
0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
|
||||
0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
|
||||
0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
|
||||
0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
|
||||
0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
|
||||
0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
|
||||
0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
|
||||
0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
|
||||
0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
|
||||
0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
|
||||
0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
|
||||
0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
|
||||
0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
|
||||
0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
|
||||
0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
|
||||
0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
|
||||
0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
|
||||
0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
|
||||
0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
|
||||
0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
|
||||
0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
|
||||
0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
|
||||
0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
|
||||
0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
|
||||
0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
|
||||
0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
|
||||
0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
|
||||
0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
|
||||
0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
|
||||
0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
|
||||
0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
|
||||
0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
|
||||
0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
|
||||
0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
|
||||
0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
|
||||
0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
|
||||
0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
|
||||
0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
|
||||
0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
|
||||
0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
|
||||
0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
|
||||
0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
|
||||
0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
|
||||
0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
|
||||
0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
|
||||
0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
|
||||
0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
|
||||
0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
|
||||
0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
|
||||
0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
|
||||
0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
|
||||
0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
|
||||
0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
|
||||
0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
|
||||
0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
|
||||
0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
|
||||
0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
|
||||
0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
|
||||
0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
|
||||
0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
|
||||
0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
|
||||
0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
|
||||
0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
|
||||
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
|
||||
0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
|
||||
0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
|
||||
0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
|
||||
0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
|
||||
0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
|
||||
0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
|
||||
0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
|
||||
0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
|
||||
0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
|
||||
0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
|
||||
0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
|
||||
0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
|
||||
0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
|
||||
0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
|
||||
0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
|
||||
0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
|
||||
0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
|
||||
0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
|
||||
0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
|
||||
0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
|
||||
0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
|
||||
0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
|
||||
0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
|
||||
0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
|
||||
0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
|
||||
0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
|
||||
0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
|
||||
0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
|
||||
0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
|
||||
0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
|
||||
0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
|
||||
0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
|
||||
0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
|
||||
0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
|
||||
0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
|
||||
0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
|
||||
0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
|
||||
0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
|
||||
0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
|
||||
0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
|
||||
0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
|
||||
0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
|
||||
0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
|
||||
0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
|
||||
0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
|
||||
0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
|
||||
0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
|
||||
0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
@@ -3795,6 +4076,73 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
constexpr constant static uint32_t iq3xs_grid[512] = {
|
||||
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
|
||||
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
|
||||
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
|
||||
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
|
||||
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
|
||||
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
|
||||
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
|
||||
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
|
||||
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
|
||||
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
|
||||
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
|
||||
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
|
||||
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
|
||||
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
|
||||
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
|
||||
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
|
||||
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
|
||||
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
|
||||
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
|
||||
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
|
||||
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
|
||||
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
|
||||
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
|
||||
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
|
||||
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
|
||||
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
|
||||
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
|
||||
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
|
||||
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
|
||||
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
|
||||
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
|
||||
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
|
||||
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
|
||||
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
|
||||
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
|
||||
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
|
||||
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
|
||||
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
|
||||
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
|
||||
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
|
||||
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
|
||||
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
|
||||
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
|
||||
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
|
||||
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
|
||||
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
|
||||
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
|
||||
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
|
||||
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
|
||||
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
|
||||
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
|
||||
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
|
||||
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
|
||||
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
|
||||
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
|
||||
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
|
||||
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
|
||||
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
|
||||
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
|
||||
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
|
||||
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
|
||||
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
|
||||
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
|
||||
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
|
||||
};
|
||||
|
||||
#define NGRID_IQ1S 512
|
||||
constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = {
|
||||
0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
|
||||
@@ -4361,6 +4709,269 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
|
||||
kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
void kernel_mul_mv_iq3_s_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[32];
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
|
||||
const int nb32 = nb * (QK_K / 32);
|
||||
|
||||
threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
|
||||
{
|
||||
int nval = 8;
|
||||
int pos = (32*sgitg + tiisg)*nval;
|
||||
for (int i = 0; i < nval; ++i) values[pos + i] = iq3xs_grid[pos + i];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
const int ix = tiisg;
|
||||
|
||||
device const float * y4 = y + 32 * ix;
|
||||
|
||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
yl[i] = y4[i];
|
||||
}
|
||||
|
||||
const int ibl = ib32 / (QK_K / 32);
|
||||
const int ib = ib32 % (QK_K / 32);
|
||||
|
||||
device const block_iq3_s * xr = x + ibl;
|
||||
device const uint8_t * qs = xr->qs + 8 * ib;
|
||||
device const uint8_t * qh = xr->qh + ib;
|
||||
device const uint8_t * sc = xr->scales + (ib/2);
|
||||
device const uint8_t * signs = xr->signs + 4 * ib;
|
||||
device const half * dh = &xr->d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
const float db = dh[0];
|
||||
const float d = db * (0.5f + ((sc[0] >> 4*(ib%2)) & 0xf));
|
||||
|
||||
float2 sum = {0};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
|
||||
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
|
||||
sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
|
||||
}
|
||||
}
|
||||
sumf[row] += d * (sum[0] + sum[1]);
|
||||
|
||||
dh += nb*sizeof(block_iq3_s)/2;
|
||||
qs += nb*sizeof(block_iq3_s);
|
||||
qh += nb*sizeof(block_iq3_s);
|
||||
sc += nb*sizeof(block_iq3_s);
|
||||
signs += nb*sizeof(block_iq3_s);
|
||||
}
|
||||
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq3_s_f32")]]
|
||||
kernel void kernel_mul_mv_iq3_s_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
void kernel_mul_mv_iq2_s_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[32];
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
|
||||
const int nb32 = nb * (QK_K / 32);
|
||||
|
||||
//threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
|
||||
//{
|
||||
// int nval = 32;
|
||||
// int pos = (32*sgitg + tiisg)*nval;
|
||||
// for (int i = 0; i < nval; ++i) values[pos + i] = iq2s_grid[pos + i];
|
||||
// threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//}
|
||||
|
||||
const int ix = tiisg;
|
||||
|
||||
device const float * y4 = y + 32 * ix;
|
||||
|
||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
yl[i] = y4[i];
|
||||
}
|
||||
|
||||
const int ibl = ib32 / (QK_K / 32);
|
||||
const int ib = ib32 % (QK_K / 32);
|
||||
|
||||
device const block_iq2_s * xr = x + ibl;
|
||||
device const uint8_t * qs = xr->qs + 4 * ib;
|
||||
device const uint8_t * qh = xr->qh + ib;
|
||||
device const uint8_t * sc = xr->scales + ib;
|
||||
device const uint8_t * signs = qs + QK_K/8;
|
||||
device const half * dh = &xr->d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
const float db = dh[0];
|
||||
const float d1 = db * (0.5f + (sc[0] & 0xf));
|
||||
const float d2 = db * (0.5f + (sc[0] >> 4));
|
||||
|
||||
float2 sum = {0};
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
//const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
|
||||
//const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
|
||||
sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
|
||||
}
|
||||
}
|
||||
sumf[row] += d1 * sum[0] + d2 * sum[1];
|
||||
|
||||
dh += nb*sizeof(block_iq2_s)/2;
|
||||
qs += nb*sizeof(block_iq2_s);
|
||||
qh += nb*sizeof(block_iq2_s);
|
||||
sc += nb*sizeof(block_iq2_s);
|
||||
signs += nb*sizeof(block_iq2_s);
|
||||
}
|
||||
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq2_s_f32")]]
|
||||
kernel void kernel_mul_mv_iq2_s_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
void kernel_mul_mv_iq1_s_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -4952,6 +5563,50 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 8*ib32;
|
||||
device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * 0.5f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+0] | ((qh << 8) & 256)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+1] | ((qh << 7) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
|
||||
reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+2] | ((qh << 6) & 256)));
|
||||
grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+3] | ((qh << 5) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
|
||||
reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * signs = qs + QK_K/8;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
|
||||
reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
@@ -5525,6 +6180,8 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows
|
||||
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -5566,6 +6223,8 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -5619,6 +6278,8 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
@@ -6589,6 +7250,136 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32(
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq3_s_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq3_s_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
device const char * src00,
|
||||
device const char * src01,
|
||||
device const char * src02,
|
||||
device const char * src03,
|
||||
device const char * src04,
|
||||
device const char * src05,
|
||||
device const char * src06,
|
||||
device const char * src07,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
||||
|
||||
const int64_t bid = tgpig.z/(ne12*ne13);
|
||||
|
||||
tgpig.z = tgpig.z%(ne12*ne13);
|
||||
|
||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||
|
||||
kernel_mul_mv_iq3_s_f32_impl(
|
||||
src0[id],
|
||||
(device const float *) (src1 + bid*nb11),
|
||||
dst + bid*ne0,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
ne10,
|
||||
ne12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_values,
|
||||
tgpig,
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq2_s_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq2_s_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
device const char * src00,
|
||||
device const char * src01,
|
||||
device const char * src02,
|
||||
device const char * src03,
|
||||
device const char * src04,
|
||||
device const char * src05,
|
||||
device const char * src06,
|
||||
device const char * src07,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
||||
|
||||
const int64_t bid = tgpig.z/(ne12*ne13);
|
||||
|
||||
tgpig.z = tgpig.z%(ne12*ne13);
|
||||
|
||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||
|
||||
kernel_mul_mv_iq2_s_f32_impl(
|
||||
src0[id],
|
||||
(device const float *) (src1 + bid*nb11),
|
||||
dst + bid*ne0,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
ne10,
|
||||
ne12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_values,
|
||||
tgpig,
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq1_s_f32(
|
||||
device const char * ids,
|
||||
|
||||
+25
-25
@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
|
||||
}
|
||||
|
||||
void ggml_cl_free_data(const struct ggml_tensor* tensor) {
|
||||
if (tensor->backend != GGML_BACKEND_GPU) {
|
||||
if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
|
||||
}
|
||||
|
||||
static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
size_t y_size;
|
||||
size_t d_size;
|
||||
cl_mem d_X;
|
||||
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
||||
if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
|
||||
d_X = (cl_mem) src0->extra;
|
||||
} else {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
}
|
||||
cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
|
||||
size_t x_offset = 0;
|
||||
|
||||
@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
// TODO: copy src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else {
|
||||
// copy src0 to device
|
||||
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
// copy src1 to device
|
||||
if (src1->backend == GGML_BACKEND_CPU) {
|
||||
if (src1->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
}
|
||||
|
||||
@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
}
|
||||
|
||||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
if (src0->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
}
|
||||
if (src1->backend != GGML_BACKEND_GPU) {
|
||||
if (src1->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
}
|
||||
if (dst->backend != GGML_BACKEND_GPU) {
|
||||
if (dst->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
}
|
||||
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
size_t y_size;
|
||||
size_t d_size;
|
||||
cl_mem d_X;
|
||||
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
||||
if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
|
||||
d_X = (cl_mem) src0->extra;
|
||||
} else {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
|
||||
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
// TODO: copy src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else {
|
||||
// copy src0 to device
|
||||
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
}
|
||||
|
||||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
if (src0->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
}
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
cl_mem d_Q;
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
|
||||
}
|
||||
|
||||
@@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
@@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
if (!mul_mat_vec) {
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne / global_denom;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
@@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * local;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const cl_int ncols = ne00;
|
||||
events.emplace_back();
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
||||
@@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
}
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
ggml_cl_pool_free(d_Q, q_size);
|
||||
}
|
||||
}
|
||||
@@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
||||
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
dst->type == GGML_TYPE_F32 &&
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
tensor->extra = dst;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
}
|
||||
|
||||
// ggml-backend
|
||||
@@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ctx->sub_buffers.push_back(sub_buffer);
|
||||
tensor->extra = sub_buffer;
|
||||
}
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
|
||||
+1455
-96
File diff suppressed because it is too large
Load Diff
@@ -182,6 +182,15 @@ typedef struct {
|
||||
} block_iq2_xs;
|
||||
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
||||
|
||||
// 2.5625 bpw quants
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t scales[QK_K/32];
|
||||
} block_iq2_s;
|
||||
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
|
||||
|
||||
// (Almost) "true" 3-bit quantization.
|
||||
// Due to the need to use blocks as per ggml design, it ends up using
|
||||
// 3.0625 bpw because of the 16-bit scale for each block of 256.
|
||||
@@ -191,6 +200,21 @@ typedef struct {
|
||||
} block_iq3_xxs;
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
// 3.4375 bpw
|
||||
#if QK_K == 64
|
||||
#define IQ3S_N_SCALE 2
|
||||
#else
|
||||
#define IQ3S_N_SCALE QK_K/64
|
||||
#endif
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[QK_K/4];
|
||||
uint8_t qh[QK_K/32];
|
||||
uint8_t signs[QK_K/8];
|
||||
uint8_t scales[IQ3S_N_SCALE];
|
||||
} block_iq3_s;
|
||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[QK_K/8];
|
||||
@@ -226,6 +250,8 @@ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGM
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
@@ -242,6 +268,8 @@ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
@@ -259,9 +287,11 @@ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRI
|
||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
@@ -277,18 +307,22 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
//
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
//
|
||||
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
+242
-158
@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
|
||||
|
||||
size_t total_elements = ggml_nelements(src);
|
||||
|
||||
const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
float *src_data =NULL;
|
||||
if(src_on_device) {
|
||||
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
|
||||
int ixj = col ^ j;
|
||||
if (ixj > col) {
|
||||
if ((col & k) == 0) {
|
||||
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
||||
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
||||
swap(dst_row[col], dst_row[ixj]);
|
||||
}
|
||||
} else {
|
||||
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
||||
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
||||
swap(dst_row[col], dst_row[ixj]);
|
||||
}
|
||||
}
|
||||
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
|
||||
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
|
||||
}
|
||||
|
||||
static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
|
||||
const sycl::nd_item<3> &item_ct1, float *buf) {
|
||||
|
||||
template <bool vals_smem, int ncols_template, int block_size_template>
|
||||
static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
|
||||
const int nrows_y, const float scale, const float max_bias, const float m0,
|
||||
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
|
||||
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int rowx = item_ct1.get_group(2);
|
||||
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
|
||||
|
||||
const int block_size = item_ct1.get_local_range(2);
|
||||
const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
|
||||
|
||||
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||
|
||||
float slope = 0.0f;
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const uint32_t h = rowx/nrows_y; // head index
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slope = sycl::pow(base, float(exp));
|
||||
}
|
||||
|
||||
float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
|
||||
float max_val = -INFINITY;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
|
||||
|
||||
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
||||
|
||||
vals[col] = val;
|
||||
max_val = sycl::max(max_val, val);
|
||||
}
|
||||
|
||||
// find the max value in the block
|
||||
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
if (warp_id == 0) {
|
||||
buf[lane_id] = -INFINITY;
|
||||
}
|
||||
/*
|
||||
DPCT1118:12: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf[warp_id] = max_val;
|
||||
}
|
||||
/*
|
||||
DPCT1118:13: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
max_val = buf[lane_id];
|
||||
max_val = warp_reduce_max(max_val, item_ct1);
|
||||
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
|
||||
float tmp = 0.f;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
const float val =
|
||||
sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
|
||||
#pragma unroll
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
break;
|
||||
}
|
||||
|
||||
const float val = sycl::native::exp(vals[col] - max_val);
|
||||
tmp += val;
|
||||
dst[ix] = val;
|
||||
vals[col] = val;
|
||||
}
|
||||
|
||||
// find the sum of exps in the block
|
||||
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
||||
if (warp_id == 0) {
|
||||
buf[lane_id] = 0.f;
|
||||
}
|
||||
/*
|
||||
DPCT1118:14: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf[warp_id] = tmp;
|
||||
}
|
||||
/*
|
||||
DPCT1118:15: SYCL group functions and algorithms must be encountered in
|
||||
converged control flow. You may need to adjust the code.
|
||||
*/
|
||||
/*
|
||||
DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
|
||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||
better performance if there is no access to global memory.
|
||||
*/
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
tmp = buf[lane_id];
|
||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||
}
|
||||
|
||||
const float inv_tmp = 1.f / tmp;
|
||||
const float inv_sum = 1.f / tmp;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int i = rowx*ncols + col;
|
||||
dst[i] *= inv_tmp;
|
||||
#pragma unroll
|
||||
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
||||
const int col = col0 + tid;
|
||||
|
||||
if (ncols_template == 0 && col >= ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int idst = rowx*ncols + col;
|
||||
dst[idst] = vals[col] * inv_sum;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10825,7 +10827,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
|
||||
const sycl::range<3> block_dims(1, 1, ncols);
|
||||
const sycl::range<3> block_nums(1, nrows, 1);
|
||||
if (order == GGML_SORT_ASC) {
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
/*
|
||||
DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
@@ -10834,9 +10836,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
|
||||
});
|
||||
} else if (order == GGML_SORT_DESC) {
|
||||
} else if (order == GGML_SORT_ORDER_DESC) {
|
||||
/*
|
||||
DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
@@ -10845,7 +10847,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
|
||||
});
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
@@ -10867,35 +10869,96 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
|
||||
});
|
||||
}
|
||||
|
||||
static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
|
||||
const int ncols_x, const int nrows_x,
|
||||
const int nrows_y, const float scale,
|
||||
template <bool vals_smem, int ncols_template, int block_size_template>
|
||||
static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
|
||||
const int nrows_y, const float scale, const float max_bias, const float m0,
|
||||
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
|
||||
const size_t n_local_scratch, dpct::queue_ptr stream) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
|
||||
nrows_y, scale, max_bias, m0,
|
||||
m1, n_head_log2, item_ct1,
|
||||
local_buf_acc.get_pointer());
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
|
||||
float * dst, const int ncols_x, const int nrows_x,
|
||||
const int nrows_y, const float scale, const float max_bias,
|
||||
dpct::queue_ptr stream) {
|
||||
int nth = WARP_SIZE;
|
||||
while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const sycl::range<3> block_dims(1, 1, nth);
|
||||
const sycl::range<3> block_nums(1, 1, nrows_x);
|
||||
/*
|
||||
DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
|
||||
limit. To get the device limit, query info::device::max_work_group_size.
|
||||
Adjust the work-group size if needed.
|
||||
*/
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
/*
|
||||
DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
|
||||
replaced with a value. Modify the code to use the original expression,
|
||||
provided in comments, if it is correct.
|
||||
*/
|
||||
sycl::local_accessor<float, 1> buf_acc_ct1(
|
||||
sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
|
||||
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
|
||||
static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
|
||||
buf_acc_ct1.get_pointer());
|
||||
});
|
||||
});
|
||||
const uint32_t n_head_kv = nrows_x/nrows_y;
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
|
||||
if (n_local_scratch*sizeof(float) < local_mem_size) {
|
||||
switch (ncols_x) {
|
||||
case 32:
|
||||
soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 64:
|
||||
soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 128:
|
||||
soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 256:
|
||||
soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 512:
|
||||
soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 1024:
|
||||
soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 2048:
|
||||
soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
case 4096:
|
||||
soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
default:
|
||||
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, n_local_scratch, stream);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
||||
max_bias, m0, m1, n_head_log2, block_nums,
|
||||
block_dims, WARP_SIZE, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -11407,12 +11470,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
|
||||
|
||||
dpct::memcpy_direction kind;
|
||||
char * src_ptr;
|
||||
if (src->backend == GGML_BACKEND_CPU) {
|
||||
if (src->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
kind = dpct::host_to_device;
|
||||
src_ptr = (char *) src->data;
|
||||
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
|
||||
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
||||
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
|
||||
} else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
||||
kind = dpct::device_to_device;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
int id;
|
||||
@@ -11846,7 +11909,7 @@ inline void ggml_sycl_op_mul_mat_q(
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
|
||||
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
@@ -12119,7 +12182,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// ldc == nrows of the matrix that cuBLAS writes into
|
||||
int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
bool use_fp16 = true; // TODO(Yu) SYCL capability check
|
||||
@@ -12435,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows_x = ggml_nrows(src0);
|
||||
const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
|
||||
const int64_t nrows_y = src0->ne[1];
|
||||
|
||||
float scale = 1.0f;
|
||||
memcpy(&scale, dst->op_params, sizeof(float));
|
||||
float max_bias = 0.0f;
|
||||
|
||||
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
||||
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
||||
|
||||
(void) dst;
|
||||
// positions tensor
|
||||
float * src2_dd = nullptr;
|
||||
sycl_pool_alloc<float> src2_f;
|
||||
|
||||
ggml_tensor * src2 = dst->src[2];
|
||||
const bool use_src2 = src2 != nullptr;
|
||||
|
||||
if (use_src2) {
|
||||
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
if (src2_on_device) {
|
||||
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
||||
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
||||
} else {
|
||||
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
|
||||
}
|
||||
}
|
||||
|
||||
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
|
||||
nrows_x, nrows_y, scale, max_bias, main_stream);
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
@@ -12501,16 +12585,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
|
||||
const bool use_src1 = src1 != nullptr;
|
||||
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
||||
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
// dd = data device
|
||||
float * src0_ddf = nullptr;
|
||||
@@ -12565,7 +12649,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
|
||||
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::get_current_device().queues_wait_and_throw()));
|
||||
}
|
||||
@@ -12640,8 +12724,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
|
||||
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
||||
|
||||
@@ -12656,13 +12740,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
||||
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
||||
|
||||
int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
GGML_ASSERT(!(split && ne02 > 1));
|
||||
GGML_ASSERT(!(split && ne03 > 1));
|
||||
GGML_ASSERT(!(split && ne02 < ne12));
|
||||
@@ -12717,8 +12801,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
|
||||
used_devices++;
|
||||
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
||||
|
||||
ggml_sycl_set_device(get_device_id_by_index(id));
|
||||
const dpct::queue_ptr stream = g_syclStreams[id][0];
|
||||
@@ -12782,8 +12866,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
||||
const int64_t row_diff = row_high[id] - row_low[id];
|
||||
|
||||
ggml_sycl_set_device(get_device_id_by_index(id));
|
||||
@@ -12809,12 +12893,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
|
||||
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
||||
// in that case an offset on dst_ddf_i is needed
|
||||
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
|
||||
dst_dd_i += row_low[id]; // offset is 0 if no tensor split
|
||||
}
|
||||
|
||||
// copy src0, src1 to device if necessary
|
||||
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
||||
if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
|
||||
if (id != g_main_device_index) {
|
||||
if (convert_src1_to_q8_1) {
|
||||
char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
|
||||
@@ -12830,14 +12914,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
src1_ncols * ne10 * sizeof(float))));
|
||||
}
|
||||
}
|
||||
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
||||
} else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
|
||||
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
|
||||
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
|
||||
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
||||
/*
|
||||
DPCT1010:92: SYCL uses exceptions to report errors and does
|
||||
@@ -12867,10 +12951,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
if (!dst_on_device) {
|
||||
void * dst_off_device;
|
||||
dpct::memcpy_direction kind;
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
dst_off_device = dst->data;
|
||||
kind = dpct::device_to_host;
|
||||
} else if (dst->backend == GGML_BACKEND_GPU) {
|
||||
} else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
dst_off_device = dst_extra->data_device[g_main_device_index];
|
||||
kind = dpct::device_to_device;
|
||||
} else {
|
||||
@@ -12954,7 +13038,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
}
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::get_current_device().queues_wait_and_throw()));
|
||||
@@ -13091,7 +13175,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
|
||||
const ggml_tensor *src1,
|
||||
ggml_tensor *dst) try {
|
||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
@@ -13129,7 +13213,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -13196,7 +13280,7 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -13372,11 +13456,11 @@ catch (sycl::exception const &exc) {
|
||||
|
||||
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool all_on_device =
|
||||
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||
(src1->backend == GGML_BACKEND_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_GPU);
|
||||
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
|
||||
(src1->backend == GGML_BACKEND_TYPE_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||
|
||||
int64_t min_compute_capability = INT_MAX;
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
@@ -13505,7 +13589,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
|
||||
GGML_ASSERT(!ggml_is_transposed(src00));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
|
||||
@@ -13643,7 +13727,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||
|
||||
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
||||
|
||||
if (ids->backend == GGML_BACKEND_GPU) {
|
||||
if (ids->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
||||
@@ -13661,20 +13745,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
src1_row.backend = GGML_BACKEND_GPU;
|
||||
dst_row.backend = GGML_BACKEND_GPU;
|
||||
src1_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
dst_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
src1_row.extra = &src1_row_extra;
|
||||
dst_row.extra = &dst_row_extra;
|
||||
|
||||
char * src1_original = src1->backend == GGML_BACKEND_CPU ?
|
||||
char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
|
||||
(char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
|
||||
char * dst_original = dst->backend == GGML_BACKEND_CPU ?
|
||||
char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
|
||||
(char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
|
||||
|
||||
if (src1->ne[1] == 1) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
||||
//int32_t row_id;
|
||||
@@ -13756,7 +13840,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||
}
|
||||
}
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
|
||||
}
|
||||
}
|
||||
@@ -13779,8 +13863,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
||||
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
||||
@@ -13887,17 +13971,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
|
||||
if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
|
||||
continue;
|
||||
}
|
||||
ggml_sycl_set_device(get_device_id_by_index(id));
|
||||
const dpct::queue_ptr stream = g_syclStreams[id][0];
|
||||
|
||||
int64_t row_low, row_high;
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
if (backend == GGML_BACKEND_TYPE_GPU) {
|
||||
row_low = 0;
|
||||
row_high = nrows;
|
||||
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
} else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
const int64_t rounding = get_row_rounding(tensor->type);
|
||||
|
||||
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
||||
@@ -13946,7 +14030,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
||||
|
||||
extra->data_device[id] = buf;
|
||||
|
||||
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
|
||||
new sycl::event()));
|
||||
@@ -13963,7 +14047,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
|
||||
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
||||
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -14016,15 +14100,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||
return;
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
const ggml_op src0_op = tensor->src[0]->op;
|
||||
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
||||
ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
||||
}
|
||||
}
|
||||
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
||||
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
||||
}
|
||||
|
||||
@@ -14042,7 +14126,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
|
||||
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
||||
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
||||
|
||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
||||
size_t offset = 0;
|
||||
@@ -14111,7 +14195,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
|
||||
|
||||
const bool inplace = tensor->view_src != nullptr;
|
||||
|
||||
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||
if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
||||
size_t view_offset = 0;
|
||||
@@ -14132,7 +14216,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
@@ -14219,9 +14303,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
if (!g_sycl_loaded) return false;
|
||||
|
||||
ggml_sycl_func_t func;
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
||||
return false;
|
||||
@@ -14359,14 +14443,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
||||
ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
|
||||
}
|
||||
|
||||
if (params->ith != 0) {
|
||||
return true;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return true;
|
||||
}
|
||||
func(tensor->src[0], tensor->src[1], tensor);
|
||||
@@ -14517,7 +14601,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
|
||||
extra->data_device[ctx->device] = tensor->data;
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
tensor->extra = extra;
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
@@ -14548,7 +14632,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
@@ -14573,7 +14657,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *tensor,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
@@ -14809,7 +14893,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
||||
(char *)tensor->data + offset, data, size)));
|
||||
@@ -14827,7 +14911,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
||||
data, (const char *)tensor->data + offset, size)));
|
||||
@@ -14880,7 +14964,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
|
||||
ggml_sycl_set_main_device(sycl_ctx->device);
|
||||
|
||||
ggml_compute_params params = {};
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
params.ith = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
@@ -14888,13 +14972,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
|
||||
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
||||
continue;
|
||||
|
||||
assert(node->backend == GGML_BACKEND_GPU);
|
||||
assert(node->backend == GGML_BACKEND_TYPE_GPU);
|
||||
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
||||
assert(node->extra != nullptr);
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] != nullptr) {
|
||||
assert(node->src[j]->backend == GGML_BACKEND_GPU);
|
||||
assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
||||
assert(node->src[j]->extra != nullptr);
|
||||
}
|
||||
|
||||
+51
-51
@@ -2320,8 +2320,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
@@ -2453,7 +2453,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
||||
// compute
|
||||
ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data);
|
||||
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
|
||||
@@ -2506,8 +2506,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
@@ -2630,7 +2630,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -2647,7 +2647,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
|
||||
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
@@ -2679,7 +2679,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
const uint64_t x_ne = ne00 * ne01 * ne02;
|
||||
const uint64_t y_ne = ne10 * ne11 * ne12;
|
||||
@@ -2721,7 +2721,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -2738,7 +2738,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -2771,7 +2771,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
const uint64_t d_ne = ne01 * ne11 * ne12;
|
||||
|
||||
@@ -2814,7 +2814,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -2832,7 +2832,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
|
||||
dst->type == GGML_TYPE_F32 &&
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU);
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
}
|
||||
|
||||
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
@@ -2880,8 +2880,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
||||
// TODO: support for transposed / permuted tensors
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
||||
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
@@ -3110,8 +3110,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
}
|
||||
}
|
||||
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
||||
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
@@ -3120,7 +3120,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
vk_buffer d_D = extra->buffer_gpu.lock();
|
||||
|
||||
// Workaround for tiny tensor inputs on ROPE
|
||||
if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) {
|
||||
if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
|
||||
@@ -3209,9 +3209,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
}
|
||||
if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
|
||||
ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
|
||||
} else if(dst->backend == GGML_BACKEND_CPU) {
|
||||
} else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
|
||||
@@ -3253,7 +3253,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
}
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
// copy dst to host
|
||||
ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
|
||||
}
|
||||
@@ -3359,7 +3359,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
||||
|
||||
static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
// If backend is CPU, data from src0 has to be copied off the device
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
vk_buffer d_D = extra_src0->buffer_gpu.lock();
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
@@ -3994,9 +3994,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
|
||||
#endif
|
||||
const bool any_on_device = node->backend == GGML_BACKEND_GPU
|
||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
|
||||
const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
|
||||
|
||||
if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
|
||||
return;
|
||||
@@ -4215,9 +4215,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||
}
|
||||
|
||||
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
|
||||
const bool any_on_device = node->backend == GGML_BACKEND_GPU
|
||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
|
||||
const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
|
||||
return;
|
||||
@@ -4371,7 +4371,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
last_node = true;
|
||||
#endif
|
||||
|
||||
if (node->backend == GGML_BACKEND_CPU || last_node) {
|
||||
if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
|
||||
ggml_vk_ctx_end(ctx->compute_ctx);
|
||||
ctx->compute_ctx->exit_tensor = node;
|
||||
ctx->compute_ctx = nullptr;
|
||||
@@ -4379,9 +4379,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
}
|
||||
|
||||
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|
||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
|
||||
return false;
|
||||
@@ -4442,7 +4442,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (params->ith != 0) {
|
||||
return true;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -4745,7 +4745,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
||||
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
||||
tensor->extra = extra;
|
||||
}
|
||||
|
||||
@@ -4753,7 +4753,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
|
||||
@@ -4768,7 +4768,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
|
||||
@@ -4999,7 +4999,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
||||
#endif
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
@@ -5020,7 +5020,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
||||
#endif
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
@@ -5097,7 +5097,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
||||
int last_node = cgraph->n_nodes - 1;
|
||||
|
||||
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
||||
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
|
||||
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
|
||||
last_node -= 1;
|
||||
}
|
||||
|
||||
@@ -5106,7 +5106,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
||||
}
|
||||
|
||||
ggml_compute_params params = {};
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
params.ith = 0;
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
@@ -5410,7 +5410,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
||||
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
@@ -5436,14 +5436,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(tensor, done);
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
free(tensor_data);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
return;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
@@ -5481,7 +5481,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5518,10 +5518,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
src0_buffer = malloc(src0_size);
|
||||
src0_clone->data = src0_buffer;
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
memcpy(src0_clone->data, src0->data, src0_size);
|
||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
uint64_t offset = extra->offset;
|
||||
if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
|
||||
@@ -5561,10 +5561,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
src1_buffer = malloc(src1_size);
|
||||
src1_clone->data = src1_buffer;
|
||||
if (src1->backend == GGML_BACKEND_CPU) {
|
||||
if (src1->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
memcpy(src1_clone->data, src1->data, src1_size);
|
||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (src1->backend == GGML_BACKEND_GPU) {
|
||||
} else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
uint64_t offset = extra->offset;
|
||||
if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
|
||||
@@ -5723,7 +5723,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
|
||||
return;
|
||||
}
|
||||
if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
|
||||
@@ -5735,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
@@ -5868,7 +5868,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
comp_result = nullptr;
|
||||
comp_size = 0;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
free(tensor_data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -315,13 +315,7 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||
typedef half ggml_fp16_t;
|
||||
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
#else
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
#endif
|
||||
|
||||
// convert FP16 <-> FP32
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
@@ -356,6 +350,8 @@ extern "C" {
|
||||
GGML_TYPE_IQ3_XXS = 18,
|
||||
GGML_TYPE_IQ1_S = 19,
|
||||
GGML_TYPE_IQ4_NL = 20,
|
||||
GGML_TYPE_IQ3_S = 21,
|
||||
GGML_TYPE_IQ2_S = 22,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
@@ -369,9 +365,9 @@ extern "C" {
|
||||
};
|
||||
|
||||
enum ggml_backend_type {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_GPU = 10,
|
||||
GGML_BACKEND_GPU_SPLIT = 20,
|
||||
GGML_BACKEND_TYPE_CPU = 0,
|
||||
GGML_BACKEND_TYPE_GPU = 10,
|
||||
GGML_BACKEND_TYPE_GPU_SPLIT = 20,
|
||||
};
|
||||
|
||||
// model file types
|
||||
@@ -395,6 +391,8 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
@@ -502,9 +500,9 @@ extern "C" {
|
||||
};
|
||||
|
||||
enum ggml_object_type {
|
||||
GGML_OBJECT_TENSOR,
|
||||
GGML_OBJECT_GRAPH,
|
||||
GGML_OBJECT_WORK_BUFFER
|
||||
GGML_OBJECT_TYPE_TENSOR,
|
||||
GGML_OBJECT_TYPE_GRAPH,
|
||||
GGML_OBJECT_TYPE_WORK_BUFFER
|
||||
};
|
||||
|
||||
enum ggml_log_level {
|
||||
@@ -646,9 +644,9 @@ extern "C" {
|
||||
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
||||
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
|
||||
enum ggml_task_type {
|
||||
GGML_TASK_INIT = 0,
|
||||
GGML_TASK_COMPUTE,
|
||||
GGML_TASK_FINALIZE,
|
||||
GGML_TASK_TYPE_INIT = 0,
|
||||
GGML_TASK_TYPE_COMPUTE,
|
||||
GGML_TASK_TYPE_FINALIZE,
|
||||
};
|
||||
|
||||
struct ggml_compute_params {
|
||||
@@ -1653,8 +1651,8 @@ extern "C" {
|
||||
|
||||
// sort rows
|
||||
enum ggml_sort_order {
|
||||
GGML_SORT_ASC,
|
||||
GGML_SORT_DESC,
|
||||
GGML_SORT_ORDER_ASC,
|
||||
GGML_SORT_ORDER_DESC,
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_argsort(
|
||||
@@ -1947,8 +1945,8 @@ extern "C" {
|
||||
|
||||
// optimization methods
|
||||
enum ggml_opt_type {
|
||||
GGML_OPT_ADAM,
|
||||
GGML_OPT_LBFGS,
|
||||
GGML_OPT_TYPE_ADAM,
|
||||
GGML_OPT_TYPE_LBFGS,
|
||||
};
|
||||
|
||||
// linesearch methods
|
||||
@@ -1962,12 +1960,12 @@ extern "C" {
|
||||
|
||||
// optimization return values
|
||||
enum ggml_opt_result {
|
||||
GGML_OPT_OK = 0,
|
||||
GGML_OPT_DID_NOT_CONVERGE,
|
||||
GGML_OPT_NO_CONTEXT,
|
||||
GGML_OPT_INVALID_WOLFE,
|
||||
GGML_OPT_FAIL,
|
||||
GGML_OPT_CANCEL,
|
||||
GGML_OPT_RESULT_OK = 0,
|
||||
GGML_OPT_RESULT_DID_NOT_CONVERGE,
|
||||
GGML_OPT_RESULT_NO_CONTEXT,
|
||||
GGML_OPT_RESULT_INVALID_WOLFE,
|
||||
GGML_OPT_RESULT_FAIL,
|
||||
GGML_OPT_RESULT_CANCEL,
|
||||
|
||||
GGML_LINESEARCH_FAIL = -128,
|
||||
GGML_LINESEARCH_MINIMUM_STEP,
|
||||
|
||||
@@ -64,6 +64,15 @@ extern "C" {
|
||||
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
||||
enum llama_rope_type {
|
||||
LLAMA_ROPE_TYPE_NONE = -1,
|
||||
LLAMA_ROPE_TYPE_NORM = 0,
|
||||
LLAMA_ROPE_TYPE_NEOX = 2,
|
||||
LLAMA_ROPE_TYPE_GLM = 4,
|
||||
};
|
||||
|
||||
enum llama_token_type {
|
||||
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
||||
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
||||
@@ -98,32 +107,36 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
||||
enum llama_rope_scaling_type {
|
||||
LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
|
||||
LLAMA_ROPE_SCALING_NONE = 0,
|
||||
LLAMA_ROPE_SCALING_LINEAR = 1,
|
||||
LLAMA_ROPE_SCALING_YARN = 2,
|
||||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
|
||||
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
|
||||
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
|
||||
LLAMA_ROPE_SCALING_TYPE_YARN = 2,
|
||||
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
|
||||
};
|
||||
|
||||
enum llama_pooling_type {
|
||||
LLAMA_POOLING_NONE = 0,
|
||||
LLAMA_POOLING_MEAN = 1,
|
||||
LLAMA_POOLING_CLS = 2,
|
||||
LLAMA_POOLING_TYPE_NONE = 0,
|
||||
LLAMA_POOLING_TYPE_MEAN = 1,
|
||||
LLAMA_POOLING_TYPE_CLS = 2,
|
||||
};
|
||||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
||||
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
||||
};
|
||||
|
||||
typedef struct llama_token_data {
|
||||
@@ -171,9 +184,9 @@ extern "C" {
|
||||
} llama_batch;
|
||||
|
||||
enum llama_model_kv_override_type {
|
||||
LLAMA_KV_OVERRIDE_INT,
|
||||
LLAMA_KV_OVERRIDE_FLOAT,
|
||||
LLAMA_KV_OVERRIDE_BOOL,
|
||||
LLAMA_KV_OVERRIDE_TYPE_INT,
|
||||
LLAMA_KV_OVERRIDE_TYPE_FLOAT,
|
||||
LLAMA_KV_OVERRIDE_TYPE_BOOL,
|
||||
};
|
||||
|
||||
struct llama_model_kv_override {
|
||||
@@ -232,6 +245,7 @@ extern "C" {
|
||||
float yarn_beta_fast; // YaRN low correction dim
|
||||
float yarn_beta_slow; // YaRN high correction dim
|
||||
uint32_t yarn_orig_ctx; // YaRN original context size
|
||||
float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
@@ -358,6 +372,7 @@ extern "C" {
|
||||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
@@ -512,10 +527,12 @@ extern "C" {
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_shift(
|
||||
LLAMA_API void llama_kv_cache_seq_add(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
@@ -523,7 +540,9 @@ extern "C" {
|
||||
llama_pos delta);
|
||||
|
||||
// Integer division of the positions by factor of `d > 1`
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_div(
|
||||
@@ -533,6 +552,20 @@ extern "C" {
|
||||
llama_pos p1,
|
||||
int d);
|
||||
|
||||
// Returns the largest position present in the KV cache for the specified sequence
|
||||
LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id);
|
||||
|
||||
// Defragment the KV cache
|
||||
// This will be applied:
|
||||
// - lazily on next llama_decode()
|
||||
// - explicitly with llama_kv_cache_update()
|
||||
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
|
||||
|
||||
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
||||
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// State / sessions
|
||||
//
|
||||
|
||||
@@ -1 +1 @@
|
||||
30805514e1bf389a59d30a54a0525cbdc30d5bd1
|
||||
8cdf783f288a98eddf521b0ab1b4d405be9e18ba
|
||||
|
||||
@@ -1264,7 +1264,7 @@ struct test_argsort : public test_case {
|
||||
|
||||
test_argsort(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {16, 10, 10, 10},
|
||||
ggml_sort_order order = GGML_SORT_ASC)
|
||||
ggml_sort_order order = GGML_SORT_ORDER_ASC)
|
||||
: type(type), ne(ne), order(order) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
@@ -1916,9 +1916,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q6_K,
|
||||
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
|
||||
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
|
||||
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S,
|
||||
GGML_TYPE_IQ4_NL,
|
||||
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S,
|
||||
};
|
||||
|
||||
// unary ops
|
||||
@@ -2116,7 +2116,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
|
||||
|
||||
for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
|
||||
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
|
||||
}
|
||||
|
||||
@@ -29,6 +29,8 @@ int main(void) {
|
||||
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
||||
// mlabonne/AlphaMonarch-7B
|
||||
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
|
||||
// google/gemma-7b-it
|
||||
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
|
||||
};
|
||||
std::vector<std::string> expected_output = {
|
||||
// teknium/OpenHermes-2.5-Mistral-7B
|
||||
@@ -41,6 +43,8 @@ int main(void) {
|
||||
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
|
||||
// mlabonne/AlphaMonarch-7B
|
||||
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
|
||||
// google/gemma-7b-it
|
||||
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
|
||||
};
|
||||
std::vector<char> formatted_chat(1024);
|
||||
int32_t res;
|
||||
|
||||
+1
-1
@@ -118,7 +118,7 @@ int main(void) {
|
||||
const float fe = ggml_get_f32_1d(e, 0);
|
||||
printf("%s: e = %.4f\n", __func__, fe);
|
||||
|
||||
struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
|
||||
ggml_opt(ctx, opt_params, e);
|
||||
|
||||
|
||||
@@ -150,7 +150,9 @@ int main(int argc, char * argv[]) {
|
||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||
const float max_quantization_error =
|
||||
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
|
||||
failed = !(total_error < max_quantization_error);
|
||||
num_failed += failed;
|
||||
@@ -167,7 +169,9 @@ int main(int argc, char * argv[]) {
|
||||
|
||||
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
||||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ3_XXS ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR;
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
||||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
: MAX_DOT_PRODUCT_ERROR;
|
||||
failed = !(vec_dot_error < max_allowed_error);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
|
||||
@@ -404,7 +404,8 @@ static std::unordered_map<uint32_t, int> codepoint_type_map() {
|
||||
|
||||
static int codepoint_type(uint32_t cp) {
|
||||
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
||||
return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
|
||||
const auto it = codepoint_types.find(cp);
|
||||
return it == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
|
||||
}
|
||||
|
||||
static int codepoint_type(const std::string & utf8) {
|
||||
|
||||
Reference in New Issue
Block a user