Compare commits

..

20 Commits

Author SHA1 Message Date
Georgi Gerganov b5af7ad84f llama : refactor quantization to avoid <mutex> header
ggml-ci
2024-01-02 15:56:57 +02:00
postmasters 83e633c27e llama : differentiate the KV dims in the attention (#4657)
* Add n_key_dim and n_value_dim

Some models use values that are not derived from `n_embd`.
Also remove `n_embd_head` and `n_embd_gqa` because it is not clear
which "head" is referred to (key or value).

Fix issue #4648.

* Fix `llm_build_kqv` to use `n_value_gqa`

* Rebase

* Rename variables

* Fix llm_build_kqv to be more generic wrt n_embd_head_k

* Update default values for n_embd_head_k and n_embd_head_v

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix llm_load_tensors: the asserts were not backcompat

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-02 13:51:28 +02:00
Georgi Gerganov 32866c5edd editorconfig : fix whitespace and indentation #4710 2024-01-02 13:28:15 +02:00
minarchist 5d7002d437 server : add --override-kv parameter (#4710)
* Changes to server to allow metadata override

* documentation

* flake.nix: expose full scope in legacyPackages

* flake.nix: rocm not yet supported on aarch64, so hide the output

* flake.nix: expose checks

* workflows: nix-ci: init; build flake outputs

* workflows: nix-ci: add a job for eval

* workflows: weekly `nix flake update`

* workflows: nix-flakestry: drop tag filters

...and add a job for flakehub.com

* workflows: nix-ci: add a qemu job for jetsons

* flake.nix: suggest the binary caches

* flake.lock: update

to a commit recently cached by nixpkgs-cuda-ci

---------

Co-authored-by: John <john@jLap.lan>
Co-authored-by: Someone Serge <sergei.kozlukov@aalto.fi>
2024-01-02 12:38:15 +02:00
Nam D. Tran 26f3071d71 py : re-enable mmap in convert hf (#4732)
* update: awq support llama-7b model

* update: change order

* update: benchmark results for llama2-7b

* update: mistral 7b v1 benchmark

* update: support 4 models

* fix: Readme

* update: ready for PR

* update: readme

* fix: readme

* update: change order import

* black

* format code

* update: work for bot mpt and awqmpt

* update: readme

* Rename to llm_build_ffn_mpt_awq

* Formatted other files

* Fixed params count

* fix: remove code

* update: more detail for mpt

* fix: readme

* fix: readme

* update: change folder architecture

* fix: common.cpp

* fix: readme

* fix: remove ggml_repeat

* update: cicd

* update: cicd

* uppdate: remove use_awq arg

* update: readme

* llama : adapt plamo to new ffn

ggml-ci

* fix: update torch version

---------

Co-authored-by: Trần Đức Nam <v.namtd12@vinai.io>
Co-authored-by: Le Hoang Anh <v.anhlh33@vinai.io>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-02 11:23:38 +02:00
Daniel Bevenius 775ac8712a finetune: fix typo in README.md (#4733)
Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2024-01-02 10:16:55 +01:00
Georgi Gerganov 58ba655af0 metal : enable shader debugging (cmake option) (#4705)
* ggml : disable fast-math for Metal (cmake build only)

ggml-ci

* metal : fix Metal API debug warnings

* cmake : add -fno-inline for Metal build (#4545)

* metal : fix API debug warnings

* metal : fix compile warnings

* metal : use uint64_t for strides

* cmake : rename option to LLAMA_METAL_SHADER_DEBUG

* metal : fix mat-vec Q8_0 kernel for BS > 1

* metal : normalize mat-vec kernel signatures

* cmake : respect LLAMA_QKK_64 option

* metal : fix mat-vec Q4_K kernel for QK_K == 64

ggml-ci
2024-01-02 10:57:44 +02:00
Someone Serge edd1ab7bc3 flake.lock: update
to a commit recently cached by nixpkgs-cuda-ci
2023-12-31 13:14:58 -08:00
Someone Serge 198ed7ebfc flake.nix: suggest the binary caches 2023-12-31 13:14:58 -08:00
Someone Serge d836174731 workflows: nix-ci: add a qemu job for jetsons 2023-12-31 13:14:58 -08:00
Someone Serge 06f2a5d190 workflows: nix-flakestry: drop tag filters
...and add a job for flakehub.com
2023-12-31 13:14:58 -08:00
Someone Serge c5239944ba workflows: weekly nix flake update 2023-12-31 13:14:58 -08:00
Someone Serge 1e9ae54cf2 workflows: nix-ci: add a job for eval 2023-12-31 13:14:58 -08:00
Someone Serge 7adedecbe3 workflows: nix-ci: init; build flake outputs 2023-12-31 13:14:58 -08:00
Someone Serge 356ea17e0f flake.nix: expose checks 2023-12-31 13:14:58 -08:00
Someone Serge a5c088d8c6 flake.nix: rocm not yet supported on aarch64, so hide the output 2023-12-31 13:14:58 -08:00
Someone Serge 1e3900ebac flake.nix: expose full scope in legacyPackages 2023-12-31 13:14:58 -08:00
Georgi Gerganov e39106c055 ggml : add ggml_vdotq_s32 alias (#4715)
ggml-ci
2023-12-31 11:43:31 +02:00
Georgi Gerganov 9fbda719de clip : refactor + bug fixes (#4696)
* clip : refactor + bug fixes

ggml-ci

* server : add log message
2023-12-30 23:24:42 +02:00
Johannes Gäßler 39d8bc71ed CUDA: fixed tensor cores not being used on RDNA3 (#4697) 2023-12-30 13:52:01 +01:00
25 changed files with 1115 additions and 643 deletions
+13 -6
View File
@@ -8,12 +8,13 @@
pkgsCuda,
...
}:
lib.optionalAttrs (system == "aarch64-linux") {
packages =
{
legacyPackages =
let
caps.jetson-xavier = "7.2";
caps.jetson-orin = "8.7";
caps.jetson-nano = "5.3";
caps.llamaPackagesXavier = "7.2";
caps.llamaPackagesOrin = "8.7";
caps.llamaPackagesTX2 = "6.2";
caps.llamaPackagesNano = "5.3";
pkgsFor =
cap:
@@ -27,6 +28,12 @@
};
};
in
builtins.mapAttrs (name: cap: ((pkgsFor cap).callPackage ./scope.nix { }).llama-cpp) caps;
builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
packages = lib.optionalAttrs (system == "aarch64-linux") {
jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
};
};
}
-1
View File
@@ -515,7 +515,6 @@ jobs:
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
# freeBSD-latest:
# runs-on: macos-12
# steps:
+112
View File
@@ -0,0 +1,112 @@
name: Nix CI
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs:
nix-eval:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: List all flake outputs
run: nix flake show --all-systems
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build:
if: ${{ vars.CACHIX_NAME != '' }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--flake
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build-aarch64:
if: ${{ vars.CACHIX_NAME != '' }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"
+22
View File
@@ -0,0 +1,22 @@
name: update-flake-lock
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
jobs:
lockfile:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock
uses: DeterminateSystems/update-flake-lock@main
with:
pr-title: "nix: update flake.lock"
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.GITHUB_TOKEN }}
-23
View File
@@ -1,23 +0,0 @@
# Make the flake discoverable on https://flakestry.dev
name: "Publish a flake to flakestry"
on:
push:
tags:
- "v?[0-9]+.[0-9]+.[0-9]+"
- "v?[0-9]+.[0-9]+"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
publish-flake:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"
+36
View File
@@ -0,0 +1,36 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
push:
tags:
- "*"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
flakestry-publish:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"
flakehub-publish:
runs-on: "ubuntu-latest"
permissions:
id-token: "write"
contents: "read"
steps:
- uses: "actions/checkout@v4"
with:
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
- uses: "DeterminateSystems/nix-installer-action@main"
- uses: "DeterminateSystems/flakehub-push@main"
with:
visibility: "public"
tag: "${{ inputs.tag }}"
+31 -3
View File
@@ -95,6 +95,7 @@ option(LLAMA_HIP_UMA "llama: use HIP unified memory arch
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
@@ -154,9 +155,9 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found")
set(GGML_HEADERS_METAL ggml-metal.h)
@@ -173,6 +174,33 @@ if (LLAMA_METAL)
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
if (LLAMA_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o ggml.metallib
#
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
set(XC_FLAGS -fno-fast-math -fno-inline -g)
if (LLAMA_QKK_64)
set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
endif()
add_custom_command(
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
DEPENDS ggml-metal.metal
COMMENT "Compiling Metal kernels"
)
add_custom_target(
ggml-metal ALL
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
+1 -1
View File
@@ -1,2 +1,2 @@
torch>=2.0.0
torch>=2.1.1
transformers>=4.32.0
+10 -4
View File
@@ -30,6 +30,12 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA=""
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
fi
## helpers
# download a file if it does not exist or if it is outdated
@@ -81,8 +87,8 @@ function gg_run_ctest_debug {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -109,8 +115,8 @@ function gg_run_ctest_release {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+1 -1
View File
@@ -59,7 +59,7 @@ class Model:
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
else:
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True))
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
with ctx as model_part:
for name in model_part.keys():
+1 -1
View File
@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
+134 -107
View File
@@ -146,6 +146,27 @@ static std::string get_ftype(int ftype) {
}
}
//
// image data
//
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
//
// clip layers
//
@@ -204,16 +225,21 @@ struct clip_vision_model {
};
struct clip_ctx {
bool has_text_encoder = false;
bool has_vision_encoder = false;
bool has_text_encoder = false;
bool has_vision_encoder = false;
bool has_llava_projector = false;
struct clip_vision_model vision_model;
float image_mean[3];
float image_std[3];
bool use_gelu = false;
int32_t ftype = 1;
struct ggml_context * ctx;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = NULL;
@@ -222,7 +248,7 @@ struct clip_ctx {
ggml_allocr * compute_alloc = NULL;
};
static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return nullptr;
@@ -243,13 +269,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
//const int projection_dim = hparams.projection_dim;
const float eps = hparams.eps;
int batch_size = imgs->size;
if(ctx->has_llava_projector) {
if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1);
}
struct ggml_init_params params = {
/*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params);
@@ -272,7 +299,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
for (int k = 0; k < 3; k++) {
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
}
}
}
@@ -413,7 +440,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
ggml_allocr_alloc(ctx->compute_alloc, patches);
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_positions; i++) {
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
@@ -561,8 +588,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
/*.no_alloc =*/ true,
};
new_clip->ctx = ggml_init(params);
if (!new_clip->ctx) {
new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
clip_free(new_clip);
return nullptr;
@@ -579,7 +606,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * t = ggml_get_tensor(meta, name);
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
ggml_set_name(cur, name);
}
@@ -588,7 +615,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx, name);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
ggml_allocr_alloc(alloc, cur);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
@@ -617,20 +644,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// load vision model
auto & vision_model = new_clip->vision_model;
auto & hparams = vision_model.hparams;
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
for (int i = 0; i < 3; ++i) {
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
}
if (verbosity >= 2) {
@@ -644,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("v_n_layer %d\n", hparams.n_layer);
}
vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = vision_model.layers[il];
layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
}
}
@@ -680,8 +707,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->ctx_gguf = ctx;
// measure mem requirement and allocate
// measure mem requirement and allocate
{
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
clip_image_f32_batch batch;
batch.size = 1;
@@ -697,26 +725,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return new_clip;
}
clip_image_u8 * make_clip_image_u8() {
auto img = new clip_image_u8();
return img;
struct clip_image_u8 * clip_image_u8_init() {
return new clip_image_u8();
}
clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
struct clip_image_f32 * clip_image_f32_init() {
return new clip_image_f32();
}
void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->size = nx * ny * 3;
img->data = new uint8_t[img->size]();
memcpy(img->data, data, img->size);
img->buf.resize(3 * nx * ny);
memcpy(img->buf.data(), data, img->buf.size());
}
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc;
auto data = stbi_load(fname, &nx, &ny, &nc, 3);
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
return false;
@@ -728,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
int nx, ny, nc;
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
return false;
@@ -740,7 +769,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
// normalize: x = (x - mean) / std
// TODO: implement bicubic interpolation instead of linear.
bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return false;
@@ -749,18 +778,17 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
if (pad2square && img->nx != img->ny) {
int longer_side = std::max(img->nx, img->ny);
temp->nx = longer_side;
temp->ny = longer_side;
temp->size = 3 * longer_side * longer_side;
temp->data = new uint8_t[temp->size]();
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
temp->buf.resize(3 * longer_side * longer_side);
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
// fill with background color
for (size_t i = 0; i < temp->size; i++) {
temp->data[i] = bc[i % 3];
for (size_t i = 0; i < temp->buf.size(); i++) {
temp->buf[i] = bc[i % 3];
}
// copy from the input image
@@ -768,17 +796,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
for (int x = 0; x < img->nx; x++) {
const int i = 3 * (y * img->nx + x);
const int j = 3 * (y * temp->nx + x);
temp->data[j] = img->data[i];
temp->data[j+1] = img->data[i+1];
temp->data[j+2] = img->data[i+2];
temp->buf[j] = img->buf[i];
temp->buf[j+1] = img->buf[i+1];
temp->buf[j+2] = img->buf[i+2];
}
}
} else {
temp->nx = img->nx;
temp->ny = img->ny;
temp->size = img->size;
temp->data = new uint8_t[temp->size]();
memcpy(&temp->data[0], &img->data[0], temp->size); // copy
temp->nx = img->nx;
temp->ny = img->ny;
temp->buf.resize(img->buf.size());
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
}
const int nx = temp->nx;
@@ -789,8 +816,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
res->nx = nx2;
res->ny = ny2;
res->size = 3 * nx2 * ny2;
res->data = new float[res->size]();
res->buf.resize(3 * nx2 * ny2);
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
@@ -821,10 +847,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
const int j10 = 3 * (y1 * nx + x0) + c;
const int j11 = 3 * (y1 * nx + x1) + c;
const float v00 = temp->data[j00];
const float v01 = temp->data[j01];
const float v10 = temp->data[j10];
const float v11 = temp->data[j11];
const float v00 = temp->buf[j00];
const float v01 = temp->buf[j01];
const float v10 = temp->buf[j10];
const float v11 = temp->buf[j11];
const float v0 = v00 * (1.0f - dx) + v01 * dx;
const float v1 = v10 * (1.0f - dx) + v11 * dx;
@@ -835,7 +861,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
const int i = 3 * (y * nx3 + x) + c;
res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
}
}
}
@@ -845,12 +871,13 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
}
void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx);
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
delete ctx;
}
bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return false;
@@ -862,8 +889,7 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
}
bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return false;
@@ -906,31 +932,32 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
ggml_type type = GGML_TYPE_Q4_1;
switch (itype) {
case 2:
type = GGML_TYPE_Q4_0;
break;
case 3:
type = GGML_TYPE_Q4_1;
break;
case 6:
type = GGML_TYPE_Q5_0;
break;
case 7:
type = GGML_TYPE_Q5_1;
break;
case 8:
type = GGML_TYPE_Q8_0;
break;
default:
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
return false;
case 2:
type = GGML_TYPE_Q4_0;
break;
case 3:
type = GGML_TYPE_Q4_1;
break;
case 6:
type = GGML_TYPE_Q5_0;
break;
case 7:
type = GGML_TYPE_Q5_1;
break;
case 8:
type = GGML_TYPE_Q8_0;
break;
default:
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
return false;
};
auto ctx_clip = clip_model_load(fname_inp, 2);
const auto & ctx_src = ctx_clip->ctx_gguf;
const auto & ctx_data = ctx_clip->ctx;
auto * ctx_clip = clip_model_load(fname_inp, 2);
auto ctx_out = gguf_init_empty();
const auto & ctx_src = ctx_clip->ctx_gguf;
const auto & ctx_data = ctx_clip->ctx_data;
auto * ctx_out = gguf_init_empty();
gguf_set_kv(ctx_out, ctx_src);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", itype);
+16 -32
View File
@@ -35,31 +35,14 @@ struct clip_vision_hparams {
float eps;
};
/** load mmproj model */
CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
/** free mmproj model */
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
int clip_n_patches(const struct clip_ctx * ctx);
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
uint8_t * data = NULL;
size_t size;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
float * data = NULL;
size_t size;
};
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
struct clip_image_u8_batch {
struct clip_image_u8 * data;
@@ -71,21 +54,22 @@ struct clip_image_f32_batch {
size_t size;
};
struct clip_image_u8 * make_clip_image_u8();
struct clip_image_f32 * make_clip_image_f32();
CLIP_API void clip_image_u8_free(clip_image_u8 * img);
CLIP_API void clip_image_f32_free(clip_image_f32 * img);
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
float * vec);
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
#ifdef __cplusplus
}
+2 -2
View File
@@ -10,7 +10,7 @@
#include "base64.hpp"
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
clip_image_f32 * img_res = make_clip_image_f32();
clip_image_f32 * img_res = clip_image_f32_init();
if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
clip_image_f32_free(img_res);
@@ -86,7 +86,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
}
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
clip_image_u8 * img = make_clip_image_u8();
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
+68 -21
View File
@@ -82,7 +82,7 @@ static inline bool is_base64(uint8_t c)
return (isalnum(c) || (c == '+') || (c == '/'));
}
static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
@@ -209,10 +209,10 @@ struct slot_image
int32_t id;
bool request_encode_image = false;
float* image_embedding = nullptr;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 img_data;
clip_image_u8 * img_data;
std::string prefix_prompt; // before of this image
};
@@ -434,10 +434,12 @@ struct llama_client_slot
generated_token_probs.clear();
for (slot_image &img : images)
for (slot_image & img : images)
{
free(img.image_embedding);
delete[] img.img_data.data;
if (img.img_data) {
clip_image_u8_free(img.img_data);
}
img.prefix_prompt = "";
}
@@ -851,24 +853,17 @@ struct llama_server_context
{
for (const auto &img : *images_data)
{
std::string data_b64 = img["data"].get<std::string>();
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
slot_image img_sl;
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
int width, height, channels;
std::vector<uint8_t> image_buffer = base64_decode(data_b64);
data_b64.clear();
auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
if (!data) {
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
return false;
}
LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
img_sl.img_data.nx = width;
img_sl.img_data.ny = height;
img_sl.img_data.size = width * height * 3;
img_sl.img_data.data = new uint8_t[width * height * 3]();
memcpy(img_sl.img_data.data, data, width * height * 3);
stbi_image_free(data);
LOG_TEE("slot %i - loaded image\n", slot->id);
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
@@ -1143,8 +1138,8 @@ struct llama_server_context
{
continue;
}
clip_image_f32 img_res;
if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
clip_image_f32 * img_res = clip_image_f32_init();
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
{
LOG_TEE("Error processing the given image");
clip_free(clp_ctx);
@@ -1159,11 +1154,12 @@ struct llama_server_context
return false;
}
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
{
LOG_TEE("Unable to encode image\n");
return false;
}
clip_image_f32_free(img_res);
img.request_encode_image = false;
}
@@ -2020,6 +2016,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
printf("\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf("\n");
}
static void server_params_parse(int argc, char **argv, server_params &sparams,
@@ -2383,6 +2383,49 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
}
else if (arg == "--override-kv")
{
if (++i >= argc) {
invalid_param = true;
break;
}
char * sep = strchr(argv[i], '=');
if (sep == nullptr || sep - argv[i] >= 128) {
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
struct llama_model_kv_override kvo;
std::strncpy(kvo.key, argv[i], sep - argv[i]);
kvo.key[sep - argv[i]] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
} else {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
params.kv_overrides.push_back(kvo);
}
else
{
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -2390,6 +2433,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
exit(1);
}
}
if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back(llama_model_kv_override());
params.kv_overrides.back().key[0] = 0;
}
if (invalid_param)
{
Generated
+3 -3
View File
@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1703559957,
"narHash": "sha256-x9PUuMEPGUOMB51zNxrDr2QoHbYWlCS2xhFedm9MC5Q=",
"lastModified": 1703637592,
"narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "75dd68c36f458c6593c5bbb48abfd3e59bfed380",
"rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
"type": "github"
},
"original": {
+48 -3
View File
@@ -6,6 +6,29 @@
flake-parts.url = "github:hercules-ci/flake-parts";
};
# Optional binary cache
nixConfig = {
extra-substituters = [
# Populated by the CI in ggerganov/llama.cpp
"https://llama-cpp.cachix.org"
# A development cache for nixpkgs imported with `config.cudaSupport = true`.
# Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
# This lets one skip building e.g. the CUDA-enabled openmpi.
# TODO: Replace once nix-community obtains an official one.
"https://cuda-maintainers.cachix.org"
];
# Verify these are the same keys as published on
# - https://app.cachix.org/cache/llama-cpp
# - https://app.cachix.org/cache/cuda-maintainers
extra-trusted-public-keys = [
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
];
};
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
#
# ```bash
@@ -74,26 +97,48 @@
{
config,
lib,
system,
pkgs,
pkgsCuda,
pkgsRocm,
...
}:
{
# Unlike `.#packages`, legacyPackages may contain values of
# arbitrary types (including nested attrsets) and may even throw
# exceptions. This attribute isn't recursed into by `nix flake
# show` either.
#
# You can add arbitrary scripts to `.devops/nix/scope.nix` and
# access them as `nix build .#llamaPackages.${scriptName}` using
# the same path you would with an overlay.
legacyPackages = {
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
};
# We don't use the overlay here so as to avoid making too many instances of nixpkgs,
# cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs
packages =
{
default = (pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
default = config.legacyPackages.llamaPackages.llama-cpp;
}
// lib.optionalAttrs pkgs.stdenv.isLinux {
opencl = config.packages.default.override { useOpenCL = true; };
cuda = (pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
rocm = (pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
mpi-cpu = config.packages.default.override { useMpi = true; };
mpi-cuda = config.packages.default.override { useMpi = true; };
}
// lib.optionalAttrs (system == "x86_64-linux") {
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
};
# Packages exposed in `.#checks` will be built by the CI and by
# `nix flake check`. Currently we expose all packages, but we could
# make more granular choices
checks = config.packages;
};
};
}
+24 -23
View File
@@ -119,10 +119,29 @@
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
#define GGML_CUDA_MAX_NODES 8192
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// - 7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ
// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif
// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
@@ -189,23 +208,6 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
}
#endif // defined(GGML_USE_HIPBLAS)
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// - 7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ
// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ) && (!defined(GGML_USE_HIPBLAS) || defined(RDNA3))
#define CUDA_USE_TENSOR_CORES
#endif
// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
@@ -8661,13 +8663,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
}
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
const bool fp16_performance_good = true;
#ifdef RDNA3
const bool use_mul_mat_q = false;
#else
const bool use_mul_mat_q = true;
#endif // RDNA3
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
bool use_mul_mat_q = ggml_is_quantized(src0->type);
#ifdef CUDA_USE_TENSOR_CORES
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES
#else
+16 -10
View File
@@ -257,13 +257,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif
NSError * error = nil;
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
if (libPath != nil) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
} else {
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
NSString * sourcePath;
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
@@ -291,8 +292,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
options = [MTLCompileOptions new];
options.preprocessorMacros = @{ @"QK_K" : @(64) };
#endif
// disable fast math
// try to disable fast-math
// NOTE: this seems to have no effect whatsoever
// instead, in order to disable fast-math, we have to build ggml.metallib from the command line
// using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
// and go through the "pre-compiled library found" path above
//[options setFastMathEnabled:false];
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
@@ -1234,7 +1238,7 @@ void ggml_metal_graph_compute(
// not sure how to avoid this
// TODO: make a simpler cpy_bytes kernel
const int nth = MIN(1024, ne00);
const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);
[encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1289,7 +1293,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];
const int nth = MIN(1024, ne0);
const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
@@ -1789,8 +1793,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
// TODO: how to make this an array? read Metal docs
for (int j = 0; j < n_as; ++j) {
struct ggml_tensor * src_cur = dst->src[2 + j];
for (int j = 0; j < 8; ++j) {
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -1913,8 +1918,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
// TODO: how to make this an array? read Metal docs
for (int j = 0; j < n_as; ++j) {
struct ggml_tensor * src_cur = dst->src[2 + j];
for (int j = 0; j < 8; ++j) {
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@@ -2233,7 +2239,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
const int nth = MIN(1024, ne0);
const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
+265 -210
View File
@@ -59,26 +59,26 @@ kernel void kernel_add(
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant int64_t & nb00,
constant int64_t & nb01,
constant int64_t & nb02,
constant int64_t & nb03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant int64_t & nb13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant int64_t & nb0,
constant int64_t & nb1,
constant int64_t & nb2,
constant int64_t & nb3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int64_t & offs,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
@@ -109,26 +109,26 @@ kernel void kernel_mul(
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant int64_t & nb00,
constant int64_t & nb01,
constant int64_t & nb02,
constant int64_t & nb03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant int64_t & nb13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant int64_t & nb0,
constant int64_t & nb1,
constant int64_t & nb2,
constant int64_t & nb3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
@@ -158,26 +158,26 @@ kernel void kernel_div(
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant int64_t & nb00,
constant int64_t & nb01,
constant int64_t & nb02,
constant int64_t & nb03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant int64_t & nb13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant int64_t & nb0,
constant int64_t & nb1,
constant int64_t & nb2,
constant int64_t & nb3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
@@ -205,7 +205,7 @@ kernel void kernel_add_row(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant int64_t & nb [[buffer(28)]],
constant uint64_t & nb [[buffer(28)]],
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig % nb];
}
@@ -214,7 +214,7 @@ kernel void kernel_mul_row(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant int64_t & nb [[buffer(28)]],
constant uint64_t & nb [[buffer(28)]],
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig % nb];
}
@@ -223,7 +223,7 @@ kernel void kernel_div_row(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant int64_t & nb [[buffer(28)]],
constant uint64_t & nb [[buffer(28)]],
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] / src1[tpig % nb];
}
@@ -307,26 +307,26 @@ kernel void kernel_sum_rows(
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant int64_t & nb00,
constant int64_t & nb01,
constant int64_t & nb02,
constant int64_t & nb03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant int64_t & nb13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant int64_t & nb0,
constant int64_t & nb1,
constant int64_t & nb2,
constant int64_t & nb3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tpig[[thread_position_in_grid]]) {
int64_t i3 = tpig.z;
int64_t i2 = tpig.y;
@@ -920,14 +920,21 @@ kernel void kernel_mul_mv_q4_0_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -939,14 +946,21 @@ kernel void kernel_mul_mv_q4_1_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -958,14 +972,21 @@ kernel void kernel_mul_mv_q5_0_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -977,14 +998,21 @@ kernel void kernel_mul_mv_q5_1_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -1071,12 +1099,19 @@ kernel void kernel_mul_mv_q8_0_f32(
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -1182,8 +1217,8 @@ kernel void kernel_mul_mv_f32_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
@@ -1209,8 +1244,8 @@ kernel void kernel_mul_mv_f16_f16(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
@@ -1346,8 +1381,8 @@ kernel void kernel_mul_mv_f16_f32_1row(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
@@ -1452,8 +1487,8 @@ kernel void kernel_mul_mv_f16_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
@@ -1478,8 +1513,8 @@ kernel void kernel_mul_mv_f16_f32_l4(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
@@ -1543,7 +1578,8 @@ kernel void kernel_alibi_f32(
const int64_t i3 = n / (ne2*ne1*ne0);
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
//const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
const int64_t k = i3*ne3 + i2;
float m_k;
@@ -2410,22 +2446,6 @@ typedef struct {
} block_q6_K;
// 210 bytes / block
static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
uchar4 r;
if (j < 4) {
r[0] = q[j+0] & 63;
r[2] = q[j+1] & 63;
r[1] = q[j+4] & 63;
r[3] = q[j+5] & 63;
} else {
r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4);
r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4);
}
return r;
}
//====================================== dot products =========================
void kernel_mul_mv_q2_K_f32_impl(
@@ -2584,14 +2604,21 @@ kernel void kernel_mul_mv_q2_K_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -2841,14 +2868,21 @@ kernel void kernel_mul_mv_q3_K_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -2984,8 +3018,8 @@ void kernel_mul_mv_q4_K_f32_impl(
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int ix = tiisg/4; // 0...7
const int it = tiisg%4; // 0...3
@@ -2994,7 +3028,7 @@ void kernel_mul_mv_q4_K_f32_impl(
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int first_row = r0 * N_DST;
const int ib_row = first_row * nb;
const uint i12 = im%ne12;
@@ -3060,7 +3094,7 @@ void kernel_mul_mv_q4_K_f32_impl(
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst[r1*ne0+ im*ne0*ne1 + first_row + row] = all_sum;
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
}
}
}
@@ -3072,14 +3106,21 @@ kernel void kernel_mul_mv_q4_K_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -3271,14 +3312,21 @@ kernel void kernel_mul_mv_q5_K_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -3398,14 +3446,21 @@ kernel void kernel_mul_mv_q6_K_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & r2 [[buffer(17)]],
constant uint & r3 [[buffer(18)]],
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
constant uint & r3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@@ -3523,7 +3578,7 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg
device const int8_t * qs = ((device const int8_t *)xb->qs);
const half d = xb->d;
for (int i=0;i<16;i++) {
for (int i = 0; i < 16; i++) {
reg[i/4][i%4] = (qs[i + 16*il] * d);
}
}
@@ -3792,12 +3847,12 @@ void kernel_mul_mm_impl(device const uchar * src0,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne02,
constant int64_t & nb01,
constant int64_t & nb02,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne12,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
@@ -3924,12 +3979,12 @@ kernel void kernel_mul_mm(device const uchar * src0,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne02,
constant int64_t & nb01,
constant int64_t & nb02,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne12,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
@@ -3965,19 +4020,19 @@ kernel void kernel_mul_mm_id(
device const uchar * ids,
device const uchar * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne02,
constant int64_t & nb01,
constant int64_t & nb02,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne12,
constant int64_t & ne13,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4070,12 +4125,12 @@ typedef void (mat_mm_t)(
device float * dst,
constant int64_t & ne00,
constant int64_t & ne02,
constant int64_t & nb01,
constant int64_t & nb02,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne12,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & r2,
@@ -4104,19 +4159,19 @@ typedef void (mat_mm_id_t)(
device const uchar * ids,
device const uchar * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne02,
constant int64_t & nb01,
constant int64_t & nb02,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne12,
constant int64_t & ne13,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4153,7 +4208,7 @@ kernel void kernel_mul_mv_id_f32_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4169,7 +4224,7 @@ kernel void kernel_mul_mv_id_f32_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4222,7 +4277,7 @@ kernel void kernel_mul_mv_id_f16_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4238,7 +4293,7 @@ kernel void kernel_mul_mv_id_f16_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4291,7 +4346,7 @@ kernel void kernel_mul_mv_id_q8_0_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4307,7 +4362,7 @@ kernel void kernel_mul_mv_id_q8_0_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4354,7 +4409,7 @@ kernel void kernel_mul_mv_id_q4_0_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4370,7 +4425,7 @@ kernel void kernel_mul_mv_id_q4_0_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4417,7 +4472,7 @@ kernel void kernel_mul_mv_id_q4_1_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4433,7 +4488,7 @@ kernel void kernel_mul_mv_id_q4_1_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4480,7 +4535,7 @@ kernel void kernel_mul_mv_id_q5_0_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4496,7 +4551,7 @@ kernel void kernel_mul_mv_id_q5_0_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4543,7 +4598,7 @@ kernel void kernel_mul_mv_id_q5_1_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4559,7 +4614,7 @@ kernel void kernel_mul_mv_id_q5_1_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4606,7 +4661,7 @@ kernel void kernel_mul_mv_id_q2_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4622,7 +4677,7 @@ kernel void kernel_mul_mv_id_q2_K_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4669,7 +4724,7 @@ kernel void kernel_mul_mv_id_q3_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4685,7 +4740,7 @@ kernel void kernel_mul_mv_id_q3_K_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4732,7 +4787,7 @@ kernel void kernel_mul_mv_id_q4_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4748,7 +4803,7 @@ kernel void kernel_mul_mv_id_q4_K_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4795,7 +4850,7 @@ kernel void kernel_mul_mv_id_q5_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4811,7 +4866,7 @@ kernel void kernel_mul_mv_id_q5_K_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
@@ -4858,7 +4913,7 @@ kernel void kernel_mul_mv_id_q6_K_f32(
device const char * ids,
device const char * src1,
device uchar * dst,
constant int64_t & nbi1,
constant uint64_t & nbi1,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
@@ -4874,7 +4929,7 @@ kernel void kernel_mul_mv_id_q6_K_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & nb1,
constant uint64_t & nb1,
constant uint & r2,
constant uint & r3,
constant int & idx,
+61 -62
View File
@@ -410,13 +410,17 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
#if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
#else
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
#endif
#endif
@@ -1103,8 +1107,6 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
}
}
#include <stdio.h>
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
static const int qk = QK4_1;
@@ -1112,9 +1114,6 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
const int nb = k / qk;
printf("d = %9f\n", GGML_FP16_TO_FP32(x[0].d));
printf("m = %9f\n", GGML_FP16_TO_FP32(x[0].m));
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d);
const float m = GGML_FP16_TO_FP32(x[i].m);
@@ -2486,8 +2485,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
// dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
@@ -2774,8 +2773,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
// dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
@@ -2941,11 +2940,11 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -3233,11 +3232,11 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -3488,12 +3487,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -3603,8 +3602,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
// We use this macro instead of a function call because for some reason
// the code runs 2-3% slower, even if the function is declared inline
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
@@ -3978,10 +3977,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
sum += d * (isum1 + isum2);
}
@@ -4261,10 +4260,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
scale += 4;
@@ -4278,10 +4277,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
scale += 4;
@@ -4762,10 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
sum += d * isum;
@@ -5114,14 +5113,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
sumi1 += vaddvq_s32(p1) * scales[2*j+0];
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
}
@@ -5454,13 +5453,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
const int32_t sumi1 = vaddvq_s32(p1) * scales[0];
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
sumf += d * (sumi1 + sumi2);
@@ -5727,8 +5726,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
}
sumf += d * sumi - dmin * sumi_mins;
@@ -6117,10 +6116,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
}
@@ -6404,10 +6403,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
scale += 4;
@@ -6431,10 +6430,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
scale += 4;
}
//sum += isum * d_all * y[i].d;
@@ -6821,10 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
sum += isum * d_all * y[i].d;
+2
View File
@@ -46,6 +46,8 @@ class Keys:
HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_LENGTH = "{arch}.attention.key_length"
VALUE_LENGTH = "{arch}.attention.value_length"
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+6
View File
@@ -333,6 +333,12 @@ class GGUFWriter:
def add_head_count_kv(self, count: int) -> None:
self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
def add_key_length(self, length: int) -> None:
self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)
def add_value_length(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)
def add_max_alibi_bias(self, bias: float) -> None:
self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
+233 -113
View File
@@ -68,7 +68,6 @@
#include <initializer_list>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <queue>
#include <random>
@@ -245,6 +244,8 @@ enum llm_kv {
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
LLM_KV_ATTENTION_CLAMP_KQV,
LLM_KV_ATTENTION_KEY_LENGTH,
LLM_KV_ATTENTION_VALUE_LENGTH,
LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
@@ -297,6 +298,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
{ LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
@@ -1284,6 +1287,8 @@ struct llama_hparams {
uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_ff;
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
@@ -1310,6 +1315,8 @@ struct llama_hparams {
if (this->n_head_kv != other.n_head_kv) return true;
if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true;
if (this->n_embd_head_k != other.n_embd_head_k) return true;
if (this->n_embd_head_v != other.n_embd_head_v) return true;
if (this->n_ff != other.n_ff) return true;
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
@@ -1331,12 +1338,12 @@ struct llama_hparams {
return n_head/n_head_kv;
}
uint32_t n_embd_head() const {
return n_embd/n_head;
uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
return n_embd_head_k * n_head_kv;
}
uint32_t n_embd_gqa() const {
return n_embd/n_gqa();
uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
return n_embd_head_v * n_head_kv;
}
};
@@ -1645,8 +1652,9 @@ static bool llama_kv_cache_init(
uint32_t n_ctx,
int n_gpu_layers,
bool offload) {
const uint32_t n_embd = hparams.n_embd_gqa();
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const uint32_t n_layer = hparams.n_layer;
cache.has_shift = false;
@@ -1677,8 +1685,8 @@ static bool llama_kv_cache_init(
const int i_gpu_start = (int) n_layer - n_gpu_layers;
for (int i = 0; i < (int) n_layer; i++) {
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
@@ -2672,6 +2680,12 @@ static void llm_load_hparams(
// gpt-j n_rot = rotary_dim
}
hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
// arch-specific KVs
switch (model.arch) {
case LLM_ARCH_LLAMA:
@@ -3082,8 +3096,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
@@ -3173,10 +3191,11 @@ static bool llm_load_tensors(
// create tensors for the weights
{
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_gqa = hparams.n_embd_gqa();
const int64_t n_layer = hparams.n_layer;
const int64_t n_vocab = hparams.n_vocab;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const int64_t n_layer = hparams.n_layer;
const int64_t n_vocab = hparams.n_vocab;
const auto tn = LLM_TN(model.arch);
switch (model.arch) {
@@ -3202,7 +3221,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3270,7 +3292,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3318,7 +3343,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3368,7 +3396,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3420,7 +3451,11 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
@@ -3469,7 +3504,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3520,7 +3558,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3567,7 +3608,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3665,7 +3709,10 @@ static bool llm_load_tensors(
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3714,7 +3761,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3761,7 +3811,10 @@ static bool llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
}
const uint32_t n_ff = hparams.n_ff;
const uint32_t n_ff = hparams.n_ff;
const int64_t n_embd_gqa = n_embd_v_gqa;
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
const int i_gpu_start = n_layer - n_gpu_layers;
@@ -4000,8 +4053,8 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
// Persimmon: n_rot = n_embd_head/2
// Other: n_rot = n_embd_head
// Persimmon: n_rot = n_embd_head_k/2
// Other: n_rot = n_embd_head_k
static void llm_build_k_shift(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -4014,17 +4067,17 @@ static void llm_build_k_shift(
float freq_base,
float freq_scale,
const llm_build_cb & cb) {
const int64_t n_layer = hparams.n_layer;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_gqa = hparams.n_embd_gqa();
const int64_t n_embd_head = hparams.n_embd_head();
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
const float ext_factor = cparams.yarn_ext_factor;
const float attn_factor = cparams.yarn_attn_factor;
const float beta_fast = cparams.yarn_beta_fast;
const float beta_slow = cparams.yarn_beta_slow;
const int64_t n_layer = hparams.n_layer;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
const float ext_factor = cparams.yarn_ext_factor;
const float attn_factor = cparams.yarn_attn_factor;
const float beta_fast = cparams.yarn_beta_fast;
const float beta_slow = cparams.yarn_beta_slow;
GGML_ASSERT(n_embd_head % n_rot == 0);
GGML_ASSERT(n_embd_head_k % n_rot == 0);
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
cb(K_shift, "K_shift", -1);
@@ -4042,9 +4095,9 @@ static void llm_build_k_shift(
// we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k_l[il],
n_embd_head, n_head_kv, n_ctx,
ggml_row_size(kv.k_l[il]->type, n_embd_head),
ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
n_embd_head_k, n_head_kv, n_ctx,
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
0),
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
@@ -4065,18 +4118,19 @@ static void llm_build_kv_store(
int32_t kv_head,
const llm_build_cb & cb,
int64_t il) {
const int64_t n_embd_gqa = hparams.n_embd_gqa();
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
// compute the transposed [n_tokens, n_embd] V matrix
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens));
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
//struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
cb(v_cur_t, "v_cur_t", il);
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
(ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
cb(k_cache_view, "k_cache_view", il);
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
( n_ctx)*ggml_element_size(kv.v_l[il]),
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);
@@ -4226,20 +4280,20 @@ static struct ggml_tensor * llm_build_kqv(
float kq_scale,
const llm_build_cb & cb,
int il) {
const int64_t n_embd = hparams.n_embd;
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_head_v = hparams.n_embd_head_v;
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
cb(q, "q", il);
struct ggml_tensor * k =
ggml_view_3d(ctx, kv.k_l[il],
n_embd_head, n_kv, n_head_kv,
ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
ggml_row_size(kv.k_l[il]->type, n_embd_head),
n_embd_head_k, n_kv, n_head_kv,
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
0);
cb(k, "k", il);
@@ -4278,9 +4332,9 @@ static struct ggml_tensor * llm_build_kqv(
// split cached v into n_head heads
struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v_l[il],
n_kv, n_embd_head, n_head_kv,
n_kv, n_embd_head_v, n_head_kv,
ggml_element_size(kv.v_l[il])*n_ctx,
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
0);
cb(v, "v", il);
@@ -4290,7 +4344,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il);
cur = ggml_mul_mat(ctx, wo, cur);
@@ -4317,8 +4371,10 @@ struct llm_build_context {
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head;
const int64_t n_embd_gqa;
const int64_t n_embd_head_k;
const int64_t n_embd_k_gqa;
const int64_t n_embd_head_v;
const int64_t n_embd_v_gqa;
const int64_t n_expert;
const int64_t n_expert_used;
@@ -4360,8 +4416,10 @@ struct llm_build_context {
n_ctx (cparams.n_ctx),
n_head (hparams.n_head),
n_head_kv (hparams.n_head_kv),
n_embd_head (hparams.n_embd_head()),
n_embd_gqa (hparams.n_embd_gqa()),
n_embd_head_k (hparams.n_embd_head_k),
n_embd_k_gqa (hparams.n_embd_k_gqa()),
n_embd_head_v (hparams.n_embd_head_v),
n_embd_v_gqa (hparams.n_embd_v_gqa()),
n_expert (hparams.n_expert),
n_expert_used (hparams.n_expert_used),
freq_base (cparams.rope_freq_base),
@@ -4404,6 +4462,8 @@ struct llm_build_context {
struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
@@ -4588,6 +4648,9 @@ struct llm_build_context {
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -4705,6 +4768,11 @@ struct llm_build_context {
struct ggml_cgraph * build_falcon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -4824,6 +4892,11 @@ struct llm_build_context {
struct ggml_cgraph * build_starcoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur;
struct ggml_tensor * pos;
struct ggml_tensor * inpL;
@@ -4920,7 +4993,12 @@ struct llm_build_context {
struct ggml_cgraph * build_persimmon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_rot = n_embd_head / 2;
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
const int64_t n_rot = n_embd_head_k / 2;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5129,6 +5207,11 @@ struct llm_build_context {
struct ggml_cgraph * build_refact() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5217,6 +5300,11 @@ struct llm_build_context {
struct ggml_cgraph * build_bloom() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5308,6 +5396,11 @@ struct llm_build_context {
struct ggml_cgraph * build_mpt() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5403,6 +5496,9 @@ struct llm_build_context {
struct ggml_cgraph * build_stablelm() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5513,6 +5609,9 @@ struct llm_build_context {
struct ggml_cgraph * build_qwen() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5624,6 +5723,11 @@ struct llm_build_context {
struct ggml_cgraph * build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur;
struct ggml_tensor * attn_norm_output;
struct ggml_tensor * ffn_output;
@@ -5736,6 +5840,9 @@ struct llm_build_context {
struct ggml_cgraph * build_plamo() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5840,6 +5947,11 @@ struct llm_build_context {
struct ggml_cgraph * build_gpt2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_gqa == n_embd);
struct ggml_tensor * cur;
struct ggml_tensor * pos;
struct ggml_tensor * inpL;
@@ -8972,7 +9084,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<std::thread> workers;
workers.reserve(nthread);
std::mutex mutex;
int idx = 0;
@@ -9046,7 +9157,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
const size_t nelements = ggml_nelements(tensor);
const size_t ne = ggml_nelements(tensor);
float * f32_data;
@@ -9055,53 +9166,60 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, ne, nthread);
f32_data = (float *) f32_conv_buf.data();
}
LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
fflush(stdout);
if (work.size() < nelements * 4) {
work.resize(nelements * 4); // upper bound on size
if (work.size() < ne * 4) {
work.resize(ne * 4); // upper bound on size
}
new_data = work.data();
std::array<int64_t, 1 << 4> hist_cur = {};
static const int chunk_size = 32 * 512;
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
if (nthread_use < 2) {
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
} else {
size_t counter = 0;
new_size = 0;
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
std::array<int64_t, 1 << 4> local_hist = {};
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
size_t first = counter; counter += chunk_size;
if (first >= nelements) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
new_size += local_size;
}
break;
}
lock.unlock();
size_t last = std::min(nelements, first + chunk_size);
{
static const size_t chunk_size = 32*512;
const int nchunk = GGML_PAD(ne, chunk_size)/chunk_size;
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
std::vector<size_t> size_th(nthread_use, 0);
std::vector<std::array<int64_t, 1 << 4>> hist_cur_th(nthread_use);
auto compute = [&size_th, &hist_cur_th, new_type, f32_data, new_data, ne, nchunk, nthread_use](int tid) {
auto & local_size = size_th[tid];
auto & local_hist = hist_cur_th[tid];
for (int ch = tid; ch < nchunk; ch += nthread_use) {
const size_t first = ch * chunk_size;
const size_t last = std::min(ne, first + chunk_size);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
}
};
for (int it = 0; it < nthread_use - 1; ++it) {
workers.emplace_back(compute);
workers.emplace_back(compute, it);
}
compute();
for (auto & w : workers) { w.join(); }
compute(nthread_use - 1);
for (auto & w : workers) {
w.join();
}
workers.clear();
new_size = 0;
for (int it = 0; it < nthread_use; ++it) {
for (int j = 0; j < int(hist_cur.size()); ++j) {
hist_cur[j] += hist_cur_th[it][j];
}
new_size += size_th[it];
}
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -9113,7 +9231,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (tot_count > 0) {
for (size_t i = 0; i < hist_cur.size(); i++) {
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(ne));
}
}
LLAMA_LOG_INFO("\n");
@@ -9627,8 +9745,8 @@ struct llama_context * llama_new_context_with_model(
const ggml_type type_k = params.type_k;
const ggml_type type_v = params.type_v;
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
// reserve memory for context buffers
if (!hparams.vocab_only) {
@@ -10172,9 +10290,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams;
const auto n_layer = hparams.n_layer;
const auto n_embd = hparams.n_embd_gqa();
const auto n_ctx = cparams.n_ctx;
const auto n_layer = hparams.n_layer;
const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
const auto n_ctx = cparams.n_ctx;
const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
const uint32_t kv_head = kv_self.head;
@@ -10196,15 +10315,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
std::vector<struct ggml_tensor *> vout2d(n_layer);
for (int il = 0; il < (int) n_layer; ++il) {
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
n_embd, kv_head,
elt_size*n_embd, 0);
n_embd_k_gqa, kv_head,
elt_size*n_embd_k_gqa, 0);
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
kv_head, n_embd,
kv_head, n_embd_v_gqa,
elt_size*n_ctx, 0);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
@@ -10311,9 +10430,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams;
const int n_layer = hparams.n_layer;
const int n_embd = hparams.n_embd_gqa();
const int n_ctx = cparams.n_ctx;
const int n_layer = hparams.n_layer;
const int n_embd_k_gqa = hparams.n_embd_k_gqa();
const int n_embd_v_gqa = hparams.n_embd_v_gqa();
const int n_ctx = cparams.n_ctx;
size_t kv_buf_size;
uint32_t kv_head;
@@ -10337,15 +10457,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
std::vector<struct ggml_tensor *> vin2d(n_layer);
for (int il = 0; il < n_layer; ++il) {
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
n_embd, kv_head,
elt_size*n_embd, 0);
n_embd_k_gqa, kv_head,
elt_size*n_embd_k_gqa, 0);
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
kv_head, n_embd,
kv_head, n_embd_v_gqa,
elt_size*n_ctx, 0);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
+10 -17
View File
@@ -15,19 +15,18 @@
#include <thread>
#include <vector>
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
size_t size = ggml_nelements(tensor);
std::vector<float> data(size);
#if 0
std::default_random_engine generator(rd());
static std::default_random_engine generator(1234);
std::uniform_real_distribution<float> distribution(min, max);
for (size_t i = 0; i < size; i++) {
data[i] = distribution(generator);
}
#endif
#else
auto init_thread = [&](size_t start, size_t end) {
std::random_device rd;
std::default_random_engine generator(rd());
@@ -49,6 +48,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
for (auto & t : threads) {
t.join();
}
#endif
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
@@ -437,12 +437,11 @@ struct test_case {
double err = nmse(f1.data(), f2.data(), f1.size());
if (err > ud->max_err) {
printf("[%s] NMSE = %f ", ggml_op_desc(t1), err);
printf("\n");
for (int i = 0; i < f1.size(); i++) {
printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
}
printf("\n");
exit(1);
//for (int i = 0; i < (int) f1.size(); i++) {
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
//}
//printf("\n");
//exit(1);
ud->ok = false;
}
return true;
@@ -1460,14 +1459,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_dup());
//for (ggml_type type : all_types) {
// test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
//}
for (ggml_type type : { GGML_TYPE_Q4_1} ) {
for (int i = 0; i < 2048; ++i) {
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {32, 1, 1, 1}));
}
for (ggml_type type : all_types) {
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
}
test_cases.emplace_back(new test_cont());