forked from wylab/llama.cpp
Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b5af7ad84f | |||
| 83e633c27e | |||
| 32866c5edd | |||
| 5d7002d437 | |||
| 26f3071d71 | |||
| 775ac8712a | |||
| 58ba655af0 | |||
| edd1ab7bc3 | |||
| 198ed7ebfc | |||
| d836174731 | |||
| 06f2a5d190 | |||
| c5239944ba | |||
| 1e9ae54cf2 | |||
| 7adedecbe3 | |||
| 356ea17e0f | |||
| a5c088d8c6 | |||
| 1e3900ebac | |||
| e39106c055 | |||
| 9fbda719de | |||
| 39d8bc71ed |
@@ -8,12 +8,13 @@
|
||||
pkgsCuda,
|
||||
...
|
||||
}:
|
||||
lib.optionalAttrs (system == "aarch64-linux") {
|
||||
packages =
|
||||
{
|
||||
legacyPackages =
|
||||
let
|
||||
caps.jetson-xavier = "7.2";
|
||||
caps.jetson-orin = "8.7";
|
||||
caps.jetson-nano = "5.3";
|
||||
caps.llamaPackagesXavier = "7.2";
|
||||
caps.llamaPackagesOrin = "8.7";
|
||||
caps.llamaPackagesTX2 = "6.2";
|
||||
caps.llamaPackagesNano = "5.3";
|
||||
|
||||
pkgsFor =
|
||||
cap:
|
||||
@@ -27,6 +28,12 @@
|
||||
};
|
||||
};
|
||||
in
|
||||
builtins.mapAttrs (name: cap: ((pkgsFor cap).callPackage ./scope.nix { }).llama-cpp) caps;
|
||||
builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
|
||||
|
||||
packages = lib.optionalAttrs (system == "aarch64-linux") {
|
||||
jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
|
||||
jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
|
||||
jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
@@ -515,7 +515,6 @@ jobs:
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
name: Nix CI
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
|
||||
jobs:
|
||||
nix-eval:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: List all flake outputs
|
||||
run: nix flake show --all-systems
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--flake
|
||||
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build-aarch64:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install QEMU
|
||||
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
||||
run: |
|
||||
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
||||
sudo usermod -a -G kvm $USER
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-platforms = aarch64-linux
|
||||
extra-system-features = nixos-test kvm
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.aarch64-linux"
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--systems aarch64-linux
|
||||
--flake
|
||||
".#checks.aarch64-linux"
|
||||
@@ -0,0 +1,22 @@
|
||||
name: update-flake-lock
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
|
||||
|
||||
jobs:
|
||||
lockfile:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@main
|
||||
- name: Update flake.lock
|
||||
uses: DeterminateSystems/update-flake-lock@main
|
||||
with:
|
||||
pr-title: "nix: update flake.lock"
|
||||
pr-labels: |
|
||||
nix
|
||||
pr-reviewers: philiptaron,SomeoneSerge
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@@ -1,23 +0,0 @@
|
||||
# Make the flake discoverable on https://flakestry.dev
|
||||
name: "Publish a flake to flakestry"
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v?[0-9]+.[0-9]+.[0-9]+"
|
||||
- "v?[0-9]+.[0-9]+"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "The existing tag to publish"
|
||||
type: "string"
|
||||
required: true
|
||||
jobs:
|
||||
publish-flake:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: flakestry/flakestry-publish@main
|
||||
with:
|
||||
version: "${{ inputs.tag || github.ref_name }}"
|
||||
@@ -0,0 +1,36 @@
|
||||
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
|
||||
name: "Publish a flake to flakestry & flakehub"
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "The existing tag to publish"
|
||||
type: "string"
|
||||
required: true
|
||||
jobs:
|
||||
flakestry-publish:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: flakestry/flakestry-publish@main
|
||||
with:
|
||||
version: "${{ inputs.tag || github.ref_name }}"
|
||||
flakehub-publish:
|
||||
runs-on: "ubuntu-latest"
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: "actions/checkout@v4"
|
||||
with:
|
||||
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
|
||||
- uses: "DeterminateSystems/nix-installer-action@main"
|
||||
- uses: "DeterminateSystems/flakehub-push@main"
|
||||
with:
|
||||
visibility: "public"
|
||||
tag: "${{ inputs.tag }}"
|
||||
+31
-3
@@ -95,6 +95,7 @@ option(LLAMA_HIP_UMA "llama: use HIP unified memory arch
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
|
||||
@@ -154,9 +155,9 @@ if (APPLE AND LLAMA_ACCELERATE)
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
message(STATUS "Metal framework found")
|
||||
set(GGML_HEADERS_METAL ggml-metal.h)
|
||||
@@ -173,6 +174,33 @@ if (LLAMA_METAL)
|
||||
# copy ggml-metal.metal to bin directory
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
|
||||
if (LLAMA_METAL_SHADER_DEBUG)
|
||||
# custom command to do the following:
|
||||
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
||||
# xcrun -sdk macosx metallib ggml-metal.air -o ggml.metallib
|
||||
#
|
||||
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
|
||||
# disabling fast math is needed in order to pass tests/test-backend-ops
|
||||
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
|
||||
set(XC_FLAGS -fno-fast-math -fno-inline -g)
|
||||
if (LLAMA_QKK_64)
|
||||
set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
|
||||
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
|
||||
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
|
||||
DEPENDS ggml-metal.metal
|
||||
COMMENT "Compiling Metal kernels"
|
||||
)
|
||||
|
||||
add_custom_target(
|
||||
ggml-metal ALL
|
||||
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml.metallib
|
||||
)
|
||||
endif()
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
||||
${FOUNDATION_LIBRARY}
|
||||
${METAL_FRAMEWORK}
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
torch>=2.0.0
|
||||
torch>=2.1.1
|
||||
transformers>=4.32.0
|
||||
|
||||
@@ -30,6 +30,12 @@ sd=`dirname $0`
|
||||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
CMAKE_EXTRA=""
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
||||
fi
|
||||
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
@@ -81,8 +87,8 @@ function gg_run_ctest_debug {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
@@ -109,8 +115,8 @@ function gg_run_ctest_release {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
@@ -59,7 +59,7 @@ class Model:
|
||||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
||||
else:
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True))
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
|
||||
|
||||
with ctx as model_part:
|
||||
for name in model_part.keys():
|
||||
|
||||
@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
|
||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||
```
|
||||
|
||||
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
|
||||
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
|
||||
|
||||
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
|
||||
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
|
||||
|
||||
+134
-107
@@ -146,6 +146,27 @@ static std::string get_ftype(int ftype) {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// image data
|
||||
//
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<uint8_t> buf;
|
||||
};
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<float> buf;
|
||||
};
|
||||
|
||||
//
|
||||
// clip layers
|
||||
//
|
||||
@@ -204,16 +225,21 @@ struct clip_vision_model {
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
bool has_text_encoder = false;
|
||||
bool has_vision_encoder = false;
|
||||
bool has_text_encoder = false;
|
||||
bool has_vision_encoder = false;
|
||||
bool has_llava_projector = false;
|
||||
|
||||
struct clip_vision_model vision_model;
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
bool use_gelu = false;
|
||||
int32_t ftype = 1;
|
||||
struct ggml_context * ctx;
|
||||
|
||||
struct gguf_context * ctx_gguf;
|
||||
struct ggml_context * ctx_data;
|
||||
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
@@ -222,7 +248,7 @@ struct clip_ctx {
|
||||
ggml_allocr * compute_alloc = NULL;
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
@@ -243,13 +269,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
//const int projection_dim = hparams.projection_dim;
|
||||
const float eps = hparams.eps;
|
||||
int batch_size = imgs->size;
|
||||
if(ctx->has_llava_projector) {
|
||||
if (ctx->has_llava_projector) {
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
@@ -272,7 +299,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
for (int k = 0; k < 3; k++) {
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
|
||||
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -413,7 +440,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||
ggml_allocr_alloc(ctx->compute_alloc, patches);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
patches_data[i] = i + 1;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
@@ -561,8 +588,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
new_clip->ctx = ggml_init(params);
|
||||
if (!new_clip->ctx) {
|
||||
new_clip->ctx_data = ggml_init(params);
|
||||
if (!new_clip->ctx_data) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
clip_free(new_clip);
|
||||
return nullptr;
|
||||
@@ -579,7 +606,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * t = ggml_get_tensor(meta, name);
|
||||
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
|
||||
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
|
||||
ggml_set_name(cur, name);
|
||||
}
|
||||
|
||||
@@ -588,7 +615,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx, name);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
||||
ggml_allocr_alloc(alloc, cur);
|
||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
@@ -617,20 +644,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
// load vision model
|
||||
auto & vision_model = new_clip->vision_model;
|
||||
auto & hparams = vision_model.hparams;
|
||||
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
|
||||
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
|
||||
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
|
||||
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
|
||||
hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
|
||||
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
|
||||
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
|
||||
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
|
||||
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
|
||||
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
|
||||
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
|
||||
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
|
||||
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
|
||||
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
|
||||
|
||||
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
|
||||
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
||||
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
|
||||
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
|
||||
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
|
||||
}
|
||||
|
||||
if (verbosity >= 2) {
|
||||
@@ -644,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
printf("v_n_layer %d\n", hparams.n_layer);
|
||||
}
|
||||
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
|
||||
vision_model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
auto & layer = vision_model.layers[il];
|
||||
layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
|
||||
layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
|
||||
layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
|
||||
layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
||||
layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
|
||||
layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
|
||||
layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
|
||||
layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
|
||||
layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
|
||||
layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
|
||||
layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
|
||||
layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
|
||||
layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
|
||||
layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
|
||||
layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
|
||||
layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
|
||||
layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
|
||||
layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
|
||||
layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
|
||||
layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
||||
layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
|
||||
layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
|
||||
layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
|
||||
layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
|
||||
layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
|
||||
layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
|
||||
layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
|
||||
layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
|
||||
layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
|
||||
layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
|
||||
layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
|
||||
layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -680,8 +707,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
|
||||
new_clip->ctx_gguf = ctx;
|
||||
|
||||
// measure mem requirement and allocate
|
||||
// measure mem requirement and allocate
|
||||
{
|
||||
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
@@ -697,26 +725,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
return new_clip;
|
||||
}
|
||||
|
||||
clip_image_u8 * make_clip_image_u8() {
|
||||
auto img = new clip_image_u8();
|
||||
return img;
|
||||
struct clip_image_u8 * clip_image_u8_init() {
|
||||
return new clip_image_u8();
|
||||
}
|
||||
clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
|
||||
|
||||
void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
|
||||
void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
|
||||
struct clip_image_f32 * clip_image_f32_init() {
|
||||
return new clip_image_f32();
|
||||
}
|
||||
|
||||
void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
|
||||
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
||||
|
||||
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
|
||||
img->nx = nx;
|
||||
img->ny = ny;
|
||||
img->size = nx * ny * 3;
|
||||
img->data = new uint8_t[img->size]();
|
||||
memcpy(img->data, data, img->size);
|
||||
img->buf.resize(3 * nx * ny);
|
||||
memcpy(img->buf.data(), data, img->buf.size());
|
||||
}
|
||||
|
||||
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
|
||||
return false;
|
||||
@@ -728,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
|
||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
|
||||
return false;
|
||||
@@ -740,7 +769,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
||||
|
||||
// normalize: x = (x - mean) / std
|
||||
// TODO: implement bicubic interpolation instead of linear.
|
||||
bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
@@ -749,18 +778,17 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
||||
|
||||
clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
|
||||
clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
|
||||
if (pad2square && img->nx != img->ny) {
|
||||
int longer_side = std::max(img->nx, img->ny);
|
||||
temp->nx = longer_side;
|
||||
temp->ny = longer_side;
|
||||
temp->size = 3 * longer_side * longer_side;
|
||||
temp->data = new uint8_t[temp->size]();
|
||||
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||
temp->buf.resize(3 * longer_side * longer_side);
|
||||
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||
|
||||
// fill with background color
|
||||
for (size_t i = 0; i < temp->size; i++) {
|
||||
temp->data[i] = bc[i % 3];
|
||||
for (size_t i = 0; i < temp->buf.size(); i++) {
|
||||
temp->buf[i] = bc[i % 3];
|
||||
}
|
||||
|
||||
// copy from the input image
|
||||
@@ -768,17 +796,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
for (int x = 0; x < img->nx; x++) {
|
||||
const int i = 3 * (y * img->nx + x);
|
||||
const int j = 3 * (y * temp->nx + x);
|
||||
temp->data[j] = img->data[i];
|
||||
temp->data[j+1] = img->data[i+1];
|
||||
temp->data[j+2] = img->data[i+2];
|
||||
temp->buf[j] = img->buf[i];
|
||||
temp->buf[j+1] = img->buf[i+1];
|
||||
temp->buf[j+2] = img->buf[i+2];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
temp->nx = img->nx;
|
||||
temp->ny = img->ny;
|
||||
temp->size = img->size;
|
||||
temp->data = new uint8_t[temp->size]();
|
||||
memcpy(&temp->data[0], &img->data[0], temp->size); // copy
|
||||
temp->nx = img->nx;
|
||||
temp->ny = img->ny;
|
||||
temp->buf.resize(img->buf.size());
|
||||
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
|
||||
}
|
||||
|
||||
const int nx = temp->nx;
|
||||
@@ -789,8 +816,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
|
||||
res->nx = nx2;
|
||||
res->ny = ny2;
|
||||
res->size = 3 * nx2 * ny2;
|
||||
res->data = new float[res->size]();
|
||||
res->buf.resize(3 * nx2 * ny2);
|
||||
|
||||
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
|
||||
|
||||
@@ -821,10 +847,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
const int j10 = 3 * (y1 * nx + x0) + c;
|
||||
const int j11 = 3 * (y1 * nx + x1) + c;
|
||||
|
||||
const float v00 = temp->data[j00];
|
||||
const float v01 = temp->data[j01];
|
||||
const float v10 = temp->data[j10];
|
||||
const float v11 = temp->data[j11];
|
||||
const float v00 = temp->buf[j00];
|
||||
const float v01 = temp->buf[j01];
|
||||
const float v10 = temp->buf[j10];
|
||||
const float v11 = temp->buf[j11];
|
||||
|
||||
const float v0 = v00 * (1.0f - dx) + v01 * dx;
|
||||
const float v1 = v10 * (1.0f - dx) + v11 * dx;
|
||||
@@ -835,7 +861,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
|
||||
const int i = 3 * (y * nx3 + x) + c;
|
||||
|
||||
res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
||||
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -845,12 +871,13 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx);
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
@@ -862,8 +889,7 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32
|
||||
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
||||
}
|
||||
|
||||
bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||
|
||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
@@ -906,31 +932,32 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
ggml_type type = GGML_TYPE_Q4_1;
|
||||
|
||||
switch (itype) {
|
||||
case 2:
|
||||
type = GGML_TYPE_Q4_0;
|
||||
break;
|
||||
case 3:
|
||||
type = GGML_TYPE_Q4_1;
|
||||
break;
|
||||
case 6:
|
||||
type = GGML_TYPE_Q5_0;
|
||||
break;
|
||||
case 7:
|
||||
type = GGML_TYPE_Q5_1;
|
||||
break;
|
||||
case 8:
|
||||
type = GGML_TYPE_Q8_0;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
|
||||
return false;
|
||||
case 2:
|
||||
type = GGML_TYPE_Q4_0;
|
||||
break;
|
||||
case 3:
|
||||
type = GGML_TYPE_Q4_1;
|
||||
break;
|
||||
case 6:
|
||||
type = GGML_TYPE_Q5_0;
|
||||
break;
|
||||
case 7:
|
||||
type = GGML_TYPE_Q5_1;
|
||||
break;
|
||||
case 8:
|
||||
type = GGML_TYPE_Q8_0;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
|
||||
return false;
|
||||
};
|
||||
|
||||
auto ctx_clip = clip_model_load(fname_inp, 2);
|
||||
const auto & ctx_src = ctx_clip->ctx_gguf;
|
||||
const auto & ctx_data = ctx_clip->ctx;
|
||||
auto * ctx_clip = clip_model_load(fname_inp, 2);
|
||||
|
||||
auto ctx_out = gguf_init_empty();
|
||||
const auto & ctx_src = ctx_clip->ctx_gguf;
|
||||
const auto & ctx_data = ctx_clip->ctx_data;
|
||||
|
||||
auto * ctx_out = gguf_init_empty();
|
||||
gguf_set_kv(ctx_out, ctx_src);
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", itype);
|
||||
|
||||
+16
-32
@@ -35,31 +35,14 @@ struct clip_vision_hparams {
|
||||
float eps;
|
||||
};
|
||||
|
||||
/** load mmproj model */
|
||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
|
||||
/** free mmproj model */
|
||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
|
||||
|
||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
int clip_n_patches(const struct clip_ctx * ctx);
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
uint8_t * data = NULL;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
float * data = NULL;
|
||||
size_t size;
|
||||
};
|
||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
struct clip_image_u8_batch {
|
||||
struct clip_image_u8 * data;
|
||||
@@ -71,21 +54,22 @@ struct clip_image_f32_batch {
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct clip_image_u8 * make_clip_image_u8();
|
||||
struct clip_image_f32 * make_clip_image_f32();
|
||||
CLIP_API void clip_image_u8_free(clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(clip_image_f32 * img);
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
|
||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
|
||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
|
||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
|
||||
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
|
||||
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||
|
||||
bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
|
||||
float * vec);
|
||||
|
||||
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
|
||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "base64.hpp"
|
||||
|
||||
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
|
||||
clip_image_f32 * img_res = make_clip_image_f32();
|
||||
clip_image_f32 * img_res = clip_image_f32_init();
|
||||
if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
|
||||
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
|
||||
clip_image_f32_free(img_res);
|
||||
@@ -86,7 +86,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
}
|
||||
|
||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
|
||||
clip_image_u8 * img = make_clip_image_u8();
|
||||
clip_image_u8 * img = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
||||
clip_image_u8_free(img);
|
||||
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
|
||||
|
||||
+68
-21
@@ -82,7 +82,7 @@ static inline bool is_base64(uint8_t c)
|
||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
||||
}
|
||||
|
||||
static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
|
||||
static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
||||
{
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
@@ -209,10 +209,10 @@ struct slot_image
|
||||
int32_t id;
|
||||
|
||||
bool request_encode_image = false;
|
||||
float* image_embedding = nullptr;
|
||||
float * image_embedding = nullptr;
|
||||
int32_t image_tokens = 0;
|
||||
|
||||
clip_image_u8 img_data;
|
||||
clip_image_u8 * img_data;
|
||||
|
||||
std::string prefix_prompt; // before of this image
|
||||
};
|
||||
@@ -434,10 +434,12 @@ struct llama_client_slot
|
||||
|
||||
generated_token_probs.clear();
|
||||
|
||||
for (slot_image &img : images)
|
||||
for (slot_image & img : images)
|
||||
{
|
||||
free(img.image_embedding);
|
||||
delete[] img.img_data.data;
|
||||
if (img.img_data) {
|
||||
clip_image_u8_free(img.img_data);
|
||||
}
|
||||
img.prefix_prompt = "";
|
||||
}
|
||||
|
||||
@@ -851,24 +853,17 @@ struct llama_server_context
|
||||
{
|
||||
for (const auto &img : *images_data)
|
||||
{
|
||||
std::string data_b64 = img["data"].get<std::string>();
|
||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
||||
|
||||
slot_image img_sl;
|
||||
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
|
||||
int width, height, channels;
|
||||
std::vector<uint8_t> image_buffer = base64_decode(data_b64);
|
||||
data_b64.clear();
|
||||
auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
|
||||
if (!data) {
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
|
||||
img_sl.img_data.nx = width;
|
||||
img_sl.img_data.ny = height;
|
||||
img_sl.img_data.size = width * height * 3;
|
||||
img_sl.img_data.data = new uint8_t[width * height * 3]();
|
||||
memcpy(img_sl.img_data.data, data, width * height * 3);
|
||||
stbi_image_free(data);
|
||||
LOG_TEE("slot %i - loaded image\n", slot->id);
|
||||
img_sl.request_encode_image = true;
|
||||
slot->images.push_back(img_sl);
|
||||
}
|
||||
@@ -1143,8 +1138,8 @@ struct llama_server_context
|
||||
{
|
||||
continue;
|
||||
}
|
||||
clip_image_f32 img_res;
|
||||
if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
|
||||
clip_image_f32 * img_res = clip_image_f32_init();
|
||||
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
|
||||
{
|
||||
LOG_TEE("Error processing the given image");
|
||||
clip_free(clp_ctx);
|
||||
@@ -1159,11 +1154,12 @@ struct llama_server_context
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
|
||||
if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
|
||||
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
|
||||
{
|
||||
LOG_TEE("Unable to encode image\n");
|
||||
return false;
|
||||
}
|
||||
clip_image_f32_free(img_res);
|
||||
img.request_encode_image = false;
|
||||
}
|
||||
|
||||
@@ -2020,6 +2016,10 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf("\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
@@ -2383,6 +2383,49 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
log_set_target(stdout);
|
||||
LOG_INFO("logging to file is disabled.", {});
|
||||
}
|
||||
else if (arg == "--override-kv")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
char * sep = strchr(argv[i], '=');
|
||||
if (sep == nullptr || sep - argv[i] >= 128) {
|
||||
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
struct llama_model_kv_override kvo;
|
||||
std::strncpy(kvo.key, argv[i], sep - argv[i]);
|
||||
kvo.key[sep - argv[i]] = 0;
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.bool_value = false;
|
||||
} else {
|
||||
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.kv_overrides.push_back(kvo);
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
@@ -2390,6 +2433,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (!params.kv_overrides.empty()) {
|
||||
params.kv_overrides.emplace_back(llama_model_kv_override());
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
if (invalid_param)
|
||||
{
|
||||
|
||||
Generated
+3
-3
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1703559957,
|
||||
"narHash": "sha256-x9PUuMEPGUOMB51zNxrDr2QoHbYWlCS2xhFedm9MC5Q=",
|
||||
"lastModified": 1703637592,
|
||||
"narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "75dd68c36f458c6593c5bbb48abfd3e59bfed380",
|
||||
"rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -6,6 +6,29 @@
|
||||
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||
};
|
||||
|
||||
# Optional binary cache
|
||||
nixConfig = {
|
||||
extra-substituters = [
|
||||
# Populated by the CI in ggerganov/llama.cpp
|
||||
"https://llama-cpp.cachix.org"
|
||||
|
||||
# A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||
# Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
||||
# This lets one skip building e.g. the CUDA-enabled openmpi.
|
||||
# TODO: Replace once nix-community obtains an official one.
|
||||
"https://cuda-maintainers.cachix.org"
|
||||
];
|
||||
|
||||
# Verify these are the same keys as published on
|
||||
# - https://app.cachix.org/cache/llama-cpp
|
||||
# - https://app.cachix.org/cache/cuda-maintainers
|
||||
extra-trusted-public-keys = [
|
||||
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
||||
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||
];
|
||||
};
|
||||
|
||||
|
||||
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
||||
#
|
||||
# ```bash
|
||||
@@ -74,26 +97,48 @@
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
system,
|
||||
pkgs,
|
||||
pkgsCuda,
|
||||
pkgsRocm,
|
||||
...
|
||||
}:
|
||||
{
|
||||
# Unlike `.#packages`, legacyPackages may contain values of
|
||||
# arbitrary types (including nested attrsets) and may even throw
|
||||
# exceptions. This attribute isn't recursed into by `nix flake
|
||||
# show` either.
|
||||
#
|
||||
# You can add arbitrary scripts to `.devops/nix/scope.nix` and
|
||||
# access them as `nix build .#llamaPackages.${scriptName}` using
|
||||
# the same path you would with an overlay.
|
||||
legacyPackages = {
|
||||
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||
};
|
||||
|
||||
# We don't use the overlay here so as to avoid making too many instances of nixpkgs,
|
||||
# cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs
|
||||
packages =
|
||||
{
|
||||
default = (pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
default = config.legacyPackages.llamaPackages.llama-cpp;
|
||||
}
|
||||
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
||||
opencl = config.packages.default.override { useOpenCL = true; };
|
||||
cuda = (pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
rocm = (pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
|
||||
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
|
||||
|
||||
mpi-cpu = config.packages.default.override { useMpi = true; };
|
||||
mpi-cuda = config.packages.default.override { useMpi = true; };
|
||||
}
|
||||
// lib.optionalAttrs (system == "x86_64-linux") {
|
||||
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
|
||||
};
|
||||
|
||||
# Packages exposed in `.#checks` will be built by the CI and by
|
||||
# `nix flake check`. Currently we expose all packages, but we could
|
||||
# make more granular choices
|
||||
checks = config.packages;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
+24
-23
@@ -119,10 +119,29 @@
|
||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||
#define CC_VOLTA 700
|
||||
#define CC_OFFSET_AMD 1000000
|
||||
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
||||
|
||||
#define GGML_CUDA_MAX_NODES 8192
|
||||
|
||||
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
|
||||
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
||||
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
||||
// - 7B quantum model: +100-200 MB
|
||||
// - 13B quantum model: +200-400 MB
|
||||
//
|
||||
//#define GGML_CUDA_FORCE_MMQ
|
||||
|
||||
// TODO: improve this to be correct for more hardware
|
||||
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
|
||||
#if !defined(GGML_CUDA_FORCE_MMQ)
|
||||
#define CUDA_USE_TENSOR_CORES
|
||||
#endif
|
||||
|
||||
// max batch size to use MMQ kernels when tensor cores are available
|
||||
#define MMQ_MAX_BATCH_SIZE 32
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
@@ -189,23 +208,6 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||
}
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
|
||||
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
|
||||
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
||||
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
||||
// - 7B quantum model: +100-200 MB
|
||||
// - 13B quantum model: +200-400 MB
|
||||
//
|
||||
//#define GGML_CUDA_FORCE_MMQ
|
||||
|
||||
// TODO: improve this to be correct for more hardware
|
||||
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
|
||||
#if !defined(GGML_CUDA_FORCE_MMQ) && (!defined(GGML_USE_HIPBLAS) || defined(RDNA3))
|
||||
#define CUDA_USE_TENSOR_CORES
|
||||
#endif
|
||||
|
||||
// max batch size to use MMQ kernels when tensor cores are available
|
||||
#define MMQ_MAX_BATCH_SIZE 32
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
@@ -8661,13 +8663,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
const bool fp16_performance_good = true;
|
||||
|
||||
#ifdef RDNA3
|
||||
const bool use_mul_mat_q = false;
|
||||
#else
|
||||
const bool use_mul_mat_q = true;
|
||||
#endif // RDNA3
|
||||
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type);
|
||||
#ifdef CUDA_USE_TENSOR_CORES
|
||||
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
||||
#endif // CUDA_USE_TENSOR_CORES
|
||||
|
||||
#else
|
||||
|
||||
|
||||
+16
-10
@@ -257,13 +257,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
NSError * error = nil;
|
||||
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
|
||||
if (libPath != nil) {
|
||||
// pre-compiled library found
|
||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * sourcePath;
|
||||
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
||||
@@ -291,8 +292,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
||||
#endif
|
||||
// disable fast math
|
||||
// try to disable fast-math
|
||||
// NOTE: this seems to have no effect whatsoever
|
||||
// instead, in order to disable fast-math, we have to build ggml.metallib from the command line
|
||||
// using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
||||
// and go through the "pre-compiled library found" path above
|
||||
//[options setFastMathEnabled:false];
|
||||
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
@@ -1234,7 +1238,7 @@ void ggml_metal_graph_compute(
|
||||
// not sure how to avoid this
|
||||
// TODO: make a simpler cpy_bytes kernel
|
||||
|
||||
const int nth = MIN(1024, ne00);
|
||||
const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
@@ -1289,7 +1293,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
|
||||
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
@@ -1789,8 +1793,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
|
||||
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
|
||||
// TODO: how to make this an array? read Metal docs
|
||||
for (int j = 0; j < n_as; ++j) {
|
||||
struct ggml_tensor * src_cur = dst->src[2 + j];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
|
||||
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
||||
|
||||
size_t offs_src_cur = 0;
|
||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
||||
@@ -1913,8 +1918,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
|
||||
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
|
||||
// TODO: how to make this an array? read Metal docs
|
||||
for (int j = 0; j < n_as; ++j) {
|
||||
struct ggml_tensor * src_cur = dst->src[2 + j];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
|
||||
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
||||
|
||||
size_t offs_src_cur = 0;
|
||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
||||
@@ -2233,7 +2239,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
|
||||
+265
-210
@@ -59,26 +59,26 @@ kernel void kernel_add(
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant int64_t & nb00,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant int64_t & nb03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant int64_t & nb13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant int64_t & nb0,
|
||||
constant int64_t & nb1,
|
||||
constant int64_t & nb2,
|
||||
constant int64_t & nb3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int64_t & offs,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
@@ -109,26 +109,26 @@ kernel void kernel_mul(
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant int64_t & nb00,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant int64_t & nb03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant int64_t & nb13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant int64_t & nb0,
|
||||
constant int64_t & nb1,
|
||||
constant int64_t & nb2,
|
||||
constant int64_t & nb3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
@@ -158,26 +158,26 @@ kernel void kernel_div(
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant int64_t & nb00,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant int64_t & nb03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant int64_t & nb13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant int64_t & nb0,
|
||||
constant int64_t & nb1,
|
||||
constant int64_t & nb2,
|
||||
constant int64_t & nb3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
@@ -205,7 +205,7 @@ kernel void kernel_add_row(
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
constant int64_t & nb [[buffer(28)]],
|
||||
constant uint64_t & nb [[buffer(28)]],
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] + src1[tpig % nb];
|
||||
}
|
||||
@@ -214,7 +214,7 @@ kernel void kernel_mul_row(
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
constant int64_t & nb [[buffer(28)]],
|
||||
constant uint64_t & nb [[buffer(28)]],
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * src1[tpig % nb];
|
||||
}
|
||||
@@ -223,7 +223,7 @@ kernel void kernel_div_row(
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
constant int64_t & nb [[buffer(28)]],
|
||||
constant uint64_t & nb [[buffer(28)]],
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] / src1[tpig % nb];
|
||||
}
|
||||
@@ -307,26 +307,26 @@ kernel void kernel_sum_rows(
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant int64_t & nb00,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant int64_t & nb03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant int64_t & nb13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant int64_t & nb0,
|
||||
constant int64_t & nb1,
|
||||
constant int64_t & nb2,
|
||||
constant int64_t & nb3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tpig[[thread_position_in_grid]]) {
|
||||
int64_t i3 = tpig.z;
|
||||
int64_t i2 = tpig.y;
|
||||
@@ -920,14 +920,21 @@ kernel void kernel_mul_mv_q4_0_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -939,14 +946,21 @@ kernel void kernel_mul_mv_q4_1_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -958,14 +972,21 @@ kernel void kernel_mul_mv_q5_0_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -977,14 +998,21 @@ kernel void kernel_mul_mv_q5_1_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -1071,12 +1099,19 @@ kernel void kernel_mul_mv_q8_0_f32(
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -1182,8 +1217,8 @@ kernel void kernel_mul_mv_f32_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
|
||||
@@ -1209,8 +1244,8 @@ kernel void kernel_mul_mv_f16_f16(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
@@ -1346,8 +1381,8 @@ kernel void kernel_mul_mv_f16_f32_1row(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
|
||||
@@ -1452,8 +1487,8 @@ kernel void kernel_mul_mv_f16_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
|
||||
@@ -1478,8 +1513,8 @@ kernel void kernel_mul_mv_f16_f32_l4(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
@@ -1543,7 +1578,8 @@ kernel void kernel_alibi_f32(
|
||||
const int64_t i3 = n / (ne2*ne1*ne0);
|
||||
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
//const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
const int64_t k = i3*ne3 + i2;
|
||||
|
||||
float m_k;
|
||||
@@ -2410,22 +2446,6 @@ typedef struct {
|
||||
} block_q6_K;
|
||||
// 210 bytes / block
|
||||
|
||||
static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
||||
uchar4 r;
|
||||
if (j < 4) {
|
||||
r[0] = q[j+0] & 63;
|
||||
r[2] = q[j+1] & 63;
|
||||
r[1] = q[j+4] & 63;
|
||||
r[3] = q[j+5] & 63;
|
||||
} else {
|
||||
r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
|
||||
r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4);
|
||||
r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
void kernel_mul_mv_q2_K_f32_impl(
|
||||
@@ -2584,14 +2604,21 @@ kernel void kernel_mul_mv_q2_K_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -2841,14 +2868,21 @@ kernel void kernel_mul_mv_q3_K_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -2984,8 +3018,8 @@ void kernel_mul_mv_q4_K_f32_impl(
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int ix = tiisg/4; // 0...7
|
||||
const int it = tiisg%4; // 0...3
|
||||
@@ -2994,7 +3028,7 @@ void kernel_mul_mv_q4_K_f32_impl(
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
const int first_row = r0 * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
@@ -3060,7 +3094,7 @@ void kernel_mul_mv_q4_K_f32_impl(
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0+ im*ne0*ne1 + first_row + row] = all_sum;
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3072,14 +3106,21 @@ kernel void kernel_mul_mv_q4_K_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -3271,14 +3312,21 @@ kernel void kernel_mul_mv_q5_K_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -3398,14 +3446,21 @@ kernel void kernel_mul_mv_q6_K_f32(
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & r2 [[buffer(17)]],
|
||||
constant uint & r3 [[buffer(18)]],
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
@@ -3523,7 +3578,7 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg
|
||||
device const int8_t * qs = ((device const int8_t *)xb->qs);
|
||||
const half d = xb->d;
|
||||
|
||||
for (int i=0;i<16;i++) {
|
||||
for (int i = 0; i < 16; i++) {
|
||||
reg[i/4][i%4] = (qs[i + 16*il] * d);
|
||||
}
|
||||
}
|
||||
@@ -3792,12 +3847,12 @@ void kernel_mul_mm_impl(device const uchar * src0,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
@@ -3924,12 +3979,12 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
@@ -3965,19 +4020,19 @@ kernel void kernel_mul_mm_id(
|
||||
device const uchar * ids,
|
||||
device const uchar * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4070,12 +4125,12 @@ typedef void (mat_mm_t)(
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
@@ -4104,19 +4159,19 @@ typedef void (mat_mm_id_t)(
|
||||
device const uchar * ids,
|
||||
device const uchar * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & nb01,
|
||||
constant int64_t & nb02,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant int64_t & nb10,
|
||||
constant int64_t & nb11,
|
||||
constant int64_t & nb12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4153,7 +4208,7 @@ kernel void kernel_mul_mv_id_f32_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4169,7 +4224,7 @@ kernel void kernel_mul_mv_id_f32_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4222,7 +4277,7 @@ kernel void kernel_mul_mv_id_f16_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4238,7 +4293,7 @@ kernel void kernel_mul_mv_id_f16_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4291,7 +4346,7 @@ kernel void kernel_mul_mv_id_q8_0_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4307,7 +4362,7 @@ kernel void kernel_mul_mv_id_q8_0_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4354,7 +4409,7 @@ kernel void kernel_mul_mv_id_q4_0_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4370,7 +4425,7 @@ kernel void kernel_mul_mv_id_q4_0_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4417,7 +4472,7 @@ kernel void kernel_mul_mv_id_q4_1_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4433,7 +4488,7 @@ kernel void kernel_mul_mv_id_q4_1_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4480,7 +4535,7 @@ kernel void kernel_mul_mv_id_q5_0_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4496,7 +4551,7 @@ kernel void kernel_mul_mv_id_q5_0_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4543,7 +4598,7 @@ kernel void kernel_mul_mv_id_q5_1_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4559,7 +4614,7 @@ kernel void kernel_mul_mv_id_q5_1_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4606,7 +4661,7 @@ kernel void kernel_mul_mv_id_q2_K_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4622,7 +4677,7 @@ kernel void kernel_mul_mv_id_q2_K_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4669,7 +4724,7 @@ kernel void kernel_mul_mv_id_q3_K_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4685,7 +4740,7 @@ kernel void kernel_mul_mv_id_q3_K_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4732,7 +4787,7 @@ kernel void kernel_mul_mv_id_q4_K_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4748,7 +4803,7 @@ kernel void kernel_mul_mv_id_q4_K_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4795,7 +4850,7 @@ kernel void kernel_mul_mv_id_q5_K_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4811,7 +4866,7 @@ kernel void kernel_mul_mv_id_q5_K_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
@@ -4858,7 +4913,7 @@ kernel void kernel_mul_mv_id_q6_K_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device uchar * dst,
|
||||
constant int64_t & nbi1,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -4874,7 +4929,7 @@ kernel void kernel_mul_mv_id_q6_K_f32(
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & nb1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
|
||||
+61
-62
@@ -410,13 +410,17 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1103,8 +1107,6 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int
|
||||
}
|
||||
}
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
|
||||
static const int qk = QK4_1;
|
||||
|
||||
@@ -1112,9 +1114,6 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
printf("d = %9f\n", GGML_FP16_TO_FP32(x[0].d));
|
||||
printf("m = %9f\n", GGML_FP16_TO_FP32(x[0].m));
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
const float m = GGML_FP16_TO_FP32(x[i].m);
|
||||
@@ -2486,8 +2485,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
||||
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
||||
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
@@ -2774,8 +2773,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
||||
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
||||
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
@@ -2941,11 +2940,11 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
@@ -3233,11 +3232,11 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
||||
@@ -3488,12 +3487,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
||||
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
@@ -3603,8 +3602,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
// We use this macro instead of a function call because for some reason
|
||||
// the code runs 2-3% slower, even if the function is declared inline
|
||||
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
||||
|
||||
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
|
||||
@@ -3978,10 +3977,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
|
||||
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
|
||||
|
||||
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
||||
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
||||
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
||||
isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
||||
isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
|
||||
sum += d * (isum1 + isum2);
|
||||
}
|
||||
@@ -4261,10 +4260,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
||||
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
@@ -4278,10 +4277,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
||||
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
@@ -4762,10 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
|
||||
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
|
||||
sum += d * isum;
|
||||
|
||||
@@ -5114,14 +5113,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
|
||||
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
sumi1 += vaddvq_s32(p1) * scales[2*j+0];
|
||||
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
||||
|
||||
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
|
||||
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
|
||||
}
|
||||
@@ -5454,13 +5453,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
|
||||
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32_t sumi1 = vaddvq_s32(p1) * scales[0];
|
||||
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
||||
|
||||
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
||||
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
|
||||
|
||||
sumf += d * (sumi1 + sumi2);
|
||||
@@ -5727,8 +5726,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
|
||||
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
|
||||
|
||||
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
||||
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
||||
sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
||||
sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
||||
}
|
||||
|
||||
sumf += d * sumi - dmin * sumi_mins;
|
||||
@@ -6117,10 +6116,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
|
||||
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
|
||||
|
||||
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
||||
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
||||
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
||||
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
||||
int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
||||
int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
||||
int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
||||
int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
||||
|
||||
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
|
||||
}
|
||||
@@ -6404,10 +6403,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
|
||||
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
@@ -6431,10 +6430,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
|
||||
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
scale += 4;
|
||||
}
|
||||
//sum += isum * d_all * y[i].d;
|
||||
@@ -6821,10 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
|
||||
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
|
||||
sum += isum * d_all * y[i].d;
|
||||
|
||||
|
||||
@@ -46,6 +46,8 @@ class Keys:
|
||||
HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
|
||||
MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
|
||||
CLAMP_KQV = "{arch}.attention.clamp_kqv"
|
||||
KEY_LENGTH = "{arch}.attention.key_length"
|
||||
VALUE_LENGTH = "{arch}.attention.value_length"
|
||||
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
||||
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
||||
|
||||
|
||||
@@ -333,6 +333,12 @@ class GGUFWriter:
|
||||
def add_head_count_kv(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
|
||||
|
||||
def add_key_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_value_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_max_alibi_bias(self, bias: float) -> None:
|
||||
self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
|
||||
|
||||
|
||||
@@ -68,7 +68,6 @@
|
||||
#include <initializer_list>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <numeric>
|
||||
#include <queue>
|
||||
#include <random>
|
||||
@@ -245,6 +244,8 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
|
||||
LLM_KV_ATTENTION_CLAMP_KQV,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH,
|
||||
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
||||
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
||||
|
||||
@@ -297,6 +298,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
|
||||
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
|
||||
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
||||
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
||||
|
||||
@@ -1284,6 +1287,8 @@ struct llama_hparams {
|
||||
uint32_t n_head_kv;
|
||||
uint32_t n_layer;
|
||||
uint32_t n_rot;
|
||||
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
||||
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
|
||||
uint32_t n_ff;
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
@@ -1310,6 +1315,8 @@ struct llama_hparams {
|
||||
if (this->n_head_kv != other.n_head_kv) return true;
|
||||
if (this->n_layer != other.n_layer) return true;
|
||||
if (this->n_rot != other.n_rot) return true;
|
||||
if (this->n_embd_head_k != other.n_embd_head_k) return true;
|
||||
if (this->n_embd_head_v != other.n_embd_head_v) return true;
|
||||
if (this->n_ff != other.n_ff) return true;
|
||||
if (this->n_expert != other.n_expert) return true;
|
||||
if (this->n_expert_used != other.n_expert_used) return true;
|
||||
@@ -1331,12 +1338,12 @@ struct llama_hparams {
|
||||
return n_head/n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_head() const {
|
||||
return n_embd/n_head;
|
||||
uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
|
||||
return n_embd_head_k * n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_gqa() const {
|
||||
return n_embd/n_gqa();
|
||||
uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
|
||||
return n_embd_head_v * n_head_kv;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1645,8 +1652,9 @@ static bool llama_kv_cache_init(
|
||||
uint32_t n_ctx,
|
||||
int n_gpu_layers,
|
||||
bool offload) {
|
||||
const uint32_t n_embd = hparams.n_embd_gqa();
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
||||
cache.has_shift = false;
|
||||
|
||||
@@ -1677,8 +1685,8 @@ static bool llama_kv_cache_init(
|
||||
const int i_gpu_start = (int) n_layer - n_gpu_layers;
|
||||
|
||||
for (int i = 0; i < (int) n_layer; i++) {
|
||||
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
|
||||
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
|
||||
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
|
||||
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
|
||||
ggml_format_name(k, "cache_k_l%d", i);
|
||||
ggml_format_name(v, "cache_v_l%d", i);
|
||||
cache.k_l.push_back(k);
|
||||
@@ -2672,6 +2680,12 @@ static void llm_load_hparams(
|
||||
// gpt-j n_rot = rotary_dim
|
||||
}
|
||||
|
||||
hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
|
||||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
|
||||
|
||||
hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
|
||||
|
||||
// arch-specific KVs
|
||||
switch (model.arch) {
|
||||
case LLM_ARCH_LLAMA:
|
||||
@@ -3082,8 +3096,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
|
||||
LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
|
||||
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
|
||||
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
|
||||
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
|
||||
LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
||||
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
|
||||
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
|
||||
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
|
||||
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
||||
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
||||
@@ -3173,10 +3191,11 @@ static bool llm_load_tensors(
|
||||
|
||||
// create tensors for the weights
|
||||
{
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
const int64_t n_vocab = hparams.n_vocab;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
const int64_t n_vocab = hparams.n_vocab;
|
||||
|
||||
const auto tn = LLM_TN(model.arch);
|
||||
switch (model.arch) {
|
||||
@@ -3202,7 +3221,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3270,7 +3292,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3318,7 +3343,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3368,7 +3396,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3420,7 +3451,11 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
model.layers.resize(n_layer);
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
@@ -3469,7 +3504,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3520,7 +3558,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3567,7 +3608,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3665,7 +3709,10 @@ static bool llm_load_tensors(
|
||||
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3714,7 +3761,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -3761,7 +3811,10 @@ static bool llm_load_tensors(
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
|
||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
@@ -4000,8 +4053,8 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
// Persimmon: n_rot = n_embd_head/2
|
||||
// Other: n_rot = n_embd_head
|
||||
// Persimmon: n_rot = n_embd_head_k/2
|
||||
// Other: n_rot = n_embd_head_k
|
||||
static void llm_build_k_shift(
|
||||
struct ggml_context * ctx,
|
||||
const llama_hparams & hparams,
|
||||
@@ -4014,17 +4067,17 @@ static void llm_build_k_shift(
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
const llm_build_cb & cb) {
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||
const int64_t n_embd_head = hparams.n_embd_head();
|
||||
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
|
||||
const float ext_factor = cparams.yarn_ext_factor;
|
||||
const float attn_factor = cparams.yarn_attn_factor;
|
||||
const float beta_fast = cparams.yarn_beta_fast;
|
||||
const float beta_slow = cparams.yarn_beta_slow;
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
|
||||
const float ext_factor = cparams.yarn_ext_factor;
|
||||
const float attn_factor = cparams.yarn_attn_factor;
|
||||
const float beta_fast = cparams.yarn_beta_fast;
|
||||
const float beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
GGML_ASSERT(n_embd_head % n_rot == 0);
|
||||
GGML_ASSERT(n_embd_head_k % n_rot == 0);
|
||||
|
||||
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
|
||||
cb(K_shift, "K_shift", -1);
|
||||
@@ -4042,9 +4095,9 @@ static void llm_build_k_shift(
|
||||
// we rotate only the first n_rot dimensions
|
||||
ggml_rope_custom_inplace(ctx,
|
||||
ggml_view_3d(ctx, kv.k_l[il],
|
||||
n_embd_head, n_head_kv, n_ctx,
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_head),
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
|
||||
n_embd_head_k, n_head_kv, n_ctx,
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
|
||||
0),
|
||||
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
@@ -4065,18 +4118,19 @@ static void llm_build_kv_store(
|
||||
int32_t kv_head,
|
||||
const llm_build_cb & cb,
|
||||
int64_t il) {
|
||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
// compute the transposed [n_tokens, n_embd] V matrix
|
||||
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens));
|
||||
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
|
||||
//struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
|
||||
cb(v_cur_t, "v_cur_t", il);
|
||||
|
||||
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
|
||||
(ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
|
||||
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
||||
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
|
||||
cb(k_cache_view, "k_cache_view", il);
|
||||
|
||||
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
|
||||
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
|
||||
( n_ctx)*ggml_element_size(kv.v_l[il]),
|
||||
(kv_head)*ggml_element_size(kv.v_l[il]));
|
||||
cb(v_cache_view, "v_cache_view", il);
|
||||
@@ -4226,20 +4280,20 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
float kq_scale,
|
||||
const llm_build_cb & cb,
|
||||
int il) {
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_head = hparams.n_head;
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_embd_head = hparams.n_embd_head();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||
const int64_t n_head = hparams.n_head;
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
||||
cb(q, "q", il);
|
||||
|
||||
struct ggml_tensor * k =
|
||||
ggml_view_3d(ctx, kv.k_l[il],
|
||||
n_embd_head, n_kv, n_head_kv,
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_head),
|
||||
n_embd_head_k, n_kv, n_head_kv,
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
|
||||
0);
|
||||
cb(k, "k", il);
|
||||
|
||||
@@ -4278,9 +4332,9 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
// split cached v into n_head heads
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, kv.v_l[il],
|
||||
n_kv, n_embd_head, n_head_kv,
|
||||
n_kv, n_embd_head_v, n_head_kv,
|
||||
ggml_element_size(kv.v_l[il])*n_ctx,
|
||||
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
|
||||
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
|
||||
0);
|
||||
cb(v, "v", il);
|
||||
|
||||
@@ -4290,7 +4344,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
||||
cb(kqv_merged, "kqv_merged", il);
|
||||
|
||||
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
|
||||
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
||||
cb(cur, "kqv_merged_cont", il);
|
||||
|
||||
cur = ggml_mul_mat(ctx, wo, cur);
|
||||
@@ -4317,8 +4371,10 @@ struct llm_build_context {
|
||||
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
|
||||
const int64_t n_head;
|
||||
const int64_t n_head_kv;
|
||||
const int64_t n_embd_head;
|
||||
const int64_t n_embd_gqa;
|
||||
const int64_t n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa;
|
||||
const int64_t n_embd_head_v;
|
||||
const int64_t n_embd_v_gqa;
|
||||
const int64_t n_expert;
|
||||
const int64_t n_expert_used;
|
||||
|
||||
@@ -4360,8 +4416,10 @@ struct llm_build_context {
|
||||
n_ctx (cparams.n_ctx),
|
||||
n_head (hparams.n_head),
|
||||
n_head_kv (hparams.n_head_kv),
|
||||
n_embd_head (hparams.n_embd_head()),
|
||||
n_embd_gqa (hparams.n_embd_gqa()),
|
||||
n_embd_head_k (hparams.n_embd_head_k),
|
||||
n_embd_k_gqa (hparams.n_embd_k_gqa()),
|
||||
n_embd_head_v (hparams.n_embd_head_v),
|
||||
n_embd_v_gqa (hparams.n_embd_v_gqa()),
|
||||
n_expert (hparams.n_expert),
|
||||
n_expert_used (hparams.n_expert_used),
|
||||
freq_base (cparams.rope_freq_base),
|
||||
@@ -4404,6 +4462,8 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_llama() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
@@ -4588,6 +4648,9 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_baichuan() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
@@ -4705,6 +4768,11 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_falcon() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_gqa == n_embd);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
@@ -4824,6 +4892,11 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_starcoder() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_gqa == n_embd);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * pos;
|
||||
struct ggml_tensor * inpL;
|
||||
@@ -4920,7 +4993,12 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_persimmon() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_rot = n_embd_head / 2;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_gqa == n_embd);
|
||||
|
||||
const int64_t n_rot = n_embd_head_k / 2;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
@@ -5129,6 +5207,11 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_refact() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_gqa == n_embd);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
@@ -5217,6 +5300,11 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_bloom() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_gqa == n_embd);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
@@ -5308,6 +5396,11 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_mpt() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_gqa == n_embd);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
@@ -5403,6 +5496,9 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_stablelm() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
@@ -5513,6 +5609,9 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_qwen() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
@@ -5624,6 +5723,11 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_phi2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_gqa == n_embd);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * attn_norm_output;
|
||||
struct ggml_tensor * ffn_output;
|
||||
@@ -5736,6 +5840,9 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_plamo() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
@@ -5840,6 +5947,11 @@ struct llm_build_context {
|
||||
struct ggml_cgraph * build_gpt2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_gqa == n_embd);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * pos;
|
||||
struct ggml_tensor * inpL;
|
||||
@@ -8972,7 +9084,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
std::vector<std::thread> workers;
|
||||
workers.reserve(nthread);
|
||||
std::mutex mutex;
|
||||
|
||||
int idx = 0;
|
||||
|
||||
@@ -9046,7 +9157,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
} else {
|
||||
const size_t nelements = ggml_nelements(tensor);
|
||||
const size_t ne = ggml_nelements(tensor);
|
||||
|
||||
float * f32_data;
|
||||
|
||||
@@ -9055,53 +9166,60 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
||||
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
||||
} else {
|
||||
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, ne, nthread);
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
if (work.size() < nelements * 4) {
|
||||
work.resize(nelements * 4); // upper bound on size
|
||||
if (work.size() < ne * 4) {
|
||||
work.resize(ne * 4); // upper bound on size
|
||||
}
|
||||
new_data = work.data();
|
||||
|
||||
std::array<int64_t, 1 << 4> hist_cur = {};
|
||||
|
||||
static const int chunk_size = 32 * 512;
|
||||
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
|
||||
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
||||
if (nthread_use < 2) {
|
||||
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
|
||||
} else {
|
||||
size_t counter = 0;
|
||||
new_size = 0;
|
||||
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
|
||||
std::array<int64_t, 1 << 4> local_hist = {};
|
||||
size_t local_size = 0;
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
size_t first = counter; counter += chunk_size;
|
||||
if (first >= nelements) {
|
||||
if (local_size > 0) {
|
||||
for (int j=0; j<int(local_hist.size()); ++j) {
|
||||
hist_cur[j] += local_hist[j];
|
||||
}
|
||||
new_size += local_size;
|
||||
}
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
size_t last = std::min(nelements, first + chunk_size);
|
||||
{
|
||||
static const size_t chunk_size = 32*512;
|
||||
|
||||
const int nchunk = GGML_PAD(ne, chunk_size)/chunk_size;
|
||||
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
||||
|
||||
std::vector<size_t> size_th(nthread_use, 0);
|
||||
std::vector<std::array<int64_t, 1 << 4>> hist_cur_th(nthread_use);
|
||||
|
||||
auto compute = [&size_th, &hist_cur_th, new_type, f32_data, new_data, ne, nchunk, nthread_use](int tid) {
|
||||
auto & local_size = size_th[tid];
|
||||
auto & local_hist = hist_cur_th[tid];
|
||||
|
||||
for (int ch = tid; ch < nchunk; ch += nthread_use) {
|
||||
const size_t first = ch * chunk_size;
|
||||
const size_t last = std::min(ne, first + chunk_size);
|
||||
|
||||
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
||||
}
|
||||
};
|
||||
|
||||
for (int it = 0; it < nthread_use - 1; ++it) {
|
||||
workers.emplace_back(compute);
|
||||
workers.emplace_back(compute, it);
|
||||
}
|
||||
compute();
|
||||
for (auto & w : workers) { w.join(); }
|
||||
|
||||
compute(nthread_use - 1);
|
||||
|
||||
for (auto & w : workers) {
|
||||
w.join();
|
||||
}
|
||||
|
||||
workers.clear();
|
||||
|
||||
new_size = 0;
|
||||
for (int it = 0; it < nthread_use; ++it) {
|
||||
for (int j = 0; j < int(hist_cur.size()); ++j) {
|
||||
hist_cur[j] += hist_cur_th[it][j];
|
||||
}
|
||||
new_size += size_th[it];
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
@@ -9113,7 +9231,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
if (tot_count > 0) {
|
||||
for (size_t i = 0; i < hist_cur.size(); i++) {
|
||||
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
|
||||
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(ne));
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_INFO("\n");
|
||||
@@ -9627,8 +9745,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
const ggml_type type_k = params.type_k;
|
||||
const ggml_type type_v = params.type_v;
|
||||
|
||||
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
|
||||
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
|
||||
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
|
||||
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
|
||||
|
||||
// reserve memory for context buffers
|
||||
if (!hparams.vocab_only) {
|
||||
@@ -10172,9 +10290,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
const auto & cparams = ctx->cparams;
|
||||
|
||||
const auto n_layer = hparams.n_layer;
|
||||
const auto n_embd = hparams.n_embd_gqa();
|
||||
const auto n_ctx = cparams.n_ctx;
|
||||
const auto n_layer = hparams.n_layer;
|
||||
const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const auto n_ctx = cparams.n_ctx;
|
||||
|
||||
const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
|
||||
const uint32_t kv_head = kv_self.head;
|
||||
@@ -10196,15 +10315,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
||||
std::vector<struct ggml_tensor *> vout2d(n_layer);
|
||||
|
||||
for (int il = 0; il < (int) n_layer; ++il) {
|
||||
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
||||
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
||||
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
|
||||
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
|
||||
|
||||
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
||||
n_embd, kv_head,
|
||||
elt_size*n_embd, 0);
|
||||
n_embd_k_gqa, kv_head,
|
||||
elt_size*n_embd_k_gqa, 0);
|
||||
|
||||
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
|
||||
kv_head, n_embd,
|
||||
kv_head, n_embd_v_gqa,
|
||||
elt_size*n_ctx, 0);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
|
||||
@@ -10311,9 +10430,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
const auto & cparams = ctx->cparams;
|
||||
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_embd = hparams.n_embd_gqa();
|
||||
const int n_ctx = cparams.n_ctx;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const int n_ctx = cparams.n_ctx;
|
||||
|
||||
size_t kv_buf_size;
|
||||
uint32_t kv_head;
|
||||
@@ -10337,15 +10457,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||
std::vector<struct ggml_tensor *> vin2d(n_layer);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
||||
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
||||
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
|
||||
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
|
||||
|
||||
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
||||
n_embd, kv_head,
|
||||
elt_size*n_embd, 0);
|
||||
n_embd_k_gqa, kv_head,
|
||||
elt_size*n_embd_k_gqa, 0);
|
||||
|
||||
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
|
||||
kv_head, n_embd,
|
||||
kv_head, n_embd_v_gqa,
|
||||
elt_size*n_ctx, 0);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
|
||||
|
||||
+10
-17
@@ -15,19 +15,18 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
|
||||
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
||||
size_t size = ggml_nelements(tensor);
|
||||
std::vector<float> data(size);
|
||||
|
||||
#if 0
|
||||
std::default_random_engine generator(rd());
|
||||
static std::default_random_engine generator(1234);
|
||||
std::uniform_real_distribution<float> distribution(min, max);
|
||||
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
data[i] = distribution(generator);
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
auto init_thread = [&](size_t start, size_t end) {
|
||||
std::random_device rd;
|
||||
std::default_random_engine generator(rd());
|
||||
@@ -49,6 +48,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
for (auto & t : threads) {
|
||||
t.join();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
||||
@@ -437,12 +437,11 @@ struct test_case {
|
||||
double err = nmse(f1.data(), f2.data(), f1.size());
|
||||
if (err > ud->max_err) {
|
||||
printf("[%s] NMSE = %f ", ggml_op_desc(t1), err);
|
||||
printf("\n");
|
||||
for (int i = 0; i < f1.size(); i++) {
|
||||
printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
|
||||
}
|
||||
printf("\n");
|
||||
exit(1);
|
||||
//for (int i = 0; i < (int) f1.size(); i++) {
|
||||
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
|
||||
//}
|
||||
//printf("\n");
|
||||
//exit(1);
|
||||
ud->ok = false;
|
||||
}
|
||||
return true;
|
||||
@@ -1460,14 +1459,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
|
||||
test_cases.emplace_back(new test_dup());
|
||||
|
||||
//for (ggml_type type : all_types) {
|
||||
// test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
|
||||
//}
|
||||
|
||||
for (ggml_type type : { GGML_TYPE_Q4_1} ) {
|
||||
for (int i = 0; i < 2048; ++i) {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {32, 1, 1, 1}));
|
||||
}
|
||||
for (ggml_type type : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_cont());
|
||||
|
||||
Reference in New Issue
Block a user