mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 01:57:43 +02:00
Compare commits
91 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1d735c0b4f | |||
| 5368ddda7a | |||
| 84a9bf2fc2 | |||
| 2016f07bd1 | |||
| 6602304814 | |||
| 66168204be | |||
| 4ba9d711ba | |||
| 00137157fc | |||
| fb28f4f80e | |||
| 37b9f0d29d | |||
| 6408210082 | |||
| aff9d107b0 | |||
| 35370ba945 | |||
| 8d66005763 | |||
| b9154ecff9 | |||
| 2db9ba1464 | |||
| 2f74c354c0 | |||
| 207c22ec2d | |||
| 7a395f67a7 | |||
| 971f245b3b | |||
| 12b17501e6 | |||
| 015022bb53 | |||
| b43d89e311 | |||
| 80f19b4186 | |||
| f8f820cc4d | |||
| 54a7272043 | |||
| 84778e9770 | |||
| 510676475f | |||
| daa422881a | |||
| eccc7a1602 | |||
| 0019279bb5 | |||
| b0c75ac9f9 | |||
| d6d2c2ab8c | |||
| 75afa0ae31 | |||
| c772d54926 | |||
| 81c7e64fc2 | |||
| 526739b879 | |||
| a25355e264 | |||
| e959d32b1c | |||
| 307bfa253d | |||
| 71e90e8813 | |||
| bc091a4dc5 | |||
| a4837577aa | |||
| e59ea539b8 | |||
| c94085df28 | |||
| e8a62631b3 | |||
| b6930ebc42 | |||
| 68b08f36d0 | |||
| 578754b315 | |||
| b2034c2b55 | |||
| 06bb53ad9b | |||
| 0c50923944 | |||
| fccf9cae83 | |||
| ec6c09d0fa | |||
| 8ac9f5d765 | |||
| 12e9158f25 | |||
| 5b1f13cb64 | |||
| 8b91d5355a | |||
| 0fed24c347 | |||
| 47ba87d0a4 | |||
| 1d2b613445 | |||
| eb420e1148 | |||
| cb79c2e7fa | |||
| fe92821ea9 | |||
| 459895c326 | |||
| e4bf72d631 | |||
| 8b9cc7cdd8 | |||
| 64eda5deb9 | |||
| fe5b78c896 | |||
| 11d07e1e69 | |||
| b0091ecc1e | |||
| 31f7803bc4 | |||
| 2391506ace | |||
| d3bd7193ba | |||
| d9a63b2f2e | |||
| 8ed71242f4 | |||
| 381603a775 | |||
| 65a69e6e1b | |||
| 47277d6d1d | |||
| 6e1c4cebdb | |||
| 0090950f67 | |||
| 7ecd780b1a | |||
| 7538246e7c | |||
| b32efad2bc | |||
| a19b5cef16 | |||
| 78a1ba0a4f | |||
| 2dabf759e7 | |||
| 1d343b4069 | |||
| 8ca6e1c3a4 | |||
| 656babd6c2 | |||
| a226bc7a9a |
@@ -21,7 +21,7 @@ COPY . .
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
echo "Building with dynamic libs" && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
|
||||
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
|
||||
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS build
|
||||
|
||||
@@ -6,7 +6,7 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN yum install -y gcc g++ cmake make
|
||||
RUN yum install -y gcc g++ cmake make libcurl-devel
|
||||
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
|
||||
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
|
||||
|
||||
@@ -35,7 +35,7 @@ COPY . .
|
||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -17,8 +17,8 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
# gfx906 is deprecated
|
||||
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
|
||||
|
||||
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
|
||||
ARG ROCM_DOCKER_ARCH=gfx1100
|
||||
ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
|
||||
#ARG ROCM_DOCKER_ARCH=gfx1100
|
||||
|
||||
# Set nvcc architectured
|
||||
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
@@ -40,7 +40,7 @@ WORKDIR /app
|
||||
COPY . .
|
||||
|
||||
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release \
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
|
||||
&& cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib \
|
||||
|
||||
@@ -16,7 +16,7 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -601,8 +601,9 @@ jobs:
|
||||
-DGGML_SYCL_F16=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
build-linux-cross:
|
||||
uses: ./.github/workflows/build-linux-cross.yml
|
||||
# Disabled for now due to sporadic issue syncing.
|
||||
# build-linux-cross:
|
||||
# uses: ./.github/workflows/build-linux-cross.yml
|
||||
|
||||
macOS-latest-cmake-ios:
|
||||
runs-on: macos-latest
|
||||
@@ -1766,16 +1767,17 @@ jobs:
|
||||
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
runs-on: ubuntu-24.04-arm
|
||||
shell: bash -el {0}
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [x86, aarch64]
|
||||
cann:
|
||||
- '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
|
||||
- '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
|
||||
device:
|
||||
- 'ascend910b3'
|
||||
build:
|
||||
- 'Release'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
container: ascendai/cann:${{ matrix.cann }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -1784,7 +1786,7 @@ jobs:
|
||||
- name: Dependencies
|
||||
run: |
|
||||
yum update -y
|
||||
yum install -y git gcc gcc-c++ make cmake
|
||||
yum install -y git gcc gcc-c++ make cmake libcurl-devel
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
|
||||
@@ -36,13 +36,13 @@ jobs:
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
@@ -780,10 +780,6 @@ ifdef GGML_HIP
|
||||
|
||||
MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
|
||||
|
||||
ifdef GGML_HIP_UMA
|
||||
MK_CPPFLAGS += -DGGML_HIP_UMA
|
||||
endif # GGML_HIP_UMA
|
||||
|
||||
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
||||
MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
|
||||
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
||||
|
||||
@@ -9,13 +9,6 @@
|
||||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
|
||||
> [!IMPORTANT]
|
||||
> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
|
||||
>
|
||||
> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
|
||||
>
|
||||
> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
|
||||
|
||||
## Recent API changes
|
||||
|
||||
- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
|
||||
@@ -23,6 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
|
||||
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
|
||||
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
@@ -104,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
|
||||
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
|
||||
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
|
||||
- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)
|
||||
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
|
||||
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
|
||||
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
|
||||
@@ -247,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [Vulkan](docs/build.md#vulkan) | GPU |
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
|
||||
|
||||
## Building the project
|
||||
|
||||
@@ -265,7 +261,9 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
|
||||
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
|
||||
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
|
||||
|
||||
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
|
||||
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
|
||||
|
||||
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
|
||||
|
||||
After downloading a model, use the CLI tools to run it locally - see below.
|
||||
|
||||
|
||||
+16
-4
@@ -41,6 +41,11 @@ COMMON_CMAKE_ARGS=(
|
||||
-DGGML_OPENMP=${GGML_OPENMP}
|
||||
)
|
||||
|
||||
XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
|
||||
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
|
||||
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
|
||||
echo "Detected Xcode version: $XCODE_VERSION"
|
||||
|
||||
check_required_tool() {
|
||||
local tool=$1
|
||||
local install_message=$2
|
||||
@@ -325,21 +330,28 @@ combine_static_libraries() {
|
||||
|
||||
# Platform-specific post-processing for device builds
|
||||
if [[ "$is_simulator" == "false" ]]; then
|
||||
if command -v vtool &>/dev/null; then
|
||||
if command -v xcrun vtool &>/dev/null; then
|
||||
case "$platform" in
|
||||
"ios")
|
||||
echo "Marking binary as a framework binary for iOS..."
|
||||
vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"visionos")
|
||||
echo "Marking binary as a framework binary for visionOS..."
|
||||
vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
|
||||
echo "Xcode version greater than 16.2, using visionOS."
|
||||
VISION_OS_BUILD_VERSION="visionos"
|
||||
else
|
||||
echo "Xcode version less than or equal to 16.2, using xros."
|
||||
VISION_OS_BUILD_VERSION="xros"
|
||||
fi
|
||||
xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"tvos")
|
||||
echo "Marking binary as a framework binary for tvOS..."
|
||||
vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
esac
|
||||
|
||||
+11
-10
@@ -163,6 +163,8 @@ struct common_hf_file_res {
|
||||
# if !defined(PATH_MAX)
|
||||
# define PATH_MAX MAX_PATH
|
||||
# endif
|
||||
#elif defined(_AIX)
|
||||
#include <sys/limits.h>
|
||||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
@@ -226,12 +228,13 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
// Check if hf-token or bearer-token was specified
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
}
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
#if defined(_WIN32)
|
||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||
@@ -542,7 +545,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::string res_str;
|
||||
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
|
||||
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
|
||||
std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
@@ -657,13 +663,8 @@ static void common_params_handle_model(
|
||||
}
|
||||
}
|
||||
|
||||
std::string hf_endpoint = "https://huggingface.co/";
|
||||
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
|
||||
if (hf_endpoint_env) {
|
||||
hf_endpoint = hf_endpoint_env;
|
||||
if (hf_endpoint.back() != '/') hf_endpoint += '/';
|
||||
}
|
||||
model.url = hf_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
|
||||
// make sure model path is present (for caching purposes)
|
||||
if (model.path.empty()) {
|
||||
// this is to avoid different repo having same file name, or same file name in different subdirs
|
||||
@@ -2725,7 +2726,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.chat_template = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
||||
add_opt(common_arg(
|
||||
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
|
||||
string_format(
|
||||
|
||||
+1
-1
@@ -1622,7 +1622,7 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
}
|
||||
|
||||
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
|
||||
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
|
||||
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) {
|
||||
return common_chat_params_init_hermes_2_pro(tmpl, params);
|
||||
}
|
||||
|
||||
|
||||
+17
-2
@@ -830,7 +830,7 @@ std::string fs_get_cache_directory() {
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
} else {
|
||||
#ifdef __linux__
|
||||
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
cache_directory = std::getenv("XDG_CACHE_HOME");
|
||||
} else {
|
||||
@@ -840,7 +840,9 @@ std::string fs_get_cache_directory() {
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#endif // __linux__
|
||||
#else
|
||||
# error Unknown architecture
|
||||
#endif
|
||||
cache_directory = ensure_trailing_slash(cache_directory);
|
||||
cache_directory += "llama.cpp";
|
||||
}
|
||||
@@ -1027,6 +1029,19 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
return iparams;
|
||||
}
|
||||
|
||||
std::string get_model_endpoint() {
|
||||
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
|
||||
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
|
||||
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
|
||||
const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
|
||||
std::string model_endpoint = "https://huggingface.co/";
|
||||
if (endpoint_env) {
|
||||
model_endpoint = endpoint_env;
|
||||
if (model_endpoint.back() != '/') model_endpoint += '/';
|
||||
}
|
||||
return model_endpoint;
|
||||
}
|
||||
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
|
||||
llama_clear_adapter_lora(ctx);
|
||||
for (auto & la : lora) {
|
||||
|
||||
@@ -543,6 +543,8 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
||||
|
||||
std::string get_model_endpoint();
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
//
|
||||
|
||||
+483
-296
File diff suppressed because it is too large
Load Diff
@@ -114,6 +114,7 @@ models = [
|
||||
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
|
||||
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
|
||||
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
|
||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
# reuse model definitions from convert_hf_to_gguf.py
|
||||
from convert_hf_to_gguf import LazyTorchTensor, Model
|
||||
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
|
||||
|
||||
logger = logging.getLogger("lora-to-gguf")
|
||||
|
||||
@@ -340,11 +340,11 @@ if __name__ == '__main__':
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
hparams = ModelBase.load_hparams(dir_base_model)
|
||||
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -425,13 +425,13 @@ Examples:
|
||||
- Use device 0:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
@@ -697,13 +697,13 @@ Examples:
|
||||
- Use device 0:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
```
|
||||
|
||||
|
||||
|
||||
+4
-2
@@ -259,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||
&& cmake --build build --config Release -- -j 16
|
||||
```
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
|
||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||
|
||||
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
|
||||
|
||||
@@ -296,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
||||
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
|
||||
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
|
||||
|
||||
### Unified Memory
|
||||
|
||||
On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||
|
||||
## Vulkan
|
||||
|
||||
**Windows**
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
# llava (legacy)
|
||||
|
||||
add_library(llava OBJECT
|
||||
llava.cpp
|
||||
llava.h
|
||||
@@ -22,27 +24,46 @@ if (BUILD_SHARED_LIBS)
|
||||
install(TARGETS llava_shared LIBRARY)
|
||||
endif()
|
||||
|
||||
# mtmd
|
||||
|
||||
add_library(mtmd OBJECT
|
||||
mtmd.cpp
|
||||
mtmd.h
|
||||
clip.cpp
|
||||
clip.h
|
||||
clip-impl.h
|
||||
)
|
||||
|
||||
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
target_include_directories(mtmd PUBLIC .)
|
||||
target_include_directories(mtmd PRIVATE ../..)
|
||||
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
|
||||
|
||||
target_compile_features(mtmd PRIVATE cxx_std_17)
|
||||
|
||||
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
|
||||
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
install(TARGETS mtmd_shared LIBRARY)
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
endif()
|
||||
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(llava BUILD_INFO)
|
||||
add_dependencies(mtmd BUILD_INFO)
|
||||
endif()
|
||||
|
||||
set(TARGET llama-llava-cli)
|
||||
add_executable(${TARGET} llava-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-minicpmv-cli)
|
||||
add_executable(${TARGET} minicpmv-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
|
||||
set(TARGET llama-qwen2vl-cli)
|
||||
add_executable(${TARGET} qwen2vl-cli.cpp)
|
||||
@@ -51,11 +72,11 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-gemma3-cli)
|
||||
add_executable(${TARGET} gemma3-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable(${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
set(TARGET llama-llava-clip-quantize-cli)
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "clip.h"
|
||||
|
||||
#include "clip.h"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
@@ -7,6 +10,7 @@
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
// Internal header for clip.cpp
|
||||
|
||||
@@ -46,7 +50,6 @@
|
||||
// tensor name constants
|
||||
//
|
||||
|
||||
#define TN_TOKEN_EMBD "%s.token_embd.weight"
|
||||
#define TN_POS_EMBD "%s.position_embd.weight"
|
||||
#define TN_CLASS_EMBD "v.class_embd"
|
||||
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
|
||||
@@ -62,8 +65,6 @@
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_TEXT_PROJ "text_projection.weight"
|
||||
#define TN_VIS_PROJ "visual_projection.weight"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
@@ -120,6 +121,23 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
return PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<uint8_t> buf;
|
||||
};
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<float> buf;
|
||||
};
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
@@ -178,6 +196,36 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// cpp wrappers
|
||||
//
|
||||
|
||||
// wrapper for clip_image_size
|
||||
struct clip_image_size_deleter {
|
||||
void operator()(clip_image_size * val) { clip_image_size_free(val); }
|
||||
};
|
||||
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
|
||||
|
||||
// wrapper for clip_image_u8
|
||||
struct clip_image_u8_deleter {
|
||||
void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
|
||||
};
|
||||
typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
|
||||
|
||||
// wrapper for clip_image_f32
|
||||
struct clip_image_f32_deleter {
|
||||
void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
|
||||
};
|
||||
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
|
||||
|
||||
struct clip_image_u8_batch {
|
||||
std::vector<clip_image_u8_ptr> entries;
|
||||
};
|
||||
|
||||
struct clip_image_f32_batch {
|
||||
std::vector<clip_image_f32_ptr> entries;
|
||||
};
|
||||
|
||||
//
|
||||
// common utils
|
||||
//
|
||||
@@ -214,6 +262,20 @@ static void string_replace_all(std::string & s, const std::string & search, cons
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
// split string by a `std::string delim` instead of `char delim`
|
||||
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
|
||||
std::vector<std::string> tokens;
|
||||
size_t pos = 0;
|
||||
std::string token;
|
||||
while ((pos = s.find(delimiter)) != std::string::npos) {
|
||||
token = s.substr(0, pos);
|
||||
tokens.push_back(token);
|
||||
s.erase(0, pos + delimiter.length());
|
||||
}
|
||||
tokens.push_back(s);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
//
|
||||
// gguf utils
|
||||
//
|
||||
@@ -271,3 +333,9 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// API used internally with mtmd
|
||||
//
|
||||
|
||||
projector_type clip_get_projector_type(const struct clip_ctx * ctx);
|
||||
|
||||
+626
-619
File diff suppressed because it is too large
Load Diff
+23
-16
@@ -30,19 +30,13 @@ struct clip_image_size {
|
||||
int height;
|
||||
};
|
||||
|
||||
struct clip_image_u8_batch {
|
||||
struct clip_image_u8 * data;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct clip_image_f32_batch {
|
||||
struct clip_image_f32 * data;
|
||||
size_t size;
|
||||
};
|
||||
struct clip_image_f32;
|
||||
struct clip_image_u8_batch;
|
||||
struct clip_image_f32_batch;
|
||||
|
||||
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
ggml_log_level verbosity;
|
||||
enum ggml_log_level verbosity;
|
||||
};
|
||||
|
||||
// deprecated, use clip_init
|
||||
@@ -55,9 +49,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
|
||||
|
||||
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
|
||||
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
|
||||
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
|
||||
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
|
||||
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
|
||||
CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
|
||||
|
||||
// TODO: should be enum, not string
|
||||
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
|
||||
@@ -73,15 +67,26 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
|
||||
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
|
||||
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
|
||||
|
||||
CLIP_API struct clip_image_size * clip_image_size_init();
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
CLIP_API struct clip_image_size * clip_image_size_init();
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
|
||||
|
||||
// nx, ny are the output image dimensions
|
||||
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
|
||||
|
||||
CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
|
||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
|
||||
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
||||
|
||||
// use for accessing underlay data of clip_image_f32_batch
|
||||
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
|
||||
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
|
||||
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
||||
|
||||
/**
|
||||
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
|
||||
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
|
||||
@@ -106,6 +111,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
|
||||
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
std::string filename = "main";
|
||||
if (argc >= 1) {
|
||||
filename = argv[0];
|
||||
}
|
||||
|
||||
// Get only the program name from the full path
|
||||
size_t pos = filename.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
filename = filename.substr(pos+1);
|
||||
}
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
|
||||
fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
@@ -1,332 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "base64.hpp"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
int N = (int) tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(ctx_llama, tokens, 1, n_past);
|
||||
}
|
||||
|
||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
|
||||
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char * sample(struct common_sampler * smpl,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx_llama);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
static std::string ret;
|
||||
if (llama_vocab_is_eog(vocab, id)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = common_token_to_piece(ctx_llama, id);
|
||||
}
|
||||
eval_id(ctx_llama, id, n_past);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
|
||||
static const char* IMG_BASE64_TAG_END = "\">";
|
||||
|
||||
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
|
||||
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
|
||||
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
|
||||
}
|
||||
|
||||
static bool prompt_contains_image(const std::string& prompt) {
|
||||
size_t begin, end;
|
||||
find_image_tag_in_prompt(prompt, begin, end);
|
||||
return (begin != std::string::npos);
|
||||
}
|
||||
|
||||
// replaces the base64 image tag in the prompt with `replacement`
|
||||
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
|
||||
size_t img_base64_str_start, img_base64_str_end;
|
||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
||||
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
|
||||
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
|
||||
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
|
||||
|
||||
auto required_bytes = base64::required_encode_size(base64_str.size());
|
||||
auto img_bytes = std::vector<unsigned char>(required_bytes);
|
||||
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
||||
|
||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
||||
if (!embed) {
|
||||
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return embed;
|
||||
}
|
||||
|
||||
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
|
||||
size_t begin, end;
|
||||
find_image_tag_in_prompt(prompt, begin, end);
|
||||
if (begin == std::string::npos || end == std::string::npos) {
|
||||
return prompt;
|
||||
}
|
||||
auto pre = prompt.substr(0, begin);
|
||||
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
|
||||
return pre + replacement + post;
|
||||
}
|
||||
|
||||
struct llava_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
struct llama_context * ctx_llama = NULL;
|
||||
struct llama_model * model = NULL;
|
||||
};
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG("\n example usage:\n");
|
||||
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
|
||||
|
||||
// load and preprocess the image
|
||||
llava_image_embed * embed = NULL;
|
||||
auto prompt = params->prompt;
|
||||
if (prompt_contains_image(prompt)) {
|
||||
if (!params->image.empty()) {
|
||||
LOG_INF("using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||
if (!embed) {
|
||||
LOG_ERR("%s: can't load image from prompt\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
params->prompt = remove_image_from_prompt(prompt);
|
||||
} else {
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embed) {
|
||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return embed;
|
||||
}
|
||||
|
||||
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
|
||||
int n_past = 0;
|
||||
|
||||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||
|
||||
std::string system_prompt, user_prompt;
|
||||
size_t image_pos = prompt.find("<image>");
|
||||
if (image_pos != std::string::npos) {
|
||||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||
system_prompt = prompt.substr(0, image_pos);
|
||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// llava-1.5 native mode
|
||||
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
|
||||
user_prompt = prompt + "\nASSISTANT:";
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG("\n");
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||
if (!smpl) {
|
||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::string response = "";
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
LOG("%s", tmp);
|
||||
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
||||
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
||||
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
common_sampler_free(smpl);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(common_params * params) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params->numa);
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
|
||||
|
||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
||||
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
||||
|
||||
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
ctx_llava->model = model;
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_model_free(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto * model = llava_init(¶ms);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (prompt_contains_image(params.prompt)) {
|
||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto * image_embed = load_image(ctx_llava, ¶ms, "");
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
} else {
|
||||
for (auto & image : params.image) {
|
||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto * image_embed = load_image(ctx_llava, ¶ms, image);
|
||||
if (!image_embed) {
|
||||
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
}
|
||||
|
||||
llama_model_free(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
+57
-46
@@ -10,6 +10,7 @@
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
#if defined(LLAVA_LOG_OFF)
|
||||
# define LOG_INF(...)
|
||||
@@ -45,6 +46,17 @@ struct clip_image_grid_shape {
|
||||
int second;
|
||||
};
|
||||
|
||||
// convenience cpp wrapper
|
||||
struct clip_image_f32_batch_deleter {
|
||||
void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
|
||||
};
|
||||
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
|
||||
|
||||
struct clip_image_size_deleter {
|
||||
void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
|
||||
};
|
||||
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
|
||||
|
||||
/**
|
||||
* Selects the best resolution from a list of possible resolutions based on the original size.
|
||||
*
|
||||
@@ -105,8 +117,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
struct ggml_context * ctx;
|
||||
} model;
|
||||
|
||||
const int32_t image_size = clip_image_size(ctx_clip);
|
||||
const int32_t patch_size = clip_patch_size(ctx_clip);
|
||||
const int32_t image_size = clip_get_image_size(ctx_clip);
|
||||
const int32_t patch_size = clip_get_patch_size(ctx_clip);
|
||||
|
||||
int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
|
||||
|
||||
@@ -246,12 +258,9 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)
|
||||
|
||||
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
|
||||
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
|
||||
clip_image_f32_batch img_res_v;
|
||||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
||||
clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
|
||||
if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
|
||||
LOG_ERR("%s: unable to preprocess image\n", __func__);
|
||||
delete[] img_res_v.data;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -259,66 +268,72 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
|
||||
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
|
||||
|
||||
const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
|
||||
|
||||
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
|
||||
std::vector<float *> image_embd_v;
|
||||
image_embd_v.resize(img_res_v.size);
|
||||
struct clip_image_size * load_image_size = clip_image_size_init();
|
||||
image_embd_v.resize(n_imgs);
|
||||
clip_image_size load_image_size;
|
||||
|
||||
for (size_t i = 0; i < img_res_v.size; i++) {
|
||||
for (size_t i = 0; i < n_imgs; i++) {
|
||||
const int64_t t_img_enc_step_start_us = ggml_time_us();
|
||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
|
||||
int patch_size=14;
|
||||
load_image_size->width = img_res_v.data[i].nx;
|
||||
load_image_size->height = img_res_v.data[i].ny;
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
|
||||
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
|
||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
|
||||
int patch_size = 14;
|
||||
load_image_size.width = nx;
|
||||
load_image_size.height = ny;
|
||||
clip_add_load_image_size(ctx_clip, &load_image_size);
|
||||
|
||||
bool encoded = false;
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
||||
if (clip_is_qwen2vl(ctx_clip)) {
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
|
||||
}
|
||||
else {
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
|
||||
}
|
||||
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
|
||||
return false;
|
||||
}
|
||||
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
||||
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
||||
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
||||
}
|
||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
int n_img_pos_out = 0;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
|
||||
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
||||
std::memcpy(
|
||||
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
|
||||
image_embd_v[i],
|
||||
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
|
||||
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
|
||||
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
|
||||
n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
|
||||
}
|
||||
*n_img_pos = n_img_pos_out;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
free(image_embd_v[i]);
|
||||
}
|
||||
image_embd_v.clear();
|
||||
load_image_size->width = img->nx;
|
||||
load_image_size->height = img->ny;
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
delete[] img_res_v.data;
|
||||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
load_image_size.width = img->nx;
|
||||
load_image_size.height = img->ny;
|
||||
clip_add_load_image_size(ctx_clip, &load_image_size);
|
||||
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
|
||||
}
|
||||
else if (clip_is_glm(ctx_clip)){
|
||||
struct clip_image_size * load_image_size = clip_image_size_init();
|
||||
load_image_size->width = img_res_v.data[0].nx;
|
||||
load_image_size->height = img_res_v.data[0].ny;
|
||||
load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0);
|
||||
load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
|
||||
int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
|
||||
int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
|
||||
*n_img_pos = (pos * pos + 2);
|
||||
if (!encoded){
|
||||
LOG_ERR("Unable to encode image \n");
|
||||
@@ -328,8 +343,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
*n_img_pos = clip_n_patches(ctx_clip);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
||||
delete[] img_res_v.data;
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
|
||||
@@ -340,17 +355,18 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
// spatial_unpad llava-1.6 type embedding
|
||||
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
|
||||
std::vector<float *> image_embd_v;
|
||||
image_embd_v.resize(img_res_v.size);
|
||||
for (size_t i = 0; i < img_res_v.size; i++) {
|
||||
image_embd_v.resize(n_imgs);
|
||||
for (size_t i = 0; i < n_imgs; i++) {
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
|
||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
||||
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
|
||||
@@ -360,12 +376,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
|
||||
}
|
||||
|
||||
// free all img_res_v - not needed anymore
|
||||
delete[] img_res_v.data;
|
||||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
|
||||
const int32_t image_size = clip_image_size(ctx_clip);
|
||||
const int32_t image_size = clip_get_image_size(ctx_clip);
|
||||
|
||||
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
|
||||
|
||||
|
||||
@@ -1,354 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <iostream> // TODO: remove me
|
||||
|
||||
struct llava_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
struct llama_context * ctx_llama = NULL;
|
||||
struct llama_model * model = NULL;
|
||||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(common_params * params) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params->numa);
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
||||
if (params->n_ctx < 2048) {
|
||||
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
|
||||
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
|
||||
ctx_params.n_ctx = 2048;
|
||||
} else {
|
||||
ctx_params.n_ctx = params->n_ctx;
|
||||
}
|
||||
|
||||
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->model = model;
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_model_free(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
struct clip_context_params clip_params = {
|
||||
/* use_gpu */ params->n_gpu_layers != 0,
|
||||
/* verbosity */ GGML_LOG_LEVEL_INFO, // TODO: make this configurable
|
||||
};
|
||||
auto * ctx_clip = clip_init(clip_path, clip_params);
|
||||
return ctx_clip;
|
||||
}
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
int N = (int) tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(ctx_llama, tokens, 1, n_past);
|
||||
}
|
||||
|
||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
|
||||
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||
}
|
||||
|
||||
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
|
||||
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
|
||||
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||
slice_embed->embed = image_embed;
|
||||
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
||||
llava_image_embed_free(slice_embed);
|
||||
}
|
||||
|
||||
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
|
||||
std::string system_prompt;
|
||||
int idx = 0;
|
||||
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (num_image_embeds > 1) {
|
||||
if (has_minicpmv_projector == 2) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
}
|
||||
|
||||
static const char * sample(struct common_sampler * smpl,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx_llama);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
static std::string ret;
|
||||
if (llama_vocab_is_eog(vocab, id)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = common_token_to_piece(ctx_llama, id);
|
||||
}
|
||||
eval_id(ctx_llama, id, n_past);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
|
||||
auto * ctx_clip = clip_init_context(params);
|
||||
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embeds) {
|
||||
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
if (params->prompt.empty() && params->interactive == false) {
|
||||
LOG_ERR("prompt should be given or interactive mode should be on");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto * model = llava_init(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
const int64_t t_llava_init_start_us = ggml_time_us();
|
||||
auto * ctx_llava = llava_init_context(params, model);
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
const int64_t t_llava_init_end_us = ggml_time_us();
|
||||
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
||||
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||
|
||||
const int64_t t_process_image_start_us = ggml_time_us();
|
||||
process_image(ctx_llava, embeds, params, n_past);
|
||||
const int64_t t_process_image_end_us = ggml_time_us();
|
||||
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
||||
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||
|
||||
llava_image_embed_free(embeds);
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
|
||||
std::string user_prompt = prompt;
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (!is_first) {
|
||||
if (has_minicpmv_projector == 2) {
|
||||
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG_INF("\n");
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||
return smpl;
|
||||
}
|
||||
|
||||
static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
|
||||
|
||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty())) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (auto & image : params.image) {
|
||||
int n_past = 0;
|
||||
auto * ctx_llava = minicpmv_init(¶ms, image, n_past);
|
||||
|
||||
if (!params.prompt.empty()) {
|
||||
LOG("<user>%s\n", params.prompt.c_str());
|
||||
LOG("<assistant>");
|
||||
auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response;
|
||||
bool have_tmp = false;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0){
|
||||
if (!have_tmp) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
have_tmp = true;
|
||||
printf("%s", tmp);
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
common_sampler_free(smpl);
|
||||
}else {
|
||||
while (true) {
|
||||
LOG("<user>");
|
||||
std::string prompt;
|
||||
std::getline(std::cin, prompt);
|
||||
LOG("<assistant>");
|
||||
auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
printf("%s", tmp);// mistral llava-1.6
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
fflush(stdout);
|
||||
}
|
||||
common_sampler_free(smpl);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -2,11 +2,11 @@
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "stb_image.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "console.h"
|
||||
#include "chat.h"
|
||||
#include "mtmd.h"
|
||||
|
||||
#include <vector>
|
||||
#include <limits.h>
|
||||
@@ -28,15 +28,16 @@ static bool g_is_generating = false;
|
||||
|
||||
/**
|
||||
* Please note that this is NOT a production-ready stuff.
|
||||
* It is a playground for trying Gemma 3 vision capabilities.
|
||||
* It is a playground for trying multimodal support in llama.cpp.
|
||||
* For contributors: please keep this code simple and easy to understand.
|
||||
*/
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG(
|
||||
"Experimental CLI for using Gemma 3 vision model\n\n"
|
||||
"Experimental CLI for multimodal\n\n"
|
||||
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
|
||||
" -m and --mmproj are required\n"
|
||||
" -hf user/repo can replace both -m and --mmproj in most cases\n"
|
||||
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
|
||||
argv[0]
|
||||
);
|
||||
@@ -56,38 +57,77 @@ static void sigint_handler(int signo) {
|
||||
}
|
||||
#endif
|
||||
|
||||
struct gemma3_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
common_init_result llama_init;
|
||||
struct mtmd_cli_context {
|
||||
mtmd_context_ptr ctx_vision;
|
||||
common_init_result llama_init;
|
||||
|
||||
llama_model * model;
|
||||
llama_context * lctx;
|
||||
const llama_vocab * vocab;
|
||||
llama_batch batch;
|
||||
int n_batch;
|
||||
|
||||
// note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
|
||||
// so here we don't need to keep track of chat history
|
||||
common_chat_templates_ptr tmpls;
|
||||
|
||||
// support for legacy templates (models not having EOT token)
|
||||
llama_tokens antiprompt_tokens;
|
||||
|
||||
int n_threads = 1;
|
||||
llama_pos n_past = 0;
|
||||
|
||||
gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||
mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||
model = llama_init.model.get();
|
||||
lctx = llama_init.context.get();
|
||||
vocab = llama_model_get_vocab(model);
|
||||
n_threads = params.cpuparams.n_threads;
|
||||
batch = llama_batch_init(params.n_batch, 0, 1);
|
||||
init_clip_model(params);
|
||||
n_batch = params.n_batch;
|
||||
|
||||
if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
|
||||
LOG_ERR("Model does not have chat template.\n");
|
||||
LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
|
||||
LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
tmpls = common_chat_templates_init(model, params.chat_template);
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja).c_str());
|
||||
|
||||
init_vision_context(params);
|
||||
|
||||
// load antiprompt tokens for legacy templates
|
||||
if (params.chat_template == "vicuna") {
|
||||
antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true);
|
||||
} else if (params.chat_template == "deepseek") {
|
||||
antiprompt_tokens = common_tokenize(lctx, "###", false, true);
|
||||
}
|
||||
}
|
||||
|
||||
void init_clip_model(common_params & params) {
|
||||
void init_vision_context(common_params & params) {
|
||||
const char * clip_path = params.mmproj.path.c_str();
|
||||
ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
|
||||
if (!ctx_clip) {
|
||||
LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
|
||||
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
|
||||
/* use_gpu */ true,
|
||||
/* timings */ true,
|
||||
/* n_threads */ params.cpuparams.n_threads,
|
||||
/* verbosity */ GGML_LOG_LEVEL_INFO,
|
||||
}));
|
||||
if (!ctx_vision.get()) {
|
||||
LOG_ERR("Failed to load vision model from %s\n", clip_path);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
~gemma3_context() {
|
||||
clip_free(ctx_clip);
|
||||
bool check_antiprompt(const llama_tokens & generated_tokens) {
|
||||
if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) {
|
||||
return false;
|
||||
}
|
||||
return std::equal(
|
||||
generated_tokens.end() - antiprompt_tokens.size(),
|
||||
generated_tokens.end(),
|
||||
antiprompt_tokens.begin()
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -124,78 +164,8 @@ struct decode_embd_batch {
|
||||
}
|
||||
};
|
||||
|
||||
static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
|
||||
llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
|
||||
common_batch_clear(ctx.batch);
|
||||
for (llama_token & t : tokens) {
|
||||
common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
|
||||
}
|
||||
if (logits_last) {
|
||||
ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
|
||||
}
|
||||
// LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
|
||||
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||
LOG_ERR("Failed to decode text\n");
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int eval_image(gemma3_context & ctx, std::string & fname) {
|
||||
std::vector<float> image_embd_v;
|
||||
int n_embd = llama_model_n_embd(ctx.model);
|
||||
int n_tokens = 256;
|
||||
image_embd_v.resize(n_tokens * n_embd);
|
||||
|
||||
bool ok;
|
||||
struct clip_image_u8 * img_u8 = clip_image_u8_init();
|
||||
ok = clip_image_load_from_file(fname.c_str(), img_u8);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to load image %s\n", fname.c_str());
|
||||
clip_image_u8_free(img_u8);
|
||||
return 2; // non-fatal error
|
||||
}
|
||||
|
||||
clip_image_f32_batch batch_f32;
|
||||
ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
LOG("Encoding image %s\n", fname.c_str());
|
||||
ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
return 1;
|
||||
}
|
||||
LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
|
||||
clip_image_f32_batch_free(&batch_f32);
|
||||
clip_image_u8_free(img_u8);
|
||||
|
||||
// decode image embeddings
|
||||
int64_t t1 = ggml_time_ms();
|
||||
eval_text(ctx, "<start_of_image>");
|
||||
llama_set_causal_attn(ctx.lctx, false);
|
||||
decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
|
||||
if (llama_decode(ctx.lctx, batch_img.batch)) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
return 1;
|
||||
}
|
||||
ctx.n_past += n_tokens;
|
||||
llama_set_causal_attn(ctx.lctx, true);
|
||||
eval_text(ctx, "<end_of_image>");
|
||||
LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
|
||||
static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
|
||||
llama_tokens generated_tokens;
|
||||
for (int i = 0; i < n_predict; i++) {
|
||||
if (i > n_predict || !g_is_generating) {
|
||||
printf("\n");
|
||||
@@ -203,9 +173,10 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
|
||||
}
|
||||
|
||||
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
|
||||
generated_tokens.push_back(token_id);
|
||||
common_sampler_accept(smpl, token_id, true);
|
||||
|
||||
if (llama_vocab_is_eog(ctx.vocab, token_id)) {
|
||||
if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
|
||||
printf("\n");
|
||||
break; // end of generation
|
||||
}
|
||||
@@ -224,6 +195,46 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
|
||||
std::vector<mtmd_bitmap> bitmaps;
|
||||
|
||||
common_chat_templates_inputs tmpl_inputs;
|
||||
tmpl_inputs.messages = {msg};
|
||||
tmpl_inputs.add_generation_prompt = true;
|
||||
tmpl_inputs.use_jinja = false; // jinja is buggy here
|
||||
auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
|
||||
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
|
||||
|
||||
for (auto & fname : images_fname) {
|
||||
mtmd_bitmap bitmap;
|
||||
if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
|
||||
LOG_ERR("Unable to load image %s\n", fname.c_str());
|
||||
return 2; // image not found
|
||||
}
|
||||
bitmaps.push_back(std::move(bitmap));
|
||||
}
|
||||
|
||||
mtmd_input_text text;
|
||||
text.text = formatted_chat.prompt;
|
||||
text.add_special = add_bos;
|
||||
text.parse_special = true;
|
||||
mtmd_input_chunks chunks;
|
||||
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
|
||||
if (res != 0) {
|
||||
LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
|
||||
LOG_ERR("Unable to eval prompt\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx.n_past += mtmd_helper_get_n_tokens(chunks);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
@@ -241,7 +252,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gemma3_context ctx(params);
|
||||
mtmd_cli_context ctx(params);
|
||||
printf("%s: %s\n", __func__, params.model.path.c_str());
|
||||
|
||||
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||
@@ -265,21 +276,15 @@ int main(int argc, char ** argv) {
|
||||
#endif
|
||||
}
|
||||
|
||||
if (eval_text(ctx, "<bos>")) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (is_single_turn) {
|
||||
g_is_generating = true;
|
||||
if (eval_text(ctx, "<start_of_turn>user\n")) {
|
||||
return 1;
|
||||
if (params.prompt.find("<__image__>") == std::string::npos) {
|
||||
params.prompt += " <__image__>";
|
||||
}
|
||||
for (auto & fname : params.image) {
|
||||
if (eval_image(ctx, fname)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = params.prompt;
|
||||
if (eval_message(ctx, msg, params.image, true)) {
|
||||
return 1;
|
||||
}
|
||||
if (generate_response(ctx, smpl, n_predict)) {
|
||||
@@ -293,9 +298,9 @@ int main(int argc, char ** argv) {
|
||||
LOG("\n /quit or /exit exit the program");
|
||||
LOG("\n");
|
||||
|
||||
if (eval_text(ctx, "<start_of_turn>user\n")) {
|
||||
return 1;
|
||||
}
|
||||
bool is_first_msg = true;
|
||||
std::vector<std::string> images_fname;
|
||||
std::string content;
|
||||
|
||||
while (true) {
|
||||
g_is_generating = false;
|
||||
@@ -320,26 +325,33 @@ int main(int argc, char ** argv) {
|
||||
g_is_generating = true;
|
||||
if (line.find("/image") == 0) {
|
||||
std::string image = line.substr(7);
|
||||
int res = eval_image(ctx, image);
|
||||
if (res == 2) {
|
||||
continue; // image not found
|
||||
}
|
||||
if (res) {
|
||||
return 1;
|
||||
}
|
||||
images_fname.push_back(string_strip(image));
|
||||
content += "<__image__>";
|
||||
continue;
|
||||
} else {
|
||||
content += line;
|
||||
}
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = content;
|
||||
int ret = eval_message(ctx, msg, images_fname, is_first_msg);
|
||||
if (ret == 2) {
|
||||
// non-fatal error
|
||||
images_fname.clear();
|
||||
content.clear();
|
||||
continue;
|
||||
}
|
||||
if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
|
||||
if (ret) {
|
||||
return 1;
|
||||
}
|
||||
if (generate_response(ctx, smpl, n_predict)) {
|
||||
return 1;
|
||||
}
|
||||
if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
|
||||
return 1;
|
||||
}
|
||||
images_fname.clear();
|
||||
content.clear();
|
||||
is_first_msg = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_perf_context_print(ctx.lctx);
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,586 @@
|
||||
#include "clip.h"
|
||||
#include "clip-impl.h"
|
||||
#include "mtmd.h"
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cerrno>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
|
||||
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
|
||||
enum mtmd_slice_tmpl {
|
||||
MTMD_SLICE_TMPL_NONE,
|
||||
MTMD_SLICE_TMPL_MINICPMV_2_5,
|
||||
MTMD_SLICE_TMPL_MINICPMV_2_6,
|
||||
// TODO @ngxson : add support for idefics (SmolVLM)
|
||||
};
|
||||
|
||||
struct mtmd_context {
|
||||
struct clip_ctx * ctx_clip;
|
||||
const struct llama_model * text_model;
|
||||
std::vector<float> image_embd_v; // image embedding vector
|
||||
|
||||
bool print_timings;
|
||||
int n_threads;
|
||||
std::string image_marker;
|
||||
|
||||
// for minicpmv, we need special tokens in-between slices
|
||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
||||
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
|
||||
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
|
||||
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
|
||||
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
|
||||
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
|
||||
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
|
||||
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
|
||||
|
||||
// TODO @ngxson : add timings
|
||||
|
||||
mtmd_context(const char * mmproj_fname,
|
||||
const llama_model * text_model,
|
||||
const mtmd_context_params & ctx_params) :
|
||||
print_timings(ctx_params.print_timings),
|
||||
n_threads (ctx_params.n_threads),
|
||||
image_marker (ctx_params.image_marker)
|
||||
{
|
||||
clip_context_params ctx_clip_params;
|
||||
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
||||
ctx_clip_params.verbosity = ctx_params.verbosity;
|
||||
ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
|
||||
if (!ctx_clip) {
|
||||
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
||||
}
|
||||
this->text_model = text_model;
|
||||
|
||||
GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
|
||||
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_clip);
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
||||
tok_ov_img_start = lookup_token("<image>");
|
||||
tok_ov_img_end = lookup_token("</image>");
|
||||
tok_slices_start = lookup_token("<slice>");
|
||||
tok_slices_end = lookup_token("</slice>");
|
||||
tok_sli_img_start = tok_ov_img_start;
|
||||
tok_sli_img_end = tok_ov_img_end;
|
||||
tok_row_end = lookup_token("\n");
|
||||
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||
tok_ov_img_start = lookup_token("<image>");
|
||||
tok_ov_img_end = lookup_token("</image>");
|
||||
tok_sli_img_start = lookup_token("<slice>");
|
||||
tok_sli_img_end = lookup_token("</slice>");
|
||||
tok_row_end = lookup_token("\n");
|
||||
|
||||
} else if (minicpmv_version != 0) {
|
||||
GGML_ASSERT(false && "unsupported minicpmv version");
|
||||
}
|
||||
}
|
||||
|
||||
~mtmd_context() {
|
||||
clip_free(ctx_clip);
|
||||
}
|
||||
|
||||
private:
|
||||
llama_token lookup_token(const std::string & token_text) {
|
||||
const llama_vocab * vocab = llama_model_get_vocab(text_model);
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
if (token_to_piece(vocab, i, true) == token_text) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return LLAMA_TOKEN_NULL;
|
||||
}
|
||||
|
||||
std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
|
||||
std::string piece;
|
||||
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
|
||||
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
||||
if (n_chars < 0) {
|
||||
piece.resize(-n_chars);
|
||||
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
||||
GGML_ASSERT(check == -n_chars);
|
||||
} else {
|
||||
piece.resize(n_chars);
|
||||
}
|
||||
return piece;
|
||||
}
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens_data {
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens {
|
||||
uint32_t nx; // number of tokens in x direction
|
||||
uint32_t ny; // number of tokens in y direction
|
||||
uint32_t n_tokens() const { return nx * ny; }
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
||||
};
|
||||
|
||||
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
||||
const struct llama_model * text_model,
|
||||
const struct mtmd_context_params ctx_params) {
|
||||
try {
|
||||
return new mtmd_context(mmproj_fname, text_model, ctx_params);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: error: %s\n", __func__, e.what());
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_free(mtmd_context * ctx) {
|
||||
if (ctx) {
|
||||
delete ctx;
|
||||
}
|
||||
}
|
||||
|
||||
// copied from common_tokenize
|
||||
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
std::vector<mtmd_input_chunk> & output,
|
||||
const mtmd_input_text & text,
|
||||
const std::vector<mtmd_bitmap> & bitmaps) {
|
||||
auto vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
std::string prompt_modified(text.text);
|
||||
std::string marker_modified(ctx->image_marker);
|
||||
// a bit hacky here, but works for now
|
||||
// for some models, we need to add prefix and suffix to the image embeddings
|
||||
if (clip_is_gemma3(ctx->ctx_clip)) {
|
||||
// gemma 3
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
|
||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
||||
}
|
||||
|
||||
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
||||
// for glm-edge, we don't need to add because the tokens are already in the returned embeddings
|
||||
|
||||
// TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens
|
||||
|
||||
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
|
||||
output.clear();
|
||||
output.reserve(parts.size());
|
||||
|
||||
size_t i_img = 0;
|
||||
|
||||
// utility for adding raw tokens
|
||||
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
std::move(tokens),
|
||||
{},
|
||||
};
|
||||
output.emplace_back(std::move(chunk));
|
||||
};
|
||||
|
||||
// utility for splitting batch of multiple images into chunks of batch having single images
|
||||
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
|
||||
std::vector<mtmd_input_chunk> chunks;
|
||||
|
||||
for (auto & entry : batch_f32.entries) {
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = clip_n_patches(ctx->ctx_clip);
|
||||
image_tokens->ny = 1;
|
||||
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
||||
image_tokens->id = id;
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{},
|
||||
std::move(image_tokens),
|
||||
};
|
||||
chunks.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
return chunks;
|
||||
};
|
||||
|
||||
for (const auto & part : parts) {
|
||||
//printf("tokenizing part: %s\n", part.c_str());
|
||||
bool add_bos = &parts.front() == ∂
|
||||
auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
|
||||
if (tokens.empty()) {
|
||||
continue;
|
||||
}
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
std::move(tokens),
|
||||
{},
|
||||
};
|
||||
output.emplace_back(std::move(chunk));
|
||||
|
||||
if (&parts.back() != &part) {
|
||||
// add image token to middle of 2 parts
|
||||
|
||||
if (i_img >= bitmaps.size()) {
|
||||
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmaps[i_img].nx;
|
||||
img_u8->ny = bitmaps[i_img].ny;
|
||||
img_u8->buf.resize(bitmaps[i_img].data.size());
|
||||
std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
|
||||
|
||||
// preprocess image
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
|
||||
// split batch into chunks of single images
|
||||
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
|
||||
GGML_ASSERT(chunks.size() > 0);
|
||||
|
||||
// add overview image
|
||||
add_text_chunk({ctx->tok_ov_img_start});
|
||||
output.emplace_back(std::move(chunks.front()));
|
||||
chunks.erase(chunks.begin());
|
||||
add_text_chunk({ctx->tok_ov_img_end});
|
||||
|
||||
// add slices
|
||||
if (!chunks.empty()) {
|
||||
clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
|
||||
int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
|
||||
int n_row = (int)chunks.size() / n_col;
|
||||
GGML_ASSERT(n_row * n_col == (int)chunks.size());
|
||||
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_slices_start});
|
||||
}
|
||||
for (int y = 0; y < n_row; y++) {
|
||||
for (int x = 0; x < n_col; x++) {
|
||||
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_start});
|
||||
}
|
||||
output.emplace_back(std::move(chunks[y * n_col + x]));
|
||||
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_end});
|
||||
}
|
||||
}
|
||||
if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
|
||||
add_text_chunk({ctx->tok_row_end});
|
||||
}
|
||||
}
|
||||
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_slices_end});
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image
|
||||
image_tokens->ny = 1; // TODO
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmaps[i_img].id; // optional
|
||||
|
||||
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
||||
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
||||
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
||||
|
||||
if (clip_is_glm(ctx->ctx_clip)) {
|
||||
// glm-edge
|
||||
image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
|
||||
}
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{},
|
||||
std::move(image_tokens),
|
||||
};
|
||||
output.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
i_img++; // move to next image
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
|
||||
if (image_tokens) {
|
||||
delete image_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->n_tokens();
|
||||
}
|
||||
|
||||
size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->nx;
|
||||
}
|
||||
|
||||
size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->ny;
|
||||
}
|
||||
|
||||
std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->id;
|
||||
}
|
||||
|
||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
||||
bool ok = false;
|
||||
|
||||
// only effective for minicpmv and qwen2vl, other models will ignore load_image_size
|
||||
{
|
||||
clip_image_size slice_size{
|
||||
image_tokens->batch_f32.entries[0]->nx,
|
||||
image_tokens->batch_f32.entries[0]->ny};
|
||||
clip_add_load_image_size(ctx->ctx_clip, &slice_size);
|
||||
}
|
||||
|
||||
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
int n_tokens_per_image = clip_n_patches(ctx->ctx_clip);
|
||||
ok = clip_image_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->n_threads,
|
||||
entries[i].get(),
|
||||
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
|
||||
}
|
||||
} else {
|
||||
ok = clip_image_batch_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
}
|
||||
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||
return ctx->image_embd_v.data();
|
||||
}
|
||||
|
||||
size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
|
||||
size_t n_tokens = 0;
|
||||
for (auto & chunk : chunks) {
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
n_tokens += chunk.tokens_text.size();
|
||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
n_tokens += chunk.tokens_image->n_tokens();
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
}
|
||||
return n_tokens;
|
||||
}
|
||||
|
||||
// helper struct to make working with embd batch easier
|
||||
// note: this will be removed after llama_batch_ext refactoring
|
||||
struct decode_embd_batch {
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
llama_context * lctx,
|
||||
mtmd_input_chunks & chunks,
|
||||
llama_pos pos0,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch) {
|
||||
int32_t ret;
|
||||
llama_pos n_past = pos0;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
|
||||
for (auto & chunk : chunks) {
|
||||
bool is_last = &chunk == &chunks.back();
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
text_batch.n_tokens = chunk.tokens_text.size();
|
||||
size_t i = 0;
|
||||
while (i < chunk.tokens_text.size()) { // split into batches
|
||||
for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) {
|
||||
text_batch.token [i] = chunk.tokens_text[i];
|
||||
text_batch.pos [i] = n_past++;
|
||||
text_batch.n_seq_id[i] = 1;
|
||||
text_batch.seq_id [i][0] = seq_id;
|
||||
text_batch.logits [i] = false;
|
||||
}
|
||||
if (is_last) {
|
||||
// always get logits for last input chunk
|
||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
||||
}
|
||||
ret = llama_decode(lctx, text_batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode text\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
|
||||
GGML_ASSERT(chunk.tokens_image != nullptr);
|
||||
int64_t t0 = ggml_time_ms();
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("encoding image or slice...\n");
|
||||
}
|
||||
ret = mtmd_encode(ctx, chunk.tokens_image.get());
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to encode image\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
}
|
||||
|
||||
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, false);
|
||||
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
|
||||
}
|
||||
|
||||
while (i_batch < n_img_batches) { // split into batches
|
||||
int32_t pos_offset = i_batch*n_batch;
|
||||
int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
||||
float * embd_batch = embd + pos_offset*n_mmproj_embd;
|
||||
decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0);
|
||||
|
||||
printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
ret = llama_decode(lctx, batch_img.batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ctx->print_timings) {
|
||||
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
||||
}
|
||||
|
||||
i_batch++;
|
||||
n_past += n_tokens_batch;
|
||||
}
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(text_batch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to load image from buffer\n");
|
||||
return 1;
|
||||
}
|
||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
|
||||
output.data.resize(output.nx * output.ny * 3);
|
||||
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
bool ok = clip_image_load_from_file(fname, img_u8.get());
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to load image %s\n", fname);
|
||||
return 1;
|
||||
}
|
||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
|
||||
output.data.resize(output.nx * output.ny * 3);
|
||||
std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
|
||||
mtmd_image_tokens_free(val);
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
#ifndef MTMD_H
|
||||
#define MTMD_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "clip.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cinttypes>
|
||||
#include <memory>
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef LLAMA_BUILD
|
||||
# define MTMD_API __declspec(dllexport)
|
||||
# else
|
||||
# define MTMD_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define MTMD_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define MTMD_API
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
enum mtmd_input_chunk_type {
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
};
|
||||
|
||||
struct mtmd_context;
|
||||
struct mtmd_image_tokens;
|
||||
|
||||
// represents raw image data, layout is RGBRGBRGB...
|
||||
// length of data must be nx * ny * 3
|
||||
struct mtmd_bitmap {
|
||||
uint32_t nx;
|
||||
uint32_t ny;
|
||||
std::vector<unsigned char> data;
|
||||
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens_deleter {
|
||||
void operator()(mtmd_image_tokens * val); // forward declaration
|
||||
};
|
||||
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
|
||||
|
||||
struct mtmd_input_chunk {
|
||||
mtmd_input_chunk_type type;
|
||||
std::vector<llama_token> tokens_text;
|
||||
mtmd_image_tokens_ptr tokens_image;
|
||||
};
|
||||
|
||||
using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
|
||||
|
||||
struct mtmd_context_params {
|
||||
bool use_gpu = true;
|
||||
bool print_timings = true;
|
||||
int n_threads = 4;
|
||||
enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
|
||||
const char * image_marker = "<__image__>";
|
||||
};
|
||||
|
||||
struct mtmd_input_text {
|
||||
std::string text;
|
||||
bool add_special;
|
||||
bool parse_special;
|
||||
};
|
||||
|
||||
// initialize the mtmd context
|
||||
// return nullptr on failure
|
||||
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
||||
const llama_model * text_model,
|
||||
const mtmd_context_params ctx_params);
|
||||
|
||||
MTMD_API void mtmd_free(mtmd_context * ctx);
|
||||
|
||||
// tokenize an input text prompt and an image
|
||||
// the prompt must have the input image marker (default: "<__image__>") in it
|
||||
// the marker will be replaced with the image tokens
|
||||
// for example:
|
||||
// "here is an image: <__image__>\ndescribe it in detail."
|
||||
// this will gives 3 chunks:
|
||||
// 1. "here is an image: <start_of_image>"
|
||||
// 2. (image tokens)
|
||||
// 3. "<end_of_image>\ndescribe it in detail."
|
||||
// number of bitmaps must be equal to the number of image markers in the prompt
|
||||
// this function is thread-safe (shared ctx)
|
||||
// return values:
|
||||
// 0 on success
|
||||
// 1 on number of images not matching the number of markers
|
||||
// 2 on image preprocessing error
|
||||
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
std::vector<mtmd_input_chunk> & output,
|
||||
const mtmd_input_text & text,
|
||||
const std::vector<mtmd_bitmap> & bitmaps);
|
||||
|
||||
// access mtmd_image_tokens
|
||||
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
|
||||
|
||||
// returns 0 on success
|
||||
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
||||
const mtmd_image_tokens * image_tokens);
|
||||
|
||||
// get output embeddings from the last encode pass
|
||||
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
||||
|
||||
// whether we need to set non-causal mask before llama_decode
|
||||
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
|
||||
|
||||
|
||||
|
||||
//
|
||||
// helper functions (can be implemented based on other functions)
|
||||
//
|
||||
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
||||
// otherwise, returns 0 on success
|
||||
MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
llama_context * lctx,
|
||||
mtmd_input_chunks & chunks,
|
||||
llama_pos pos0,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a file
|
||||
// returns 0 on success
|
||||
// this function is thread-safe
|
||||
MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a buffer
|
||||
// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
|
||||
// returns 0 on success
|
||||
// this function is thread-safe
|
||||
MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
|
||||
|
||||
// convenient unique_ptr wrappers
|
||||
struct mtmd_context_deleter {
|
||||
void operator()(mtmd_context * val) { mtmd_free(val); }
|
||||
};
|
||||
using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
|
||||
|
||||
#else
|
||||
|
||||
static_assert(false && "C header is not yet supported by this library");
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
+23
-11
@@ -17,26 +17,30 @@ cd $PROJ_ROOT
|
||||
|
||||
arr_bin=()
|
||||
arr_hf=()
|
||||
arr_tmpl=() # chat template
|
||||
|
||||
add_test() {
|
||||
local bin=$1
|
||||
local hf=$2
|
||||
local tmpl=${3:-""} # default to empty string if not provided
|
||||
arr_bin+=("$bin")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("$tmpl")
|
||||
}
|
||||
|
||||
add_test "llama-gemma3-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-llava-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
add_test "llama-llava-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M"
|
||||
add_test "llama-llava-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-llava-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K"
|
||||
add_test "llama-llava-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K"
|
||||
add_test "llama-llava-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-minicpmv-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
||||
add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" "vicuna"
|
||||
add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
|
||||
# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" # this model has broken chat template, not usable
|
||||
|
||||
###############
|
||||
|
||||
cmake --build build -j --target "${arr_bin[@]}"
|
||||
@@ -46,12 +50,20 @@ arr_res=()
|
||||
for i in "${!arr_bin[@]}"; do
|
||||
bin="${arr_bin[$i]}"
|
||||
hf="${arr_hf[$i]}"
|
||||
tmpl="${arr_tmpl[$i]}"
|
||||
|
||||
echo "Running test with binary: $bin and HF model: $hf"
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
output=$("$PROJ_ROOT/build/bin/$bin" -hf "$hf" --image $SCRIPT_DIR/test-1.jpeg -p "what is the publisher name of the newspaper?" --temp 0 2>&1 | tee /dev/tty)
|
||||
output=$(\
|
||||
"$PROJ_ROOT/build/bin/$bin" \
|
||||
-hf "$hf" \
|
||||
--image $SCRIPT_DIR/test-1.jpeg \
|
||||
-p "what is the publisher name of the newspaper?" \
|
||||
--temp 0 -n 128 \
|
||||
${tmpl:+--chat-template "$tmpl"} \
|
||||
2>&1 | tee /dev/tty)
|
||||
|
||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||
|
||||
|
||||
+16
-5
@@ -865,9 +865,22 @@ int main(int argc, char ** argv) {
|
||||
console::set_display(console::reset);
|
||||
display = true;
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
if (buffer.empty()) { // Ctrl+D on empty line exits
|
||||
LOG("EOF by user\n");
|
||||
break;
|
||||
}
|
||||
|
||||
if (buffer.back() == '\n') {
|
||||
// Implement #587:
|
||||
// If the user wants the text to end in a newline,
|
||||
// this should be accomplished by explicitly adding a newline by using \ followed by return,
|
||||
// then returning control by pressing return again.
|
||||
buffer.pop_back();
|
||||
}
|
||||
|
||||
if (buffer.empty()) { // Enter key on empty line lets the user pass control back
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
} else { // Add tokens to embd only if the input buffer is non-empty
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty() && !params.conversation_mode) {
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
@@ -915,8 +928,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include <cctype>
|
||||
#include <algorithm>
|
||||
|
||||
struct quant_option {
|
||||
std::string name;
|
||||
@@ -16,7 +17,7 @@ struct quant_option {
|
||||
std::string desc;
|
||||
};
|
||||
|
||||
static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
static const std::vector<quant_option> QUANT_OPTIONS = {
|
||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
|
||||
@@ -105,7 +106,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
//
|
||||
[[noreturn]]
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable);
|
||||
printf(" [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||
@@ -114,6 +116,8 @@ static void usage(const char * executable) {
|
||||
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
|
||||
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
|
||||
printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
|
||||
printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n");
|
||||
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
||||
@@ -244,6 +248,107 @@ static ggml_type parse_ggml_type(const char * arg) {
|
||||
return GGML_TYPE_COUNT;
|
||||
}
|
||||
|
||||
// Allowed tensors for arbitrary quantization with --tensor-type option
|
||||
static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
|
||||
"attn_k",
|
||||
"attn_kv_a_mqa",
|
||||
"attn_kv_b",
|
||||
"attn_o",
|
||||
"attn_output",
|
||||
"attn_q",
|
||||
"attn_q_a",
|
||||
"attn_q_b",
|
||||
"attn_qkv",
|
||||
"attn_v",
|
||||
"channel_mix_key",
|
||||
"channel_mix_receptance",
|
||||
"channel_mix_value",
|
||||
"cls",
|
||||
"cls.output",
|
||||
"cross_attn_k",
|
||||
"cross_attn_o",
|
||||
"cross_attn_q",
|
||||
"cross_attn_v",
|
||||
"ffn_act",
|
||||
"ffn_down",
|
||||
"ffn_down_exps",
|
||||
"ffn_down_shexp",
|
||||
"ffn_gate",
|
||||
"ffn_gate_exps",
|
||||
"ffn_gate_shexp",
|
||||
"ffn_up",
|
||||
"ffn_up_exps",
|
||||
"ffn_up_shexp",
|
||||
"ssm_in",
|
||||
"ssm_out",
|
||||
"time_mix_gate",
|
||||
"time_mix_key",
|
||||
"time_mix_output",
|
||||
"time_mix_receptance",
|
||||
"time_mix_value",
|
||||
};
|
||||
|
||||
// changes to this struct must be replicated in llama-quant.cpp
|
||||
struct tensor_quantization {
|
||||
std::string name;
|
||||
ggml_type quant = GGML_TYPE_COUNT;
|
||||
};
|
||||
|
||||
static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
|
||||
const char * sep = strchr(data, '=');
|
||||
if (sep == nullptr) {
|
||||
printf("\n%s: malformed tensor type '%s'\n\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t tn_len = sep - data;
|
||||
if (tn_len == 0) {
|
||||
printf("\n%s: missing tensor name\n\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (const size_t qt_len = strlen(sep); qt_len == 1) {
|
||||
printf("\n%s: missing quantization type\n\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string tn(data, tn_len);
|
||||
std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
|
||||
sep++;
|
||||
const std::string qt(sep);
|
||||
|
||||
bool found = false;
|
||||
for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
|
||||
std::string tensor;
|
||||
tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
|
||||
// handle special case of cls.output
|
||||
std::string cls_output = "cls.output";
|
||||
if (tn.find(cls_output) != std::string::npos) {
|
||||
tensor = "cls.output";
|
||||
}
|
||||
// check if an allowed tensor exists and it's at the end of the kv string
|
||||
if (tensor == allowed) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
|
||||
printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
tensor_quantization tqz;
|
||||
tqz.name = tn;
|
||||
tqz.quant = parse_ggml_type(qt.c_str());
|
||||
tensor_type.emplace_back(std::move(tqz));
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3) {
|
||||
usage(argv[0]);
|
||||
@@ -255,6 +360,7 @@ int main(int argc, char ** argv) {
|
||||
std::string imatrix_file;
|
||||
std::vector<std::string> included_weights, excluded_weights;
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<tensor_quantization> tensor_types;
|
||||
|
||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
||||
@@ -277,6 +383,10 @@ int main(int argc, char ** argv) {
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--tensor-type") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
|
||||
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
||||
usage(argv[0]);
|
||||
@@ -361,6 +471,9 @@ int main(int argc, char ** argv) {
|
||||
kv_overrides.back().key[0] = 0;
|
||||
params.kv_overrides = &kv_overrides;
|
||||
}
|
||||
if (!tensor_types.empty()) {
|
||||
params.tensor_types = &tensor_types;
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ static std::string fs_get_cache_directory() {
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
} else {
|
||||
#ifdef __linux__
|
||||
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
cache_directory = std::getenv("XDG_CACHE_HOME");
|
||||
} else {
|
||||
@@ -136,7 +136,9 @@ static std::string fs_get_cache_directory() {
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#endif // __linux__
|
||||
#else
|
||||
# error Unknown architecture
|
||||
#endif
|
||||
cache_directory = ensure_trailing_slash(cache_directory);
|
||||
cache_directory += "llama.cpp";
|
||||
}
|
||||
@@ -295,7 +297,10 @@ int main(int argc, char * argv[]) {
|
||||
}
|
||||
cache_dir = cache_dir_str.c_str();
|
||||
}
|
||||
printf("Starting RPC server\n");
|
||||
printf("Starting RPC server v%d.%d.%d\n",
|
||||
RPC_PROTO_MAJOR_VERSION,
|
||||
RPC_PROTO_MINOR_VERSION,
|
||||
RPC_PROTO_PATCH_VERSION);
|
||||
printf(" endpoint : %s\n", endpoint.c_str());
|
||||
printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
|
||||
printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024));
|
||||
|
||||
@@ -697,8 +697,10 @@ class LlamaData {
|
||||
std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
|
||||
std::string url;
|
||||
|
||||
std::string model_endpoint = get_model_endpoint();
|
||||
|
||||
if (pos == std::string::npos) {
|
||||
auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
|
||||
auto [model_name, manifest_url] = extract_model_and_tag(model, model_endpoint + "v2/");
|
||||
hfr = model_name;
|
||||
|
||||
nlohmann::json manifest;
|
||||
@@ -713,7 +715,7 @@ class LlamaData {
|
||||
hff = model.substr(pos + 1);
|
||||
}
|
||||
|
||||
url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
|
||||
url = model_endpoint + hfr + "/resolve/main/" + hff;
|
||||
|
||||
return download(url, bn, true, headers);
|
||||
}
|
||||
|
||||
Binary file not shown.
+168
-116
@@ -1552,29 +1552,30 @@ struct server_queue {
|
||||
std::condition_variable condition_tasks;
|
||||
|
||||
// callback functions
|
||||
std::function<void(server_task)> callback_new_task;
|
||||
std::function<void(void)> callback_update_slots;
|
||||
std::function<void(server_task &&)> callback_new_task;
|
||||
std::function<void(void)> callback_update_slots;
|
||||
|
||||
// Add a new task to the end of the queue
|
||||
int post(server_task task, bool front = false) {
|
||||
int post(server_task && task, bool front = false) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
GGML_ASSERT(task.id != -1);
|
||||
// if this is cancel task make sure to clean up pending tasks
|
||||
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
||||
cleanup_pending_task(task.id_target);
|
||||
}
|
||||
QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
|
||||
const int task_id = task.id;
|
||||
QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
|
||||
if (front) {
|
||||
queue_tasks.push_front(std::move(task));
|
||||
} else {
|
||||
queue_tasks.push_back(std::move(task));
|
||||
}
|
||||
condition_tasks.notify_one();
|
||||
return task.id;
|
||||
return task_id;
|
||||
}
|
||||
|
||||
// multi-task version of post()
|
||||
int post(std::vector<server_task> & tasks, bool front = false) {
|
||||
int post(std::vector<server_task> && tasks, bool front = false) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
for (auto & task : tasks) {
|
||||
if (task.id == -1) {
|
||||
@@ -1596,7 +1597,7 @@ struct server_queue {
|
||||
}
|
||||
|
||||
// Add a new task, but defer until one slot is available
|
||||
void defer(server_task task) {
|
||||
void defer(server_task && task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
QUE_DBG("defer task, id = %d\n", task.id);
|
||||
queue_tasks_deferred.push_back(std::move(task));
|
||||
@@ -1611,7 +1612,7 @@ struct server_queue {
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
void on_new_task(std::function<void(server_task)> callback) {
|
||||
void on_new_task(std::function<void(server_task &&)> callback) {
|
||||
callback_new_task = std::move(callback);
|
||||
}
|
||||
|
||||
@@ -1660,7 +1661,7 @@ struct server_queue {
|
||||
lock.unlock();
|
||||
break;
|
||||
}
|
||||
server_task task = queue_tasks.front();
|
||||
server_task task = std::move(queue_tasks.front());
|
||||
queue_tasks.pop_front();
|
||||
lock.unlock();
|
||||
|
||||
@@ -1705,6 +1706,8 @@ private:
|
||||
};
|
||||
|
||||
struct server_response {
|
||||
bool running = true;
|
||||
|
||||
// for keeping track of all tasks waiting for the result
|
||||
std::unordered_set<int> waiting_task_ids;
|
||||
|
||||
@@ -1759,6 +1762,10 @@ struct server_response {
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
condition_results.wait(lock, [&]{
|
||||
if (!running) {
|
||||
SRV_DBG("%s : queue result stop\n", __func__);
|
||||
std::terminate(); // we cannot return here since the caller is HTTP code
|
||||
}
|
||||
return !queue_results.empty();
|
||||
});
|
||||
|
||||
@@ -1789,6 +1796,10 @@ struct server_response {
|
||||
}
|
||||
|
||||
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
|
||||
if (!running) {
|
||||
SRV_DBG("%s : queue result stop\n", __func__);
|
||||
std::terminate(); // we cannot return here since the caller is HTTP code
|
||||
}
|
||||
if (cr_res == std::cv_status::timeout) {
|
||||
return nullptr;
|
||||
}
|
||||
@@ -1818,6 +1829,12 @@ struct server_response {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// terminate the waiting loop
|
||||
void terminate() {
|
||||
running = false;
|
||||
condition_results.notify_all();
|
||||
}
|
||||
};
|
||||
|
||||
struct server_context {
|
||||
@@ -1988,7 +2005,7 @@ struct server_context {
|
||||
|
||||
slot.reset();
|
||||
|
||||
slots.push_back(slot);
|
||||
slots.push_back(std::move(slot));
|
||||
}
|
||||
|
||||
default_generation_settings_for_props = slots[0].to_json();
|
||||
@@ -2089,7 +2106,7 @@ struct server_context {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
||||
bool launch_slot_with_task(server_slot & slot, server_task && task) {
|
||||
slot.reset();
|
||||
slot.id_task = task.id;
|
||||
slot.index = task.index;
|
||||
@@ -2097,10 +2114,10 @@ struct server_context {
|
||||
slot.params = std::move(task.params);
|
||||
slot.prompt_tokens = std::move(task.prompt_tokens);
|
||||
|
||||
if (!are_lora_equal(task.params.lora, slot.lora)) {
|
||||
if (!are_lora_equal(slot.params.lora, slot.lora)) {
|
||||
// if lora is changed, we cannot reuse cached tokens
|
||||
slot.cache_tokens.clear();
|
||||
slot.lora = task.params.lora;
|
||||
slot.lora = slot.params.lora;
|
||||
}
|
||||
|
||||
bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
|
||||
@@ -2531,10 +2548,10 @@ struct server_context {
|
||||
server_task task(SERVER_TASK_TYPE_CANCEL);
|
||||
task.id_target = id_task;
|
||||
queue_results.remove_waiting_task_id(id_task);
|
||||
cancel_tasks.push_back(task);
|
||||
cancel_tasks.push_back(std::move(task));
|
||||
}
|
||||
// push to beginning of the queue, so it has highest priority
|
||||
queue_tasks.post(cancel_tasks, true);
|
||||
queue_tasks.post(std::move(cancel_tasks), true);
|
||||
}
|
||||
|
||||
// receive the results from task(s)
|
||||
@@ -2621,7 +2638,7 @@ struct server_context {
|
||||
// Functions to process the task
|
||||
//
|
||||
|
||||
void process_single_task(server_task task) {
|
||||
void process_single_task(server_task && task) {
|
||||
switch (task.type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
case SERVER_TASK_TYPE_INFILL:
|
||||
@@ -2635,17 +2652,17 @@ struct server_context {
|
||||
if (slot == nullptr) {
|
||||
// if no slot is available, we defer this task for processing later
|
||||
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
if (!launch_slot_with_task(*slot, task)) {
|
||||
if (!launch_slot_with_task(*slot, std::move(task))) {
|
||||
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
|
||||
break;
|
||||
}
|
||||
@@ -2724,7 +2741,7 @@ struct server_context {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2760,7 +2777,7 @@ struct server_context {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2803,7 +2820,7 @@ struct server_context {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
queue_tasks.defer(task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2855,7 +2872,7 @@ struct server_context {
|
||||
|
||||
server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
|
||||
task.id = queue_tasks.get_new_id();
|
||||
queue_tasks.post(task);
|
||||
queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
// apply context-shift if needed
|
||||
@@ -3617,14 +3634,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// request slots data using task queue
|
||||
server_task task(SERVER_TASK_TYPE_METRICS);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task, true); // high-priority task
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_METRICS);
|
||||
task.id = task_id;
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
|
||||
}
|
||||
|
||||
// get the result
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3653,16 +3673,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// request slots data using task queue
|
||||
server_task task(SERVER_TASK_TYPE_METRICS);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.metrics_reset_bucket = true;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task, true); // high-priority task
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_METRICS);
|
||||
task.id = task_id;
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
|
||||
}
|
||||
|
||||
// get the result
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3759,17 +3780,20 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
std::string filepath = params.slot_save_path + filename;
|
||||
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
|
||||
task.id = task_id;
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3788,17 +3812,20 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
std::string filepath = params.slot_save_path + filename;
|
||||
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
|
||||
task.id = task_id;
|
||||
task.slot_action.slot_id = id_slot;
|
||||
task.slot_action.filename = filename;
|
||||
task.slot_action.filepath = filepath;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3810,15 +3837,18 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.slot_action.slot_id = id_slot;
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
|
||||
task.id = task_id;
|
||||
task.slot_action.slot_id = id_slot;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -3891,6 +3921,21 @@ int main(int argc, char ** argv) {
|
||||
res_ok(res, {{ "success", true }});
|
||||
};
|
||||
|
||||
const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
|
||||
json data = {
|
||||
{
|
||||
"template", common_chat_templates_source(ctx_server.chat_templates.get()),
|
||||
},
|
||||
{
|
||||
"model_info", {
|
||||
{ "llama.context_length", ctx_server.slots.back().n_ctx, },
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
res_ok(res, data);
|
||||
};
|
||||
|
||||
// handle completion-like requests (completion, chat, infill)
|
||||
// we can optionally provide a custom format for partial results and final results
|
||||
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
|
||||
@@ -3907,9 +3952,10 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
auto completion_id = gen_chatcmplid();
|
||||
std::vector<server_task> tasks;
|
||||
|
||||
std::unordered_set<int> task_ids;
|
||||
try {
|
||||
std::vector<server_task> tasks;
|
||||
|
||||
const auto & prompt = data.at("prompt");
|
||||
// TODO: this log can become very long, put it behind a flag or think about a more compact format
|
||||
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
|
||||
@@ -3924,9 +3970,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
task.prompt_tokens = std::move(tokenized_prompts[i]);
|
||||
task.params = server_task::params_from_json_cmpl(
|
||||
ctx_server.ctx,
|
||||
ctx_server.params_base,
|
||||
data);
|
||||
ctx_server.ctx,
|
||||
ctx_server.params_base,
|
||||
data);
|
||||
task.id_selected_slot = json_value(data, "id_slot", -1);
|
||||
|
||||
// OAI-compat
|
||||
@@ -3934,18 +3980,18 @@ int main(int argc, char ** argv) {
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
|
||||
tasks.push_back(task);
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
task_ids = server_task::get_list_id(tasks);
|
||||
ctx_server.queue_results.add_waiting_tasks(tasks);
|
||||
ctx_server.queue_tasks.post(std::move(tasks));
|
||||
} catch (const std::exception & e) {
|
||||
res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
|
||||
ctx_server.queue_results.add_waiting_tasks(tasks);
|
||||
ctx_server.queue_tasks.post(tasks);
|
||||
|
||||
bool stream = json_value(data, "stream", false);
|
||||
const auto task_ids = server_task::get_list_id(tasks);
|
||||
|
||||
if (!stream) {
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
@@ -4237,6 +4283,7 @@ int main(int argc, char ** argv) {
|
||||
// create and queue the task
|
||||
json responses = json::array();
|
||||
bool error = false;
|
||||
std::unordered_set<int> task_ids;
|
||||
{
|
||||
std::vector<server_task> tasks;
|
||||
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
|
||||
@@ -4249,28 +4296,27 @@ int main(int argc, char ** argv) {
|
||||
// OAI-compat
|
||||
task.params.oaicompat = oaicompat;
|
||||
|
||||
tasks.push_back(task);
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
task_ids = server_task::get_list_id(tasks);
|
||||
ctx_server.queue_results.add_waiting_tasks(tasks);
|
||||
ctx_server.queue_tasks.post(tasks);
|
||||
|
||||
// get the result
|
||||
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
||||
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
for (auto & res : results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
|
||||
responses.push_back(res->to_json());
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
|
||||
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
||||
ctx_server.queue_tasks.post(std::move(tasks));
|
||||
}
|
||||
|
||||
// get the result
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
for (auto & res : results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
|
||||
responses.push_back(res->to_json());
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
|
||||
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
||||
|
||||
if (error) {
|
||||
return;
|
||||
}
|
||||
@@ -4336,6 +4382,7 @@ int main(int argc, char ** argv) {
|
||||
// create and queue the task
|
||||
json responses = json::array();
|
||||
bool error = false;
|
||||
std::unordered_set<int> task_ids;
|
||||
{
|
||||
std::vector<server_task> tasks;
|
||||
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
|
||||
@@ -4345,26 +4392,24 @@ int main(int argc, char ** argv) {
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.index = i;
|
||||
task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
|
||||
tasks.push_back(task);
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
task_ids = server_task::get_list_id(tasks);
|
||||
ctx_server.queue_results.add_waiting_tasks(tasks);
|
||||
ctx_server.queue_tasks.post(tasks);
|
||||
|
||||
// get the result
|
||||
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
||||
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
for (auto & res : results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
||||
responses.push_back(res->to_json());
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
ctx_server.queue_tasks.post(std::move(tasks));
|
||||
}
|
||||
|
||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||
for (auto & res : results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
||||
responses.push_back(res->to_json());
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
|
||||
if (error) {
|
||||
return;
|
||||
}
|
||||
@@ -4400,14 +4445,19 @@ int main(int argc, char ** argv) {
|
||||
res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
server_task task(SERVER_TASK_TYPE_SET_LORA);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||
int task_id = ctx_server.queue_tasks.get_new_id();
|
||||
{
|
||||
server_task task(SERVER_TASK_TYPE_SET_LORA);
|
||||
task.id = task_id;
|
||||
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
|
||||
ctx_server.queue_results.add_waiting_task_id(task_id);
|
||||
ctx_server.queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
// get the result
|
||||
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
|
||||
ctx_server.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
if (result->is_error()) {
|
||||
res_error(res, result->to_json());
|
||||
@@ -4455,6 +4505,7 @@ int main(int argc, char ** argv) {
|
||||
svr->Get ("/metrics", handle_metrics);
|
||||
svr->Get ("/props", handle_props);
|
||||
svr->Post("/props", handle_props_change);
|
||||
svr->Post("/api/show", handle_api_show);
|
||||
svr->Get ("/models", handle_models); // public endpoint (no API key check)
|
||||
svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
|
||||
svr->Post("/completion", handle_completions); // legacy
|
||||
@@ -4491,9 +4542,10 @@ int main(int argc, char ** argv) {
|
||||
svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); };
|
||||
|
||||
// clean up function, to be called before exit
|
||||
auto clean_up = [&svr]() {
|
||||
auto clean_up = [&svr, &ctx_server]() {
|
||||
SRV_INF("%s: cleaning up before exit...\n", __func__);
|
||||
svr->stop();
|
||||
ctx_server.queue_results.terminate();
|
||||
llama_backend_free();
|
||||
};
|
||||
|
||||
@@ -4534,7 +4586,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (!ctx_server.load_model(params)) {
|
||||
clean_up();
|
||||
// t.join(); // FIXME: see below
|
||||
t.join();
|
||||
LOG_ERR("%s: exiting due to model loading error\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -4549,8 +4601,8 @@ int main(int argc, char ** argv) {
|
||||
common_chat_templates_source(ctx_server.chat_templates.get()),
|
||||
common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
|
||||
|
||||
ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
|
||||
ctx_server.process_single_task(task);
|
||||
ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
|
||||
ctx_server.process_single_task(std::move(task));
|
||||
});
|
||||
|
||||
ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
|
||||
@@ -4582,7 +4634,7 @@ int main(int argc, char ** argv) {
|
||||
ctx_server.queue_tasks.start_loop();
|
||||
|
||||
clean_up();
|
||||
// t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
|
||||
t.join();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -49,6 +49,26 @@ def test_embedding_multiple():
|
||||
assert len(d['embedding']) > 1
|
||||
|
||||
|
||||
def test_embedding_multiple_with_fa():
|
||||
server = ServerPreset.bert_bge_small_with_fa()
|
||||
server.pooling = 'last'
|
||||
server.start()
|
||||
# one of these should trigger the FA branch (i.e. context size % 256 == 0)
|
||||
res = server.make_request("POST", "/v1/embeddings", data={
|
||||
"input": [
|
||||
"a "*253,
|
||||
"b "*254,
|
||||
"c "*255,
|
||||
"d "*256,
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 4
|
||||
for d in res.body['data']:
|
||||
assert 'embedding' in d
|
||||
assert len(d['embedding']) > 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input,is_multi_prompt",
|
||||
[
|
||||
|
||||
@@ -323,6 +323,21 @@ class ServerPreset:
|
||||
server.server_embeddings = True
|
||||
return server
|
||||
|
||||
@staticmethod
|
||||
def bert_bge_small_with_fa() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
server.n_ctx = 1024
|
||||
server.n_batch = 300
|
||||
server.n_ubatch = 300
|
||||
server.n_slots = 2
|
||||
server.fa = True
|
||||
server.seed = 42
|
||||
server.server_embeddings = True
|
||||
return server
|
||||
|
||||
@staticmethod
|
||||
def tinyllama_infill() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "common/base64.hpp"
|
||||
#include "base64.hpp"
|
||||
|
||||
// increase max payload length to allow use of larger context size
|
||||
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { useEffect, useMemo, useRef, useState } from 'react';
|
||||
import { useEffect, useMemo, useState } from 'react';
|
||||
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
|
||||
import ChatMessage from './ChatMessage';
|
||||
import { CanvasType, Message, PendingMessage } from '../utils/types';
|
||||
@@ -6,6 +6,7 @@ import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
|
||||
import CanvasPyInterpreter from './CanvasPyInterpreter';
|
||||
import StorageUtils from '../utils/storage';
|
||||
import { useVSCodeContext } from '../utils/llama-vscode';
|
||||
import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';
|
||||
|
||||
/**
|
||||
* A message display is a message node with additional information for rendering.
|
||||
@@ -99,7 +100,8 @@ export default function ChatScreen() {
|
||||
canvasData,
|
||||
replaceMessageAndGenerate,
|
||||
} = useAppContext();
|
||||
const textarea = useOptimizedTextarea(prefilledMsg.content());
|
||||
|
||||
const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());
|
||||
|
||||
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
|
||||
// TODO: improve this when we have "upload file" feature
|
||||
@@ -248,14 +250,16 @@ export default function ChatScreen() {
|
||||
</div>
|
||||
|
||||
{/* chat input */}
|
||||
<div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
|
||||
<div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
|
||||
<textarea
|
||||
className="textarea textarea-bordered w-full"
|
||||
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
|
||||
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
|
||||
className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
|
||||
placeholder="Type a message (Shift+Enter to add a new line)"
|
||||
ref={textarea.ref}
|
||||
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
|
||||
onKeyDown={(e) => {
|
||||
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
|
||||
if (e.key === 'Enter' && e.shiftKey) return;
|
||||
if (e.key === 'Enter' && !e.shiftKey) {
|
||||
e.preventDefault();
|
||||
sendNewMessage();
|
||||
@@ -263,7 +267,11 @@ export default function ChatScreen() {
|
||||
}}
|
||||
id="msg-input"
|
||||
dir="auto"
|
||||
// Set a base height of 2 rows for mobile views
|
||||
// On lg+ screens, the hook will calculate and set the initial height anyway
|
||||
rows={2}
|
||||
></textarea>
|
||||
|
||||
{isGenerating(currConvId ?? '') ? (
|
||||
<button
|
||||
className="btn btn-neutral ml-2"
|
||||
@@ -286,43 +294,3 @@ export default function ChatScreen() {
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
export interface OptimizedTextareaValue {
|
||||
value: () => string;
|
||||
setValue: (value: string) => void;
|
||||
focus: () => void;
|
||||
ref: React.RefObject<HTMLTextAreaElement>;
|
||||
}
|
||||
|
||||
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
|
||||
// See https://github.com/ggml-org/llama.cpp/pull/12299
|
||||
function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
|
||||
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null);
|
||||
|
||||
useEffect(() => {
|
||||
if (textareaRef.current && savedInitValue) {
|
||||
textareaRef.current.value = savedInitValue;
|
||||
setSavedInitValue('');
|
||||
}
|
||||
}, [textareaRef, savedInitValue, setSavedInitValue]);
|
||||
|
||||
return {
|
||||
value: () => {
|
||||
return textareaRef.current?.value ?? savedInitValue;
|
||||
},
|
||||
setValue: (value: string) => {
|
||||
if (textareaRef.current) {
|
||||
textareaRef.current.value = value;
|
||||
}
|
||||
},
|
||||
focus: () => {
|
||||
if (textareaRef.current) {
|
||||
// focus and move the cursor to the end
|
||||
textareaRef.current.focus();
|
||||
textareaRef.current.selectionStart = textareaRef.current.value.length;
|
||||
}
|
||||
},
|
||||
ref: textareaRef,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
import { useEffect, useRef, useState, useCallback } from 'react';
|
||||
|
||||
// Media Query for detecting "large" screens (matching Tailwind's lg: breakpoint)
|
||||
const LARGE_SCREEN_MQ = '(min-width: 1024px)';
|
||||
|
||||
// Calculates and sets the textarea height based on its scrollHeight
|
||||
const adjustTextareaHeight = (textarea: HTMLTextAreaElement | null) => {
|
||||
if (!textarea) return;
|
||||
|
||||
// Only perform auto-sizing on large screens
|
||||
if (!window.matchMedia(LARGE_SCREEN_MQ).matches) {
|
||||
// On small screens, reset inline height and max-height styles.
|
||||
// This allows CSS (e.g., `rows` attribute or classes) to control the height,
|
||||
// and enables manual resizing if `resize-vertical` is set.
|
||||
textarea.style.height = ''; // Use 'auto' or '' to reset
|
||||
textarea.style.maxHeight = '';
|
||||
return; // Do not adjust height programmatically on small screens
|
||||
}
|
||||
|
||||
const computedStyle = window.getComputedStyle(textarea);
|
||||
// Get the max-height specified by CSS (e.g., from `lg:max-h-48`)
|
||||
const currentMaxHeight = computedStyle.maxHeight;
|
||||
|
||||
// Temporarily remove max-height to allow scrollHeight to be calculated correctly
|
||||
textarea.style.maxHeight = 'none';
|
||||
// Reset height to 'auto' to measure the actual scrollHeight needed
|
||||
textarea.style.height = 'auto';
|
||||
// Set the height to the calculated scrollHeight
|
||||
textarea.style.height = `${textarea.scrollHeight}px`;
|
||||
// Re-apply the original max-height from CSS to enforce the limit
|
||||
textarea.style.maxHeight = currentMaxHeight;
|
||||
};
|
||||
|
||||
// Interface describing the API returned by the hook
|
||||
export interface ChatTextareaApi {
|
||||
value: () => string;
|
||||
setValue: (value: string) => void;
|
||||
focus: () => void;
|
||||
ref: React.RefObject<HTMLTextAreaElement>;
|
||||
onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
|
||||
}
|
||||
|
||||
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
|
||||
// See https://github.com/ggml-org/llama.cpp/pull/12299
|
||||
// combined now with auto-sizing logic.
|
||||
export function useChatTextarea(initValue: string): ChatTextareaApi {
|
||||
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null);
|
||||
|
||||
// Effect to set initial value and height on mount or when initValue changes
|
||||
useEffect(() => {
|
||||
const textarea = textareaRef.current;
|
||||
if (textarea) {
|
||||
if (typeof savedInitValue === 'string' && savedInitValue.length > 0) {
|
||||
textarea.value = savedInitValue;
|
||||
// Call adjustTextareaHeight - it will check screen size internally
|
||||
setTimeout(() => adjustTextareaHeight(textarea), 0);
|
||||
setSavedInitValue(''); // Reset after applying
|
||||
} else {
|
||||
// Adjust height even if there's no initial value (for initial render)
|
||||
setTimeout(() => adjustTextareaHeight(textarea), 0);
|
||||
}
|
||||
}
|
||||
}, [textareaRef, savedInitValue]); // Depend on ref and savedInitValue
|
||||
|
||||
const handleInput = useCallback(
|
||||
(event: React.FormEvent<HTMLTextAreaElement>) => {
|
||||
// Call adjustTextareaHeight on every input - it will decide whether to act
|
||||
adjustTextareaHeight(event.currentTarget);
|
||||
},
|
||||
[]
|
||||
);
|
||||
|
||||
return {
|
||||
// Method to get the current value directly from the textarea
|
||||
value: () => {
|
||||
return textareaRef.current?.value ?? '';
|
||||
},
|
||||
// Method to programmatically set the value and trigger height adjustment
|
||||
setValue: (value: string) => {
|
||||
const textarea = textareaRef.current;
|
||||
if (textarea) {
|
||||
textarea.value = value;
|
||||
// Call adjustTextareaHeight - it will check screen size internally
|
||||
setTimeout(() => adjustTextareaHeight(textarea), 0);
|
||||
}
|
||||
},
|
||||
focus: () => {
|
||||
if (textareaRef.current) {
|
||||
textareaRef.current.focus();
|
||||
}
|
||||
},
|
||||
ref: textareaRef,
|
||||
onInput: handleInput,
|
||||
};
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import { MessageExtraContext } from './types';
|
||||
import { OptimizedTextareaValue } from '../components/ChatScreen';
|
||||
import { ChatTextareaApi } from '../components/useChatTextarea.ts';
|
||||
|
||||
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
|
||||
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
|
||||
@@ -15,7 +15,7 @@ interface SetTextEvData {
|
||||
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
|
||||
*/
|
||||
|
||||
export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
|
||||
export const useVSCodeContext = (textarea: ChatTextareaApi) => {
|
||||
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
|
||||
null
|
||||
);
|
||||
|
||||
@@ -15,7 +15,7 @@ async def main():
|
||||
model_url = "http://127.0.0.1:6900"
|
||||
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
|
||||
url= f"{model_url}/embedding",
|
||||
json= {"content": str(0)*1024}
|
||||
json= {"content": "a "*1022}
|
||||
) for i in range(n)])
|
||||
|
||||
for response in responses:
|
||||
|
||||
@@ -8,10 +8,10 @@ cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
|
||||
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF
|
||||
|
||||
#build example/main
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
+1
-1
@@ -107,6 +107,7 @@ message(DEBUG "INS_ENB : ${INS_ENB}")
|
||||
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
||||
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
|
||||
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
|
||||
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
||||
@@ -170,7 +171,6 @@ option(GGML_HIP "ggml: use HIP"
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||
|
||||
@@ -7,6 +7,9 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define RPC_PROTO_MAJOR_VERSION 1
|
||||
#define RPC_PROTO_MINOR_VERSION 0
|
||||
#define RPC_PROTO_PATCH_VERSION 0
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
// backend API
|
||||
|
||||
+38
-91
@@ -507,17 +507,12 @@ extern "C" {
|
||||
|
||||
GGML_OP_UNARY,
|
||||
|
||||
GGML_OP_MAP_UNARY,
|
||||
GGML_OP_MAP_BINARY,
|
||||
|
||||
GGML_OP_MAP_CUSTOM1_F32,
|
||||
GGML_OP_MAP_CUSTOM2_F32,
|
||||
GGML_OP_MAP_CUSTOM3_F32,
|
||||
|
||||
GGML_OP_MAP_CUSTOM1,
|
||||
GGML_OP_MAP_CUSTOM2,
|
||||
GGML_OP_MAP_CUSTOM3,
|
||||
|
||||
GGML_OP_CUSTOM,
|
||||
|
||||
GGML_OP_CROSS_ENTROPY_LOSS,
|
||||
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
||||
GGML_OP_OPT_STEP_ADAMW,
|
||||
@@ -1722,24 +1717,29 @@ extern "C" {
|
||||
float p0,
|
||||
float p1);
|
||||
|
||||
// nearest interpolate
|
||||
enum ggml_scale_mode {
|
||||
GGML_SCALE_MODE_NEAREST = 0,
|
||||
GGML_SCALE_MODE_BILINEAR = 1,
|
||||
};
|
||||
|
||||
// interpolate
|
||||
// multiplies ne0 and ne1 by scale factor
|
||||
// used in stable-diffusion
|
||||
GGML_API struct ggml_tensor * ggml_upscale(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int scale_factor);
|
||||
int scale_factor,
|
||||
enum ggml_scale_mode mode);
|
||||
|
||||
// nearest interpolate
|
||||
// nearest interpolate to specified dimensions
|
||||
// used in tortoise.cpp
|
||||
// interpolate
|
||||
// interpolate scale to specified dimensions
|
||||
GGML_API struct ggml_tensor * ggml_upscale_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3);
|
||||
int ne3,
|
||||
enum ggml_scale_mode mode);
|
||||
|
||||
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
||||
GGML_API struct ggml_tensor * ggml_pad(
|
||||
@@ -1916,83 +1916,6 @@ extern "C" {
|
||||
|
||||
// custom operators
|
||||
|
||||
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
||||
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
||||
|
||||
typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
|
||||
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
||||
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
ggml_unary_op_f32_t fun),
|
||||
"use ggml_map_custom1 instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
ggml_unary_op_f32_t fun),
|
||||
"use ggml_map_custom1_inplace instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
ggml_binary_op_f32_t fun),
|
||||
"use ggml_map_custom2 instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
ggml_binary_op_f32_t fun),
|
||||
"use ggml_map_custom2_inplace instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
ggml_custom1_op_f32_t fun),
|
||||
"use ggml_map_custom1 instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
ggml_custom1_op_f32_t fun),
|
||||
"use ggml_map_custom1_inplace instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
ggml_custom2_op_f32_t fun),
|
||||
"use ggml_map_custom2 instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
ggml_custom2_op_f32_t fun),
|
||||
"use ggml_map_custom2_inplace instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * c,
|
||||
ggml_custom3_op_f32_t fun),
|
||||
"use ggml_map_custom3 instead");
|
||||
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * c,
|
||||
ggml_custom3_op_f32_t fun),
|
||||
"use ggml_map_custom3_inplace instead");
|
||||
|
||||
// custom operators v2
|
||||
|
||||
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
|
||||
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
|
||||
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
|
||||
@@ -2048,6 +1971,30 @@ extern "C" {
|
||||
int n_tasks,
|
||||
void * userdata);
|
||||
|
||||
typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_custom_4d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3,
|
||||
struct ggml_tensor ** args,
|
||||
int n_args,
|
||||
ggml_custom_op_t fun,
|
||||
int n_tasks,
|
||||
void * userdata);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor ** args,
|
||||
int n_args,
|
||||
ggml_custom_op_t fun,
|
||||
int n_tasks,
|
||||
void * userdata);
|
||||
|
||||
// loss function
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
|
||||
|
||||
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
set(GGML_CPU_TAG_NAME ${tag_name})
|
||||
# other: OPENMP LLAMAFILE CPU_HBM
|
||||
foreach (feat NATIVE
|
||||
SSE42
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
@@ -286,14 +287,16 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
endif()
|
||||
ggml_add_cpu_backend_variant(sandybridge AVX)
|
||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
ggml_add_cpu_backend_variant(sse42 SSE42)
|
||||
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
|
||||
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
if (NOT MSVC)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
|
||||
@@ -41,6 +41,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
|
||||
return ACL_INT4;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return ACL_INT8;
|
||||
case GGML_TYPE_I64:
|
||||
return ACL_INT64;
|
||||
default:
|
||||
return ACL_DT_UNDEFINED;
|
||||
}
|
||||
|
||||
+364
-362
File diff suppressed because it is too large
Load Diff
+407
-59
@@ -1,15 +1,4 @@
|
||||
#ifndef CANN_ACLNN_OPS
|
||||
#define CANN_ACLNN_OPS
|
||||
|
||||
/**
|
||||
* @file acl_tensor
|
||||
* @brief This file contains related functions of ggml_tensor and acl_tensor.
|
||||
* Contains conversion from ggml_tensor to acl_tensor, broadcast and other
|
||||
* functions.
|
||||
* @author hipudding <huafengchun@gmail.com>
|
||||
* @author wangshuai09 <391746016@qq.com>
|
||||
* @date July 15, 2024
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
@@ -31,6 +20,10 @@
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef CANN_ACLNN_OPS
|
||||
#define CANN_ACLNN_OPS
|
||||
|
||||
#include <functional>
|
||||
#include <aclnnop/aclnn_abs.h>
|
||||
#include <aclnnop/aclnn_neg.h>
|
||||
#include <aclnnop/aclnn_exp.h>
|
||||
@@ -50,6 +43,8 @@
|
||||
#include <aclnnop/aclnn_sqrt.h>
|
||||
#include <aclnnop/aclnn_sin.h>
|
||||
#include <aclnnop/aclnn_cos.h>
|
||||
#include <aclnnop/aclnn_log.h>
|
||||
#include <aclnnop/aclnn_sign.h>
|
||||
#include "acl_tensor.h"
|
||||
#include "common.h"
|
||||
|
||||
@@ -483,8 +478,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
* operation is executed using the CANN backend for optimized performance.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the indices of the maximum values will be stored.
|
||||
* dst->op is `GGML_OP_ARGMAX`.
|
||||
* @param dst The destination tensor where the indices of the maximum values will
|
||||
* be stored. dst->op is `GGML_OP_ARGMAX`.
|
||||
*/
|
||||
void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
@@ -599,6 +594,286 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
aclTensor* acl_dst);
|
||||
|
||||
/**
|
||||
* @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
|
||||
* output tensor.
|
||||
*
|
||||
* This function checks whether broadcasting is needed between `src0` and `src1`.
|
||||
* If broadcasting is required, it calculates the proper shapes and creates
|
||||
* ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
|
||||
* based on the original tensor shapes.
|
||||
*
|
||||
* @param src0 The first input tensor (reference shape).
|
||||
* @param src1 The second input tensor (possibly broadcasted).
|
||||
* @param dst The destination/output tensor.
|
||||
* @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
|
||||
* @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
|
||||
* @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
|
||||
*/
|
||||
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
|
||||
aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the 1D transposed convolution (deconvolution) of a ggml
|
||||
* tensor using the CANN backend.
|
||||
*
|
||||
* @details This function performs a 1D transposed convolution (also known as
|
||||
* deconvolution) operation on the input tensor. The computed result is stored
|
||||
* in the destination tensor `dst`. The operation is optimized using the CANN
|
||||
* backend for improved performance.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the transposed convolution result
|
||||
* will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
|
||||
*/
|
||||
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
|
||||
* using the CANN backend.
|
||||
*
|
||||
* @details This function performs an element-wise ELU activation on the input
|
||||
* tensor.
|
||||
* The result is written to the destination tensor `dst` in-place.
|
||||
* The ELU function is defined as:
|
||||
*
|
||||
* \text{ELU}(x) =
|
||||
* \begin{cases}
|
||||
* x, & \text{if } x > 0 \\
|
||||
* \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
|
||||
* \end{cases}
|
||||
*
|
||||
* where α (alpha) is a hyperparameter, typically set to 1.0.
|
||||
* This operation is optimized using the CANN backend for high-performance
|
||||
* inference or training.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the ELU-activated result will be stored.
|
||||
* dst->op is expected to be `GGML_OP_ELU`.
|
||||
*/
|
||||
void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the mean of a ggml tensor element-wise using the CANN backend.
|
||||
*
|
||||
* @details This function calculates the element-wise mean of the input tensor.
|
||||
* The result is written to the destination tensor `dst`.
|
||||
* The mean is computed by averaging the values across the entire tensor.
|
||||
*
|
||||
* This operation is optimized using the CANN backend for high-performance inference or training.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the mean result will be stored.
|
||||
* dst->op is expected to be `GGML_OP_MEAN`.
|
||||
*/
|
||||
void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Applies 1D reflect padding to a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function performs 1D reflect padding on the input tensor.
|
||||
* The amount of padding on each side is specified by parameters stored in `dst->op_params`.
|
||||
* The operation reflects the values at the borders of the tensor to generate the padded output.
|
||||
*
|
||||
* This operation is optimized using the CANN backend for high-performance inference or training.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the padded result will be stored.
|
||||
* dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`.
|
||||
*/
|
||||
void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Counts the number of equal elements in two ggml tensors using the CANN backend.
|
||||
*
|
||||
* @details This function performs an element-wise comparison between two input tensors,
|
||||
* and counts the number of positions where the elements are equal. The result is
|
||||
* stored in the destination tensor `dst` as a scalar.
|
||||
*
|
||||
* The operation is optimized using the CANN backend, making it suitable for
|
||||
* high-performance inference or training scenarios.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
* dst->op is expected to be `GGML_OP_COUNT_EQUAL`.
|
||||
*/
|
||||
void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Applies the Step activation function to a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function applies a step function element-wise to the input tensor, where
|
||||
* each element is transformed to 1.0 if it is greater than 0, and 0.0 otherwise.
|
||||
* The result is stored in the destination tensor `dst`.
|
||||
*
|
||||
* This operation is accelerated using the CANN backend to improve runtime performance.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
* dst->op is expected to be `GGML_OP_STEP`.
|
||||
*/
|
||||
void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/*
|
||||
* @brief A generic wrapper for ACL resources with custom deleter support.
|
||||
*/
|
||||
using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
|
||||
|
||||
/**
|
||||
* @brief Trait structure used to define how to destroy a given ACL resource type.
|
||||
*
|
||||
* @tparam T ACL resource type.
|
||||
*/
|
||||
template<typename T>
|
||||
struct acl_resource_traits;
|
||||
|
||||
/**
|
||||
* @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
|
||||
*/
|
||||
template<>
|
||||
struct acl_resource_traits<aclTensor> {
|
||||
static void destroy(void* p) {
|
||||
ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
|
||||
*/
|
||||
template<>
|
||||
struct acl_resource_traits<aclIntArray> {
|
||||
static void destroy(void* p) {
|
||||
ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
|
||||
*/
|
||||
template<>
|
||||
struct acl_resource_traits<aclScalar> {
|
||||
static void destroy(void* p) {
|
||||
ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
|
||||
*/
|
||||
template<>
|
||||
struct acl_resource_traits<aclTensorList> {
|
||||
static void destroy(void* p) {
|
||||
ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Creates a generic ACL resource wrapper with proper destruction logic.
|
||||
*
|
||||
* @tparam T ACL resource type.
|
||||
* @param ptr Raw pointer to ACL resource.
|
||||
* @return any_acl_resource Smart pointer that handles destruction.
|
||||
*/
|
||||
template<typename T>
|
||||
any_acl_resource make_acl_resource(T* ptr) {
|
||||
return any_acl_resource(
|
||||
static_cast<void*>(ptr),
|
||||
[](void* p) {
|
||||
acl_resource_traits<T>::destroy(p);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Registers multiple ACL resources into a vector for lifetime management.
|
||||
*
|
||||
* @tparam Args Variadic list of ACL resource types.
|
||||
* @param vec Target vector to hold ACL resources.
|
||||
* @param args Raw pointers to ACL resources.
|
||||
*/
|
||||
template<typename... Args>
|
||||
void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
|
||||
(vec.emplace_back(make_acl_resource(args)), ...);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Task class that wraps the execution of an aclnn function call.
|
||||
*/
|
||||
class aclnn_task : public cann_task {
|
||||
public:
|
||||
aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
|
||||
uint64_t workspace_size, aclOpExecutor * executor,
|
||||
aclrtStream stream) :
|
||||
aclnn_func_(aclnn_func),
|
||||
workspace_addr_(workspace_addr),
|
||||
workspace_size_(workspace_size),
|
||||
executor_(executor),
|
||||
stream_(stream) {}
|
||||
virtual void run_task() override {
|
||||
ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
|
||||
}
|
||||
private:
|
||||
aclnn_func_t aclnn_func_;
|
||||
void * workspace_addr_;
|
||||
uint64_t workspace_size_;
|
||||
aclOpExecutor * executor_;
|
||||
aclrtStream stream_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Task class that releases ACL resources after usage.
|
||||
*/
|
||||
class release_resource_task : public cann_task {
|
||||
public:
|
||||
release_resource_task(std::vector<any_acl_resource>&& resources){
|
||||
resource_ = std::move(resources);
|
||||
}
|
||||
|
||||
virtual void run_task() override {
|
||||
resource_.clear();
|
||||
}
|
||||
private:
|
||||
std::vector<any_acl_resource> resource_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Task class for performing asynchronous memory copy operations.
|
||||
*/
|
||||
class async_memcpy_task : public cann_task {
|
||||
public:
|
||||
async_memcpy_task(void* dst, const void* src, size_t size,
|
||||
aclrtMemcpyKind kind, aclrtStream stream)
|
||||
: dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
|
||||
|
||||
virtual void run_task() override {
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
|
||||
}
|
||||
private:
|
||||
void* dst_;
|
||||
const void* src_;
|
||||
size_t size_;
|
||||
aclrtMemcpyKind kind_;
|
||||
aclrtStream stream_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Task class for performing asynchronous memory set operations.
|
||||
*/
|
||||
class async_memset_task : public cann_task {
|
||||
public:
|
||||
async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
|
||||
: buffer_(buffer), size_(size), value_(value), stream_(stream) {}
|
||||
|
||||
virtual void run_task() override {
|
||||
ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
|
||||
}
|
||||
private:
|
||||
void* buffer_;
|
||||
size_t size_;
|
||||
int32_t value_;
|
||||
aclrtStream stream_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Launches an asynchronous task using the memory allocator.
|
||||
*
|
||||
@@ -616,45 +891,101 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
* other task before this asynchronous task ends, because all tasks in the
|
||||
* same stream are executed in queue order.
|
||||
*/
|
||||
#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...) \
|
||||
do { \
|
||||
uint64_t workspaceSize = 0; \
|
||||
aclOpExecutor * executor; \
|
||||
void * workspaceAddr = nullptr; \
|
||||
\
|
||||
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
|
||||
\
|
||||
if (workspaceSize > 0) { \
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); \
|
||||
workspaceAddr = workspace_allocator.get(); \
|
||||
} \
|
||||
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
|
||||
|
||||
#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \
|
||||
do { \
|
||||
uint64_t workspaceSize = 0; \
|
||||
aclOpExecutor * executor; \
|
||||
void * workspaceAddr = nullptr; \
|
||||
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));\
|
||||
/* workspace should alloced in main thread to keep malloc order when using vmm. */ \
|
||||
if (workspaceSize > 0) { \
|
||||
ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \
|
||||
workspaceAddr = workspace_allocator.get(); \
|
||||
} \
|
||||
if (CTX.async_mode) { \
|
||||
auto task = \
|
||||
std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize, \
|
||||
executor, CTX.stream()); \
|
||||
CTX.task_queue.submit_task(std::move(task)); \
|
||||
} else { \
|
||||
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));\
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
/**
|
||||
* @brief Prepares broadcast-compatible ACL tensors for two input tensors and one output tensor.
|
||||
* @brief Registers and releases multiple ACL resources, optionally deferring the release
|
||||
* using a task.
|
||||
*
|
||||
* This function checks whether broadcasting is needed between `src0` and `src1`.
|
||||
* If broadcasting is required, it calculates the proper shapes and creates
|
||||
* ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
|
||||
* based on the original tensor shapes.
|
||||
*
|
||||
* @param src0 The first input tensor (reference shape).
|
||||
* @param src1 The second input tensor (possibly broadcasted).
|
||||
* @param dst The destination/output tensor.
|
||||
* @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
|
||||
* @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
|
||||
* @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
|
||||
* @tparam Args Types of the ACL resources.
|
||||
* @param ctx Backend context which manages task submission and async mode.
|
||||
* @param args Pointers to ACL resources to be released.
|
||||
*/
|
||||
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
|
||||
aclTensor ** acl_src1, aclTensor ** acl_dst);
|
||||
template <typename... Args>
|
||||
void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
|
||||
std::vector<any_acl_resource> resources;
|
||||
register_acl_resources(resources, std::forward<Args>(args)...);
|
||||
if(ctx.async_mode) {
|
||||
auto task = std::make_unique<release_resource_task>(std::move(resources));
|
||||
ctx.task_queue.submit_task(std::move(task));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies a element-wise operation to two input tensors using the CANN backend.
|
||||
* @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
|
||||
*
|
||||
* This templated function takes a binary operator and applies it to two source tensors
|
||||
* associated with the destination tensor. The function handles broadcasting as needed.
|
||||
* @param ctx Backend context containing stream and async configuration.
|
||||
* @param dst Destination memory address.
|
||||
* @param src Source memory address.
|
||||
* @param len Size of memory to copy (in bytes).
|
||||
* @param kind Type of memory copy (host-to-device, device-to-host, etc).
|
||||
*/
|
||||
inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
|
||||
const void * src, size_t len, aclrtMemcpyKind kind) {
|
||||
if (ctx.async_mode) {
|
||||
auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
|
||||
ctx.task_queue.submit_task(std::move(task));
|
||||
} else {
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
|
||||
const void * src, size_t len, aclrtMemcpyKind kind) {
|
||||
if (ctx->async_mode) {
|
||||
auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
|
||||
ctx->task_queue.submit_task(std::move(task));
|
||||
} else {
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
|
||||
*
|
||||
* @param ctx Backend context containing stream and async configuration.
|
||||
* @param buffer Memory buffer to be set.
|
||||
* @param size Size of the memory buffer (in bytes).
|
||||
* @param value Value to set in the buffer.
|
||||
*/
|
||||
inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
|
||||
size_t size, int value) {
|
||||
if (ctx.async_mode) {
|
||||
auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
|
||||
ctx.task_queue.submit_task(std::move(task));
|
||||
} else {
|
||||
ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies a element-wise operation to two input tensors using the CANN
|
||||
* backend.
|
||||
*
|
||||
* This templated function takes a binary operator and applies it to two source
|
||||
* tensors
|
||||
* associated with the destination tensor. The function handles broadcasting as
|
||||
* needed.
|
||||
*
|
||||
* @tparam binary_op A callable object (e.g., lambda or function pointer) representing
|
||||
* the binary operation to be performed. It must take three arguments:
|
||||
@@ -676,11 +1007,10 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
|
||||
binary_op(ctx, acl_src0, acl_src1, acl_dst);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src0));
|
||||
ACL_CHECK(aclDestroyTensor(acl_src1));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Applies a unary operation to an input tensor using the CANN backend.
|
||||
*
|
||||
@@ -690,7 +1020,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
* @tparam unary_op A callable with the signature:
|
||||
* void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
|
||||
* where the first aclTensor is the source and the second is the destination.
|
||||
*
|
||||
* @param ctx The CANN backend context for managing resources and execution.
|
||||
* @param dst The destination tensor. Its src[0] is treated as the input tensor.
|
||||
*/
|
||||
@@ -702,10 +1031,28 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
unary_op(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies a unary operation to a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function performs a unary operation on the input tensor using
|
||||
* a user-provided lambda or callable object `unary_op`, which accepts the CANN
|
||||
* context and two ACL tensors (source and destination). Internally, this function
|
||||
* creates ACL representations of the ggml tensors and invokes the unary operation.
|
||||
* The result is stored in the destination tensor `dst`. This utility abstracts the
|
||||
* common boilerplate of tensor conversion and cleanup when implementing unary ops.
|
||||
*
|
||||
* @param unary_op A callable that performs the unary operation using CANN APIs.
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
* The source tensor is retrieved from `dst->src[0]`.
|
||||
*/
|
||||
void ggml_cann_unary_op(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
|
||||
*
|
||||
@@ -715,7 +1062,7 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
|
||||
*
|
||||
* Internally, the lambda will call:
|
||||
* @code
|
||||
* GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
|
||||
* GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
|
||||
* @endcode
|
||||
*
|
||||
* @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
|
||||
@@ -723,13 +1070,14 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
|
||||
* @see ggml_cann_unary_op
|
||||
* @see GGML_CANN_CALL_ACLNN_OP
|
||||
*/
|
||||
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
|
||||
do { \
|
||||
auto lambda = [](auto ctx, auto acl_src, auto acl_dst) { \
|
||||
GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst); \
|
||||
}; \
|
||||
ggml_cann_unary_op<lambda>(ctx, dst); \
|
||||
} \
|
||||
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
|
||||
do { \
|
||||
auto lambda = [](ggml_backend_cann_context& ctx, \
|
||||
aclTensor* acl_src, \
|
||||
aclTensor* acl_dst) { \
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
|
||||
}; \
|
||||
ggml_cann_unary_op(lambda, ctx, dst); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
#endif // CANN_ACLNN_OPS
|
||||
|
||||
+135
-1
@@ -31,9 +31,16 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <unistd.h>
|
||||
#include <functional>
|
||||
|
||||
#include "../include/ggml-cann.h"
|
||||
#include "../include/ggml.h"
|
||||
#include "../ggml-impl.h"
|
||||
|
||||
#define MATRIX_ROW_PADDING 512
|
||||
#define GGML_CANN_MAX_STREAMS 8
|
||||
@@ -205,6 +212,127 @@ struct ggml_cann_pool_alloc {
|
||||
ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Function pointer type for ACLNN operator calls.
|
||||
*/
|
||||
using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStream);
|
||||
|
||||
/**
|
||||
* @brief Base class for all CANN tasks to be submitted to the task queue.
|
||||
*
|
||||
* Users should override the run_task() method with actual task logic.
|
||||
*/
|
||||
class cann_task {
|
||||
public:
|
||||
virtual void run_task() {}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A lock-free ring-buffer based task queue for asynchronously executing cann_task instances.
|
||||
*/
|
||||
class cann_task_queue {
|
||||
public:
|
||||
/**
|
||||
* @brief Constructs a task queue with a fixed power-of-two capacity for a specific device.
|
||||
*
|
||||
* @param capacity Queue capacity. Must be a power of 2.
|
||||
* @param device Target device ID (used for context setting).
|
||||
*/
|
||||
explicit cann_task_queue(size_t capacity, int32_t device)
|
||||
: buffer_(capacity), capacity_(capacity), head_(0), tail_(0),
|
||||
running_(false), device_(device) {
|
||||
GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
|
||||
mask_ = capacity_ - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Attempts to enqueue a task into the queue.
|
||||
*
|
||||
* @param item Unique pointer to the task.
|
||||
* @return true if the task was successfully enqueued, false if the queue was full.
|
||||
*/
|
||||
bool enqueue(std::unique_ptr<cann_task>&& item) {
|
||||
size_t next_tail = (tail_ + 1) & mask_;
|
||||
|
||||
if (next_tail == head_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
buffer_[tail_] = std::move(item);
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
tail_ = next_tail;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Submits a task to the queue, and starts the worker thread if not already running.
|
||||
*
|
||||
* @param task Task to be submitted.
|
||||
*/
|
||||
void submit_task(std::unique_ptr<cann_task>&& task) {
|
||||
while(!enqueue(std::move(task))) {
|
||||
std::this_thread::yield();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!running_) {
|
||||
running_ = true;
|
||||
thread_ = std::thread(&cann_task_queue::execute, this);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Waits until the queue is completely empty and no tasks are being processed.
|
||||
*/
|
||||
void wait() {
|
||||
while (running_ && head_ != tail_) {
|
||||
std::this_thread::yield();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Stops the task queue and joins the worker thread.
|
||||
*/
|
||||
void stop() {
|
||||
running_ = false;
|
||||
if (thread_.joinable()) {
|
||||
thread_.join();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief Worker thread function that continuously dequeues and executes tasks.
|
||||
*/
|
||||
void execute() {
|
||||
ggml_cann_set_device(device_);
|
||||
|
||||
while (running_) {
|
||||
if(head_ == tail_) {
|
||||
std::this_thread::yield();
|
||||
continue;
|
||||
}
|
||||
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
buffer_[head_]->run_task();
|
||||
buffer_[head_].reset();
|
||||
head_ = (head_ + 1) & mask_;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<cann_task>> buffer_;
|
||||
const size_t capacity_;
|
||||
size_t mask_;
|
||||
size_t head_;
|
||||
size_t tail_;
|
||||
bool running_;
|
||||
std::thread thread_;
|
||||
int32_t device_;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Context for managing CANN backend operations.
|
||||
*/
|
||||
@@ -213,6 +341,8 @@ struct ggml_backend_cann_context {
|
||||
std::string name; /**< Name of the device. */
|
||||
std::string description; /**< Description of the device. */
|
||||
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
||||
cann_task_queue task_queue;
|
||||
bool async_mode;
|
||||
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
||||
|
||||
@@ -221,9 +351,12 @@ struct ggml_backend_cann_context {
|
||||
* @param device Device ID.
|
||||
*/
|
||||
explicit ggml_backend_cann_context(int device)
|
||||
: device(device), name("CANN" + std::to_string(device)) {
|
||||
: device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
|
||||
ggml_cann_set_device(device);
|
||||
description = aclrtGetSocName();
|
||||
async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
|
||||
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
|
||||
device, async_mode ? "ON" : "OFF");
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -231,6 +364,7 @@ struct ggml_backend_cann_context {
|
||||
*/
|
||||
~ggml_backend_cann_context() {
|
||||
ggml_cann_set_device(device);
|
||||
task_queue.stop();
|
||||
if (copy_event != nullptr) {
|
||||
ACL_CHECK(aclrtDestroyEvent(copy_event));
|
||||
}
|
||||
|
||||
+419
-101
@@ -29,6 +29,8 @@
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <mutex>
|
||||
#include <queue>
|
||||
#include <chrono>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
@@ -119,9 +121,10 @@ static ggml_cann_device_info ggml_cann_init() {
|
||||
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.location.id = id;
|
||||
prop.reserve = 0;
|
||||
ACL_CHECK(aclrtMemGetAllocationGranularity(
|
||||
err = aclrtMemGetAllocationGranularity(
|
||||
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
|
||||
&info.devices[id].vmm_granularity));
|
||||
&info.devices[id].vmm_granularity);
|
||||
info.devices[id].vmm = err == ACL_SUCCESS;
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_cann_get_device_memory(id, &free, &total);
|
||||
@@ -148,11 +151,223 @@ const ggml_cann_device_info& ggml_cann_info() {
|
||||
|
||||
//#define DEBUG_CANN_MALLOC
|
||||
/**
|
||||
* @brief A pool of CANN buffers(legacy).
|
||||
* @brief A pool of CANN buffers(priority segment buffer).
|
||||
*
|
||||
* This class manages a pool of CANN buffers for a specific device.
|
||||
*/
|
||||
struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||
struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
||||
/**
|
||||
* @brief The maximum reuse margin for a buffer.
|
||||
*/
|
||||
static const size_t max_reuse_margin = 1ull << 22; // 4MB
|
||||
|
||||
/**
|
||||
* @brief The minimum free margin for a buffer.
|
||||
*/
|
||||
static const size_t min_free_margin = 1ull << 20; // 1MB
|
||||
|
||||
/**
|
||||
* @brief The alignment for buffer allocation.
|
||||
*/
|
||||
static const size_t alignment = 128;
|
||||
|
||||
/**
|
||||
* @brief The device ID associated with this buffer pool.
|
||||
*/
|
||||
int device;
|
||||
|
||||
/**
|
||||
* @brief Whether to disable clean during buffer allocation.
|
||||
*/
|
||||
bool disable_clean = false;
|
||||
|
||||
/**
|
||||
* @brief Structure representing a CANN buffer.
|
||||
*/
|
||||
struct ggml_cann_buffer {
|
||||
void* ptr = nullptr; ///< Pointer to the buffer.
|
||||
size_t size = 0; ///< Size of the buffer.
|
||||
std::chrono::steady_clock::time_point last_used; ///< Last used time.
|
||||
|
||||
bool operator>(const ggml_cann_buffer& other) const {
|
||||
return size > other.size;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Array of CANN buffers in the pool.
|
||||
*/
|
||||
std::unordered_map<void*, size_t> buffer_pool;
|
||||
std::priority_queue<ggml_cann_buffer,
|
||||
std::vector<ggml_cann_buffer>,
|
||||
std::greater<>> free_buffers ;
|
||||
|
||||
/**
|
||||
* @brief Total size of all buffers in the pool.
|
||||
*/
|
||||
size_t pool_size = 0;
|
||||
|
||||
/**
|
||||
* @brief Constructor to initialize the buffer pool for a specific device.
|
||||
*
|
||||
* @param device The device ID to associate with this buffer pool.
|
||||
*/
|
||||
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
|
||||
disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Destructor to free all buffers in the pool.
|
||||
*/
|
||||
~ggml_cann_pool_buf_prio() {
|
||||
ggml_cann_set_device(device);
|
||||
for (auto& [b_ptr, b_size] : buffer_pool) {
|
||||
aclrtFree(b_ptr);
|
||||
pool_size -= b_size;
|
||||
}
|
||||
buffer_pool.clear();
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocate a buffer of the given size.
|
||||
*
|
||||
* @param size The size of the buffer to allocate.
|
||||
* @param actual_size A pointer to a variable to receive the actual size of
|
||||
* the allocated buffer.
|
||||
* @return A pointer to the allocated buffer.
|
||||
*/
|
||||
void* alloc(size_t size, size_t* actual_size) override {
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
|
||||
void* ptr = nullptr;
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
|
||||
std::vector<ggml_cann_buffer> free_buffers_rest;
|
||||
free_buffers_rest.reserve(free_buffers.size());
|
||||
while (!free_buffers.empty()) {
|
||||
auto b = free_buffers.top();
|
||||
free_buffers.pop();
|
||||
|
||||
if (b.size >= size) {
|
||||
// reuse the buffer if the size is enough
|
||||
const size_t margin = b.size - size;
|
||||
if (margin <= max_reuse_margin) {
|
||||
*actual_size = b.size;
|
||||
ptr = b.ptr;
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"cann pool[%d]: reused %p, "
|
||||
"pool_size = %5u MB, "
|
||||
"size = %5u MB, "
|
||||
"margin = %5u MB\n",
|
||||
device, b.ptr,
|
||||
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
|
||||
(uint32_t)(GGML_PAD(size, 1048576) / 1048576),
|
||||
(uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bool should_clean = !disable_clean &&
|
||||
b.size > min_free_margin &&
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
|
||||
if (should_clean) {
|
||||
// free the buffer if the size is needed to be freed
|
||||
ACL_CHECK(aclrtFree(b.ptr));
|
||||
pool_size -= b.size;
|
||||
buffer_pool.erase(b.ptr);
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"cann pool[%d]: clean %p, "
|
||||
"pool_size = %5u MB, "
|
||||
"size = %5u MB\n",
|
||||
device, b.ptr,
|
||||
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
|
||||
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
free_buffers_rest.push_back(b);
|
||||
}
|
||||
for (ggml_cann_buffer &b : free_buffers_rest) {
|
||||
free_buffers.push(std::move(b));
|
||||
}
|
||||
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
|
||||
#endif
|
||||
if (ptr != nullptr) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// allocate a new buffer if no buffer can be reused
|
||||
ggml_cann_set_device(device);
|
||||
ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
*actual_size = size;
|
||||
pool_size += size;
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"cann pool[%d]: allocate %p, "
|
||||
"pool_size = %5u MB, "
|
||||
"size = %5u MB\n",
|
||||
device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
|
||||
(uint32_t)(GGML_PAD(size, 1048576) / 1048576));
|
||||
#endif
|
||||
buffer_pool.emplace(ptr, size);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free a buffer and return it to the pool.
|
||||
*
|
||||
* @param ptr Pointer to the buffer to free.
|
||||
* @param size Size of the buffer to free.
|
||||
*/
|
||||
void free(void* ptr, size_t size) override {
|
||||
GGML_UNUSED(size);
|
||||
auto it = buffer_pool.find(ptr);
|
||||
if (it == buffer_pool.end()) {
|
||||
GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
|
||||
}
|
||||
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"cann pool[%d]: return %p, "
|
||||
"pool_size = %5u MB\n",
|
||||
device, ptr,
|
||||
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A pool of CANN buffers(segment buffer).
|
||||
*
|
||||
* This class manages a pool of CANN buffers for a specific device.
|
||||
*/
|
||||
struct ggml_cann_pool_buf : public ggml_cann_pool {
|
||||
/**
|
||||
* @brief The maximum reuse margin for a buffer.
|
||||
*/
|
||||
static const size_t max_reuse_margin = 1ull << 22; // 4MB
|
||||
|
||||
/**
|
||||
* @brief The minimum free margin for a buffer.
|
||||
*/
|
||||
static const size_t min_free_margin = 1ull << 20; // 1MB
|
||||
|
||||
/**
|
||||
* @brief The alignment for buffer allocation.
|
||||
*/
|
||||
static const size_t alignment = 128;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of buffers in the pool.
|
||||
*/
|
||||
@@ -163,12 +378,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||
*/
|
||||
int device;
|
||||
|
||||
/**
|
||||
* @brief Whether to disable clean during buffer allocation.
|
||||
*/
|
||||
bool disable_clean = false;
|
||||
|
||||
/**
|
||||
* @brief Structure representing a CANN buffer.
|
||||
*/
|
||||
struct ggml_cann_buffer {
|
||||
void* ptr = nullptr; ///< Pointer to the buffer memory.
|
||||
size_t size = 0; ///< Size of the buffer.
|
||||
bool used = false; ///< Whether the buffer is currently in use.
|
||||
std::chrono::steady_clock::time_point last_used; ///< Last used time.
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -186,17 +408,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||
*
|
||||
* @param device The device ID to associate with this buffer pool.
|
||||
*/
|
||||
explicit ggml_cann_pool_leg(int device) : device(device) {}
|
||||
explicit ggml_cann_pool_buf(int device) : device(device) {
|
||||
disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Destructor to free all buffers in the pool.
|
||||
*/
|
||||
~ggml_cann_pool_leg() {
|
||||
~ggml_cann_pool_buf() {
|
||||
ggml_cann_set_device(device);
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cann_buffer& b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
ACL_CHECK(aclrtFree(b.ptr));
|
||||
aclrtFree(b.ptr);
|
||||
pool_size -= b.size;
|
||||
}
|
||||
}
|
||||
@@ -212,63 +436,93 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||
* @return A pointer to the allocated buffer.
|
||||
*/
|
||||
void* alloc(size_t size, size_t* actual_size) override {
|
||||
const size_t alignment = 128;
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
int nnz = 0;
|
||||
size_t max_size = 0;
|
||||
#endif
|
||||
size_t best_diff = 1ull << 36;
|
||||
int ibest = -1;
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
|
||||
void* ptr = nullptr;
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
|
||||
int i = 0;
|
||||
for (; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cann_buffer& b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
if (b.ptr == nullptr) {
|
||||
break;
|
||||
}
|
||||
if (b.used) {
|
||||
continue;
|
||||
}
|
||||
if (b.size >= size) {
|
||||
// reuse the buffer if the size is enough
|
||||
const size_t margin = b.size - size;
|
||||
if (margin <= max_reuse_margin) {
|
||||
*actual_size = b.size;
|
||||
b.used = true;
|
||||
ptr = b.ptr;
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
++nnz;
|
||||
if (b.size > max_size) max_size = b.size;
|
||||
GGML_LOG_INFO(
|
||||
"cann pool[%d]: reused %p, "
|
||||
"pool_size = %5u MB, "
|
||||
"size = %5u MB, "
|
||||
"margin = %5u MB\n",
|
||||
device, b.ptr,
|
||||
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
|
||||
(uint32_t)(GGML_PAD(size, 1048576) / 1048576),
|
||||
(uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
|
||||
#endif
|
||||
if (b.size >= size) {
|
||||
size_t diff = b.size - size;
|
||||
if (diff < best_diff) {
|
||||
best_diff = diff;
|
||||
ibest = i;
|
||||
if (!best_diff) {
|
||||
void* ptr = b.ptr;
|
||||
*actual_size = b.size;
|
||||
b.ptr = nullptr;
|
||||
b.size = 0;
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bool should_clean = !disable_clean &&
|
||||
b.size > min_free_margin &&
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
|
||||
if (should_clean) {
|
||||
// free the buffer if the size is needed to be freed
|
||||
ACL_CHECK(aclrtFree(b.ptr));
|
||||
pool_size -= b.size;
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"cann pool[%d]: clean %p, "
|
||||
"pool_size = %5u MB, "
|
||||
"size = %5u MB\n",
|
||||
device, b.ptr,
|
||||
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
|
||||
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
||||
#endif
|
||||
b.ptr = nullptr;
|
||||
}
|
||||
}
|
||||
if (ibest >= 0) {
|
||||
ggml_cann_buffer& b = buffer_pool[ibest];
|
||||
void* ptr = b.ptr;
|
||||
*actual_size = b.size;
|
||||
b.ptr = nullptr;
|
||||
b.size = 0;
|
||||
if (ptr != nullptr) {
|
||||
return ptr;
|
||||
}
|
||||
void* ptr;
|
||||
ggml_cann_set_device(device);
|
||||
ACL_CHECK(
|
||||
aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
*actual_size = size;
|
||||
pool_size += size;
|
||||
|
||||
if (i < MAX_BUFFERS) {
|
||||
// allocate a new buffer if no buffer can be reused
|
||||
ggml_cann_buffer& b = buffer_pool[i];
|
||||
ggml_cann_set_device(device);
|
||||
ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
pool_size += size;
|
||||
*actual_size = size;
|
||||
b.size = size;
|
||||
b.used = true;
|
||||
if (i >= MAX_BUFFERS - 8) {
|
||||
GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
|
||||
}
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
|
||||
"requested %u MB\n",
|
||||
__func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
|
||||
(uint32_t)(pool_size / 1024 / 1024),
|
||||
(uint32_t)(size / 1024 / 1024));
|
||||
GGML_LOG_INFO(
|
||||
"cann pool[%d]: allocate %p, "
|
||||
"pool_size = %5u MB, "
|
||||
"size = %5u MB\n",
|
||||
device, b.ptr,
|
||||
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
|
||||
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
||||
#endif
|
||||
return ptr;
|
||||
return b.ptr;
|
||||
}
|
||||
|
||||
GGML_ABORT("cann pool[%d]: slots full\n", device);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -278,18 +532,24 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||
* @param size Size of the buffer to free.
|
||||
*/
|
||||
void free(void* ptr, size_t size) override {
|
||||
GGML_UNUSED(size);
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cann_buffer& b = buffer_pool[i];
|
||||
if (b.ptr == nullptr) {
|
||||
b.ptr = ptr;
|
||||
b.size = size;
|
||||
return;
|
||||
if (b.ptr != ptr) {
|
||||
continue;
|
||||
}
|
||||
b.used = false;
|
||||
b.last_used = std::chrono::steady_clock::now();
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"cann pool[%d]: return %p, "
|
||||
"pool_size = %5u MB\n",
|
||||
device, b.ptr,
|
||||
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
// memory should always buffered. these memory may still needed by
|
||||
// tasks in stream.
|
||||
// TODO, fix me.
|
||||
GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
|
||||
GGML_ABORT("cann pool[%d]: slots full\n", device);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -347,8 +607,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
* @param device The device ID to associate with this buffer pool.
|
||||
*/
|
||||
explicit ggml_cann_pool_vmm(int device)
|
||||
: device(device),
|
||||
granularity(ggml_cann_info().devices[device].vmm_granularity) {
|
||||
: device(device) {
|
||||
auto dev = ggml_cann_info().devices[device];
|
||||
granularity = dev.vmm_granularity;
|
||||
max_size = dev.total_vram;
|
||||
@@ -471,7 +730,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
*/
|
||||
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
||||
int device) {
|
||||
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
||||
bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
|
||||
if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
|
||||
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
|
||||
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
||||
}
|
||||
bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
|
||||
if (enable_buf_prio) {
|
||||
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
|
||||
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
|
||||
}
|
||||
GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
|
||||
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
|
||||
}
|
||||
|
||||
// cann buffer
|
||||
@@ -1020,8 +1290,11 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||
|
||||
ggml_cann_set_device(buft_ctx->device);
|
||||
|
||||
size = std::max(size, (size_t)1);
|
||||
|
||||
const size_t alignment = 128;
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
void* dev_ptr;
|
||||
aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
|
||||
if (err != ACL_SUCCESS) {
|
||||
@@ -1330,12 +1603,13 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
GGML_CANN_CALL_UNARY_OP(Silu);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK: {
|
||||
auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
|
||||
};
|
||||
ggml_cann_unary_op<lambda>(ctx, dst);
|
||||
}
|
||||
break;
|
||||
auto lambda = [](ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_src,
|
||||
aclTensor* acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
||||
};
|
||||
ggml_cann_unary_op(lambda, ctx, dst);
|
||||
} break;
|
||||
case GGML_UNARY_OP_TANH:
|
||||
GGML_CANN_CALL_UNARY_OP(Tanh);
|
||||
break;
|
||||
@@ -1354,6 +1628,15 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
case GGML_UNARY_OP_EXP:
|
||||
GGML_CANN_CALL_UNARY_OP(Exp);
|
||||
break;
|
||||
case GGML_UNARY_OP_ELU:
|
||||
ggml_cann_elu(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SGN:
|
||||
GGML_CANN_CALL_UNARY_OP(Sign);
|
||||
break;
|
||||
case GGML_UNARY_OP_STEP:
|
||||
ggml_cann_step(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1448,7 +1731,22 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
break;
|
||||
case GGML_OP_SIN:
|
||||
ggml_cann_unary_op<aclnn_sin>(ctx, dst);
|
||||
break;
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_cann_conv_transpose_1d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_LOG:
|
||||
GGML_CANN_CALL_UNARY_OP(Log);
|
||||
break;
|
||||
case GGML_OP_MEAN:
|
||||
ggml_cann_mean(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
ggml_cann_pad_reflect_1d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
ggml_cann_count_equal(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1491,12 +1789,11 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||
delete backend;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Sets tensor data asynchronously in the CANN backend.
|
||||
*
|
||||
* This function asynchronously sets tensor data in the CANN backend. Depending
|
||||
* on the tensor type, it may perform data transformations before copying data
|
||||
* to the device.
|
||||
* This function asynchronously sets tensor data in the CANN backend.
|
||||
*
|
||||
* @param backend Pointer to the CANN backend structure.
|
||||
* @param tensor Pointer to the tensor structure to set data for.
|
||||
@@ -1511,23 +1808,28 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
||||
size_t size) {
|
||||
ggml_backend_cann_context *cann_ctx =
|
||||
(ggml_backend_cann_context *)backend->context;
|
||||
ggml_backend_buffer_t buf =
|
||||
tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
|
||||
size, ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
cann_ctx->stream()));
|
||||
} else {
|
||||
void *transform_buffer = malloc(size);
|
||||
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
||||
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
|
||||
"unsupported buffer type");
|
||||
GGML_ASSERT(!ggml_is_quantized(tensor->type));
|
||||
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
(char *)tensor->data + offset, size, transform_buffer, size,
|
||||
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||
free(transform_buffer);
|
||||
}
|
||||
ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
|
||||
ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gets tensor data asynchronously in the CANN backend.
|
||||
*
|
||||
* This function asynchronously gets tensor data in the CANN backend.
|
||||
*
|
||||
* @param backend Pointer to the CANN backend structure.
|
||||
* @param tensor Pointer to the tensor structure to get data from.
|
||||
* @param data Pointer to the host data to copy from the tensor.
|
||||
* @param offset Offset in bytes within the host data.
|
||||
* @param size Size of the data to copy in bytes.
|
||||
*/
|
||||
static void ggml_backend_cann_get_tensor_async(
|
||||
ggml_backend_t backend, const ggml_tensor *tensor, void *data,
|
||||
size_t offset, size_t size) {
|
||||
@@ -1538,20 +1840,11 @@ static void ggml_backend_cann_get_tensor_async(
|
||||
|
||||
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
|
||||
"unsupported buffer type");
|
||||
GGML_ASSERT(!ggml_is_quantized(tensor->type));
|
||||
|
||||
ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
|
||||
ACL_MEMCPY_DEVICE_TO_HOST);
|
||||
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
|
||||
size, ACL_MEMCPY_DEVICE_TO_HOST,
|
||||
cann_ctx->stream()));
|
||||
} else {
|
||||
void *transform_buffer = malloc(size);
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
transform_buffer, size, (char *)tensor->data + offset, size,
|
||||
ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
|
||||
free(transform_buffer);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1611,6 +1904,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
||||
ggml_cann_set_device(cann_ctx_src->device);
|
||||
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
|
||||
|
||||
// wait for task_queue empty to keep task order.
|
||||
cann_ctx_src->task_queue.wait();
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE,
|
||||
cann_ctx_src->stream()));
|
||||
@@ -1638,9 +1933,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
||||
static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
||||
cann_ctx->task_queue.wait();
|
||||
ggml_cann_set_device(cann_ctx->device);
|
||||
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||
}
|
||||
|
||||
@@ -1710,6 +2004,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
case GGML_UNARY_OP_STEP:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -1721,6 +2018,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
return true;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
#ifdef ASCEND_310P
|
||||
// Q4 && Q8 per group is not suppor on 310p device
|
||||
return false;
|
||||
#endif
|
||||
// only support contiguous for quantized types.
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op->src[1]);
|
||||
@@ -1788,6 +2089,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!ggml_is_contiguous(op->src[0])){
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_UPSCALE: {
|
||||
@@ -1796,10 +2100,19 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
|
||||
return false;
|
||||
}
|
||||
if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_POOL_2D: {
|
||||
const int32_t * opts = (const int32_t *) op->op_params;
|
||||
#ifdef ASCEND_310P
|
||||
enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
|
||||
if(opt == GGML_OP_POOL_MAX){
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
const int k0 = opts[1];
|
||||
const int k1 = opts[2];
|
||||
const int p0 = opts[5];
|
||||
@@ -1842,6 +2155,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
elseif (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_AVX)
|
||||
else ()
|
||||
elseif (GGML_SSE42)
|
||||
list(APPEND ARCH_FLAGS /arch:SSE4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_NATIVE)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
else ()
|
||||
list(APPEND ARCH_FLAGS -msse4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
if (GGML_SSE42)
|
||||
list(APPEND ARCH_FLAGS -msse4.2)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
|
||||
endif()
|
||||
if (GGML_F16C)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_F16C)
|
||||
|
||||
@@ -263,7 +263,7 @@ void test_x86_is() {
|
||||
static int ggml_backend_cpu_x86_score() {
|
||||
// FIXME: this does not check for OS support
|
||||
|
||||
int score = 0;
|
||||
int score = 1;
|
||||
cpuid_x86 is;
|
||||
|
||||
#ifdef GGML_FMA
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -323,8 +323,6 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
|
||||
#else
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
|
||||
@@ -2027,41 +2027,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_rwkv_wkv7(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_MAP_UNARY:
|
||||
{
|
||||
ggml_unary_op_f32_t fun;
|
||||
memcpy(&fun, tensor->op_params, sizeof(fun));
|
||||
ggml_compute_forward_map_unary(params, tensor, fun);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MAP_BINARY:
|
||||
{
|
||||
ggml_binary_op_f32_t fun;
|
||||
memcpy(&fun, tensor->op_params, sizeof(fun));
|
||||
ggml_compute_forward_map_binary(params, tensor, fun);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MAP_CUSTOM1_F32:
|
||||
{
|
||||
ggml_custom1_op_f32_t fun;
|
||||
memcpy(&fun, tensor->op_params, sizeof(fun));
|
||||
ggml_compute_forward_map_custom1_f32(params, tensor, fun);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MAP_CUSTOM2_F32:
|
||||
{
|
||||
ggml_custom2_op_f32_t fun;
|
||||
memcpy(&fun, tensor->op_params, sizeof(fun));
|
||||
ggml_compute_forward_map_custom2_f32(params, tensor, fun);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MAP_CUSTOM3_F32:
|
||||
{
|
||||
ggml_custom3_op_f32_t fun;
|
||||
memcpy(&fun, tensor->op_params, sizeof(fun));
|
||||
ggml_compute_forward_map_custom3_f32(params, tensor, fun);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MAP_CUSTOM1:
|
||||
{
|
||||
ggml_compute_forward_map_custom1(params, tensor);
|
||||
@@ -2077,6 +2042,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
ggml_compute_forward_map_custom3(params, tensor);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_CUSTOM:
|
||||
{
|
||||
ggml_compute_forward_custom(params, tensor);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
{
|
||||
ggml_compute_forward_cross_entropy_loss(params, tensor);
|
||||
@@ -2328,11 +2298,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_WIN_PART:
|
||||
case GGML_OP_WIN_UNPART:
|
||||
case GGML_OP_GET_REL_POS:
|
||||
case GGML_OP_MAP_UNARY:
|
||||
case GGML_OP_MAP_BINARY:
|
||||
case GGML_OP_MAP_CUSTOM1_F32:
|
||||
case GGML_OP_MAP_CUSTOM2_F32:
|
||||
case GGML_OP_MAP_CUSTOM3_F32:
|
||||
{
|
||||
n_tasks = 1;
|
||||
} break;
|
||||
@@ -2366,6 +2331,16 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
n_tasks = MIN(p.n_tasks, n_threads);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CUSTOM:
|
||||
{
|
||||
struct ggml_custom_op_params p;
|
||||
memcpy(&p, node->op_params, sizeof(p));
|
||||
if (p.n_tasks == GGML_N_TASKS_MAX) {
|
||||
n_tasks = n_threads;
|
||||
} else {
|
||||
n_tasks = MIN(p.n_tasks, n_threads);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
|
||||
@@ -425,6 +425,8 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
|
||||
}
|
||||
case GGML_OP_IM2COL_BACK:
|
||||
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16;
|
||||
case GGML_OP_OUT_PROD:
|
||||
return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
|
||||
src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
|
||||
+81
-163
@@ -6351,24 +6351,72 @@ static void ggml_compute_forward_upscale_f32(
|
||||
const float sf2 = (float)ne2/src0->ne[2];
|
||||
const float sf3 = (float)ne3/src0->ne[3];
|
||||
|
||||
// TODO: optimize
|
||||
const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
const int64_t i03 = i3 / sf3;
|
||||
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
|
||||
const int64_t i02 = i2 / sf2;
|
||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||
const int64_t i01 = i1 / sf1;
|
||||
for (int64_t i0 = 0; i0 < ne0; i0++) {
|
||||
const int64_t i00 = i0 / sf0;
|
||||
if (mode == GGML_SCALE_MODE_NEAREST) {
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
const int64_t i03 = i3 / sf3;
|
||||
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
|
||||
const int64_t i02 = i2 / sf2;
|
||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||
const int64_t i01 = i1 / sf1;
|
||||
for (int64_t i0 = 0; i0 < ne0; i0++) {
|
||||
const int64_t i00 = i0 / sf0;
|
||||
|
||||
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
*y = *x;
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
||||
// setting a pixel offset of 0 would replicate the behavior of pytorch interpolate with align_corners=True
|
||||
const float pixel_offset = 0.5f;
|
||||
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
const int64_t i03 = i3 / sf3;
|
||||
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
|
||||
const int64_t i02 = i2 / sf2;
|
||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||
const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
|
||||
int64_t y0 = (int64_t)floorf(y);
|
||||
int64_t y1 = y0 + 1;
|
||||
|
||||
y0 = std::max(int64_t(0), std::min(y0, ne01 - 1));
|
||||
y1 = std::max(int64_t(0), std::min(y1, ne01 - 1));
|
||||
|
||||
float dy = y - (float)y0;
|
||||
dy = std::max(0.0f, std::min(dy, 1.0f));
|
||||
|
||||
for (int64_t i0 = 0; i0 < ne0; i0++) {
|
||||
const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
|
||||
int64_t x0 = (int64_t)floorf(x);
|
||||
int64_t x1 = x0 + 1;
|
||||
|
||||
x0 = std::max(int64_t(0), std::min(x0, ne00 - 1));
|
||||
x1 = std::max(int64_t(0), std::min(x1, ne00 - 1));
|
||||
|
||||
float dx = x - (float)x0;
|
||||
dx = std::max(0.0f, std::min(dx, 1.0f));
|
||||
|
||||
// fetch the four surrounding pixel values and interpolate
|
||||
const float a = *(const float *)((const char *)src0->data + x0*nb00 + y0*nb01 + i02*nb02 + i03*nb03);
|
||||
const float b = *(const float *)((const char *)src0->data + x1*nb00 + y0*nb01 + i02*nb02 + i03*nb03);
|
||||
const float c = *(const float *)((const char *)src0->data + x0*nb00 + y1*nb01 + i02*nb02 + i03*nb03);
|
||||
const float d = *(const float *)((const char *)src0->data + x1*nb00 + y1*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
|
||||
|
||||
float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
*y_dst = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("unsupported upscale mode");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6721,8 +6769,8 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
ggml_vec_dot_t const kq_vec_dot = ggml_get_type_traits_cpu(k->type)->vec_dot;
|
||||
ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float;
|
||||
|
||||
GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
|
||||
GGML_ASSERT(v_to_float && "fattn: unsupported V-type");
|
||||
GGML_ASSERT(( q_to_vec_dot) && "fattn: unsupported K-type");
|
||||
GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float ) && "fattn: unsupported V-type");
|
||||
|
||||
// loop over n_batch and n_head
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
@@ -6818,10 +6866,14 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
vs = expf(s - M);
|
||||
}
|
||||
|
||||
v_to_float(v_data, V32, DV);
|
||||
|
||||
// V += v*expf(s - M)
|
||||
ggml_vec_mad_f32(DV, VKQ32, V32, vs);
|
||||
if (v_to_float) {
|
||||
v_to_float(v_data, V32, DV);
|
||||
ggml_vec_mad_f32(DV, VKQ32, V32, vs);
|
||||
} else {
|
||||
// V is F32
|
||||
ggml_vec_mad_f32(DV, VKQ32, (const float *) v_data, vs);
|
||||
}
|
||||
}
|
||||
|
||||
S = S*ms + vs; // scale and increment sum with partial sum
|
||||
@@ -8264,152 +8316,6 @@ void ggml_compute_forward_rwkv_wkv7(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_map_unary
|
||||
|
||||
static void ggml_compute_forward_map_unary_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
const ggml_unary_op_f32_t fun) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
assert(ggml_is_contiguous_1(src0));
|
||||
assert(ggml_is_contiguous_1(dst));
|
||||
assert(ggml_are_same_shape(src0, dst));
|
||||
|
||||
const int n = ggml_nrows(src0);
|
||||
const int nc = src0->ne[0];
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
fun(nc,
|
||||
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_map_unary(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
const ggml_unary_op_f32_t fun) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_map_unary_f32(params, dst, fun);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_map_binary
|
||||
|
||||
static void ggml_compute_forward_map_binary_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
const ggml_binary_op_f32_t fun) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
assert(ggml_is_contiguous_1(src0));
|
||||
assert(ggml_is_contiguous_1(src1));
|
||||
assert(ggml_is_contiguous_1(dst));
|
||||
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
const int n = ggml_nrows(src0);
|
||||
const int nc = src0->ne[0];
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
fun(nc,
|
||||
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||
(float *) ((char *) src0->data + i*(src0->nb[1])),
|
||||
(float *) ((char *) src1->data + i*(src1->nb[1])));
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_map_binary(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
const ggml_binary_op_f32_t fun) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_map_binary_f32(params, dst, fun);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_map_custom1
|
||||
|
||||
void ggml_compute_forward_map_custom1_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
const ggml_custom1_op_f32_t fun) {
|
||||
|
||||
const ggml_tensor * a = dst->src[0];
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
fun(dst, a);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_map_custom2
|
||||
|
||||
void ggml_compute_forward_map_custom2_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
const ggml_custom2_op_f32_t fun) {
|
||||
|
||||
const ggml_tensor * a = dst->src[0];
|
||||
const ggml_tensor * b = dst->src[1];
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
fun(dst, a, b);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_map_custom3
|
||||
|
||||
void ggml_compute_forward_map_custom3_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
const ggml_custom3_op_f32_t fun) {
|
||||
|
||||
const ggml_tensor * a = dst->src[0];
|
||||
const ggml_tensor * b = dst->src[1];
|
||||
const ggml_tensor * c = dst->src[1];
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
fun(dst, a, b, c);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_map_custom1
|
||||
|
||||
void ggml_compute_forward_map_custom1(
|
||||
@@ -8455,6 +8361,18 @@ void ggml_compute_forward_map_custom3(
|
||||
p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_custom
|
||||
|
||||
void ggml_compute_forward_custom(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
struct ggml_custom_op_params p;
|
||||
memcpy(&p, dst->op_params, sizeof(p));
|
||||
|
||||
p.fun(dst, params->ith, params->nth, p.userdata);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_cross_entropy_loss
|
||||
|
||||
static void ggml_compute_forward_cross_entropy_loss_f32(
|
||||
|
||||
+1
-20
@@ -96,29 +96,10 @@ void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params,
|
||||
void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_unary(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_unary_op_f32_t fun);
|
||||
void ggml_compute_forward_map_binary(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_binary_op_f32_t fun);
|
||||
void ggml_compute_forward_map_custom1_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_custom1_op_f32_t fun);
|
||||
void ggml_compute_forward_map_custom2_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_custom2_op_f32_t fun);
|
||||
void ggml_compute_forward_map_custom3_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_custom3_op_f32_t fun);
|
||||
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_custom(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
@@ -392,7 +392,11 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
||||
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
|
||||
vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
|
||||
vec_extract_fp32_from_shortl(vec_xl(0, p))
|
||||
#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
|
||||
static inline unsigned char ggml_endian_byte(int i) {
|
||||
uint16_t tmp_val = 1;
|
||||
return ((unsigned char *)&tmp_val)[i];
|
||||
}
|
||||
#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) \
|
||||
if (i & 0x1) \
|
||||
vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
|
||||
@@ -851,13 +855,17 @@ static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
|
||||
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
||||
}
|
||||
|
||||
return vec_xl(0, tmp);
|
||||
// note: keep type-cast here to prevent compiler bugs
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/12846
|
||||
return vec_xl(0, (const float *)(tmp));
|
||||
}
|
||||
|
||||
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
|
||||
float arr[4];
|
||||
|
||||
vec_xst(y, 0, arr);
|
||||
// note: keep type-cast here to prevent compiler bugs
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/12846
|
||||
vec_xst(y, 0, (float *)(arr));
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
x[i] = GGML_FP32_TO_FP16(arr[i]);
|
||||
|
||||
@@ -10,6 +10,13 @@ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
half * dsti = (half *) cdsti;
|
||||
@@ -386,6 +393,16 @@ static void ggml_cpy_f32_f32_cuda(
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_bf16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
@@ -534,7 +551,7 @@ static void ggml_cpy_f16_f16_cuda(
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
@@ -571,7 +588,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
char ** dest_ptrs_d = nullptr;
|
||||
int graph_cpynode_index = -1;
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection) {
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
|
||||
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
|
||||
}
|
||||
@@ -581,6 +598,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
@@ -617,7 +636,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection) {
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
|
||||
}
|
||||
#endif
|
||||
@@ -626,7 +645,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
|
||||
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
ggml_cuda_cpy(ctx, src0, dst);
|
||||
bool disable_indirection = true;
|
||||
ggml_cuda_cpy(ctx, src0, dst, disable_indirection);
|
||||
}
|
||||
|
||||
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
@@ -634,6 +654,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
return nullptr;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
#define CUDA_CPY_BLOCK_SIZE 64
|
||||
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
|
||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);
|
||||
|
||||
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
|
||||
@@ -96,31 +96,32 @@ int ggml_cuda_get_device() {
|
||||
|
||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||
ggml_cuda_set_device(device);
|
||||
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
|
||||
auto res = hipMallocManaged(ptr, size);
|
||||
if (res == hipSuccess) {
|
||||
// if error we "need" to know why...
|
||||
CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
||||
}
|
||||
return res;
|
||||
#else
|
||||
|
||||
#if !defined(GGML_USE_HIP)
|
||||
cudaError_t err;
|
||||
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
||||
{
|
||||
err = cudaMallocManaged(ptr, size);
|
||||
#if defined(GGML_USE_HIP)
|
||||
if (err == hipSuccess) {
|
||||
CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
||||
}
|
||||
|
||||
// fall back to cudaMalloc if not supported (e.g. on Windows)
|
||||
if (err == hipErrorNotSupported) {
|
||||
static bool warned_unsupported = false;
|
||||
if (!warned_unsupported) {
|
||||
GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
|
||||
warned_unsupported = true;
|
||||
}
|
||||
|
||||
err = cudaMalloc(ptr, size);
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
else
|
||||
{
|
||||
err = cudaMalloc(ptr, size);
|
||||
}
|
||||
return err;
|
||||
#else
|
||||
return cudaMalloc(ptr, size);
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
@@ -2491,7 +2492,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
if (node->op == GGML_OP_MUL_MAT_ID) {
|
||||
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -3079,6 +3080,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
@@ -3213,6 +3217,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
@@ -3232,6 +3237,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Vendored
+2
@@ -71,6 +71,8 @@
|
||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||
#define cudaMallocManaged hipMallocManaged
|
||||
#define cudaMemAdvise hipMemAdvise
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||
|
||||
@@ -89,10 +89,6 @@ endif()
|
||||
|
||||
add_compile_definitions(GGML_USE_HIP)
|
||||
|
||||
if (GGML_HIP_UMA)
|
||||
add_compile_definitions(GGML_HIP_UMA)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
|
||||
+20
-13
@@ -16,6 +16,14 @@
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(__F16C__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
@@ -140,8 +148,14 @@ struct ggml_map_custom2_op_params {
|
||||
|
||||
struct ggml_map_custom3_op_params {
|
||||
ggml_custom3_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
struct ggml_custom_op_params {
|
||||
ggml_custom_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
// bitset
|
||||
@@ -311,13 +325,6 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
||||
// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
|
||||
//
|
||||
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
@@ -355,8 +362,8 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
float f;
|
||||
double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
@@ -368,8 +375,8 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
double d;
|
||||
ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
|
||||
@@ -354,6 +354,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,
|
||||
@@ -362,6 +363,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,
|
||||
@@ -370,6 +372,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96,
|
||||
@@ -378,6 +381,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96,
|
||||
@@ -386,6 +390,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96,
|
||||
@@ -394,6 +399,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96,
|
||||
@@ -402,6 +408,14 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H192,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,
|
||||
@@ -430,6 +444,13 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_HK576_HV512,
|
||||
GGML_METAL_KERNEL_TYPE_SET_I32,
|
||||
GGML_METAL_KERNEL_TYPE_SET_F32,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
|
||||
@@ -460,6 +481,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_SQRT,
|
||||
GGML_METAL_KERNEL_TYPE_SIN,
|
||||
GGML_METAL_KERNEL_TYPE_COS,
|
||||
GGML_METAL_KERNEL_TYPE_NEG,
|
||||
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
|
||||
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
|
||||
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
|
||||
@@ -1011,6 +1033,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H192, flash_attn_ext_f16_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK192_HV128, flash_attn_ext_f16_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK576_HV512, flash_attn_ext_f16_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64, flash_attn_ext_bf16_h64, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80, flash_attn_ext_bf16_h80, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96, flash_attn_ext_bf16_h96, has_simdgroup_mm && use_bfloat);
|
||||
@@ -1019,6 +1042,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H192, flash_attn_ext_bf16_h192, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK192_HV128, flash_attn_ext_bf16_hk192_hv128, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256, flash_attn_ext_bf16_h256, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK576_HV512, flash_attn_ext_bf16_hk576_hv512, has_simdgroup_mm && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, has_simdgroup_mm);
|
||||
@@ -1027,6 +1051,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H192, flash_attn_ext_q4_0_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK192_HV128, flash_attn_ext_q4_0_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256, flash_attn_ext_q4_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK576_HV512, flash_attn_ext_q4_0_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64, flash_attn_ext_q4_1_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80, flash_attn_ext_q4_1_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96, flash_attn_ext_q4_1_h96, has_simdgroup_mm);
|
||||
@@ -1035,6 +1060,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H192, flash_attn_ext_q4_1_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK192_HV128, flash_attn_ext_q4_1_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256, flash_attn_ext_q4_1_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK576_HV512, flash_attn_ext_q4_1_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64, flash_attn_ext_q5_0_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80, flash_attn_ext_q5_0_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96, flash_attn_ext_q5_0_h96, has_simdgroup_mm);
|
||||
@@ -1043,6 +1069,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H192, flash_attn_ext_q5_0_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK192_HV128, flash_attn_ext_q5_0_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256, flash_attn_ext_q5_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK576_HV512, flash_attn_ext_q5_0_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64, flash_attn_ext_q5_1_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80, flash_attn_ext_q5_1_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96, flash_attn_ext_q5_1_h96, has_simdgroup_mm);
|
||||
@@ -1051,6 +1078,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H192, flash_attn_ext_q5_1_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK192_HV128, flash_attn_ext_q5_1_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256, flash_attn_ext_q5_1_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK576_HV512, flash_attn_ext_q5_1_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64, flash_attn_ext_q8_0_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80, flash_attn_ext_q8_0_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96, flash_attn_ext_q8_0_h96, has_simdgroup_mm);
|
||||
@@ -1059,6 +1087,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H192, flash_attn_ext_q8_0_h192, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96, flash_attn_ext_vec_f16_h96, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96, flash_attn_ext_vec_bf16_h96, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96, flash_attn_ext_vec_q4_0_h96, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H96, flash_attn_ext_vec_q4_1_h96, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H96, flash_attn_ext_vec_q5_0_h96, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H96, flash_attn_ext_vec_q5_1_h96, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H96, flash_attn_ext_vec_q8_0_h96, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128, flash_attn_ext_vec_bf16_h128, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, has_simdgroup_reduction);
|
||||
@@ -1087,6 +1123,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_HK576_HV512, flash_attn_ext_vec_f16_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_HK576_HV512, flash_attn_ext_vec_bf16_hk576_hv512, has_simdgroup_reduction && use_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_HK576_HV512, flash_attn_ext_vec_q4_0_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_HK576_HV512, flash_attn_ext_vec_q4_1_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_HK576_HV512, flash_attn_ext_vec_q5_0_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_HK576_HV512, flash_attn_ext_vec_q5_1_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_HK576_HV512, flash_attn_ext_vec_q8_0_hk576_hv512, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
||||
@@ -1117,6 +1160,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
|
||||
@@ -1278,6 +1322,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_NEG:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
@@ -1334,8 +1379,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_1D:
|
||||
return false;
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
@@ -1345,6 +1391,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_ARANGE:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
if (op->src[0]->ne[0] == 32) {
|
||||
// head size == 32 (e.g. bert-bge-small)
|
||||
// TODO: not sure if it is worth adding kernels for this size
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek sizes
|
||||
// TODO: disabled for now, until optmized
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
return false;
|
||||
}
|
||||
@@ -1957,6 +2013,18 @@ static void ggml_metal_encode_node(
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_NEG:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NEG].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
|
||||
@@ -3837,12 +3905,14 @@ static void ggml_metal_encode_node(
|
||||
// TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
|
||||
// for now avoiding mainly to keep the number of templates/kernels a bit lower
|
||||
// these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
|
||||
if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 192)) {
|
||||
if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
|
||||
@@ -3865,6 +3935,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64 ].pipeline; break;
|
||||
@@ -3887,6 +3959,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64 ].pipeline; break;
|
||||
@@ -3909,6 +3983,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64 ].pipeline; break;
|
||||
@@ -3931,6 +4007,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64 ].pipeline; break;
|
||||
@@ -3953,6 +4031,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64 ].pipeline; break;
|
||||
@@ -3975,6 +4055,8 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
if (ne00 == 192 && ne20 == 128) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128].pipeline;
|
||||
} else if (ne00 == 576 && ne20 == 512) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512].pipeline;
|
||||
} else {
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64 ].pipeline; break;
|
||||
@@ -4004,6 +4086,24 @@ static void ggml_metal_encode_node(
|
||||
use_vec_kernel = true;
|
||||
|
||||
switch (ne00) {
|
||||
case 96:
|
||||
{
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H96].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H96].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H96].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H96].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
|
||||
GGML_LOG_ERROR("add template specialization for this type\n");
|
||||
GGML_ABORT("add template specialization for this type");
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case 128:
|
||||
{
|
||||
switch (src1->type) {
|
||||
@@ -4076,12 +4176,36 @@ static void ggml_metal_encode_node(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case 576:
|
||||
{
|
||||
if (ne20 == 512) {
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_HK576_HV512].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_HK576_HV512].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
|
||||
GGML_LOG_ERROR("add template specialization for this type\n");
|
||||
GGML_ABORT("add template specialization for this type");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_ERROR("unsupported size: %lld\n", ne20);
|
||||
GGML_LOG_ERROR("add template specialization for this size\n");
|
||||
GGML_ABORT("add template specialization for this size");
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
GGML_LOG_ERROR("add template specialization for this size\n");
|
||||
GGML_ABORT("add template specialization for this size");
|
||||
}
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
GGML_LOG_ERROR("add template specialization for this size\n");
|
||||
GGML_ABORT("add template specialization for this size");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -949,6 +949,13 @@ kernel void kernel_cos(
|
||||
dst[tpig] = cos(src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_neg(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = -src0[tpig];
|
||||
}
|
||||
|
||||
kernel void kernel_sum_rows(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
@@ -3546,6 +3553,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 576, 512>;
|
||||
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64, 64>;
|
||||
@@ -3556,6 +3564,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
|
||||
#endif
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64, 64>;
|
||||
@@ -3566,6 +3575,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80, 80>;
|
||||
@@ -3575,6 +3585,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80, 80>;
|
||||
@@ -3584,6 +3595,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80, 80>;
|
||||
@@ -3593,6 +3605,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64, 64>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80, 80>;
|
||||
@@ -3602,6 +3615,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 576, 512>;
|
||||
|
||||
#undef FA_TYPES
|
||||
|
||||
@@ -3959,6 +3973,16 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
|
||||
typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 96, 96, 4>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 96, 96, 4>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 96, 96, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 96, 96, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 96, 96, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 96, 96, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 96, 96, 4>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 128, 128, 4>;
|
||||
@@ -3999,6 +4023,16 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 256, 256, 4>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 256, 256, 4>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 576, 512, 2>;
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 576, 512, 2>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_hk576_hv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 576, 512, 2>;
|
||||
|
||||
#undef FA_TYPES
|
||||
|
||||
template<typename T>
|
||||
|
||||
@@ -54,16 +54,41 @@ function(ggml_opencl_add_kernel KNAME)
|
||||
endfunction()
|
||||
|
||||
set(GGML_OPENCL_KERNELS
|
||||
ggml-opencl
|
||||
ggml-opencl_mm
|
||||
ggml-opencl_cvt
|
||||
ggml-opencl_gemv_noshuffle
|
||||
ggml-opencl_gemv_noshuffle_general
|
||||
ggml-opencl_mul_mat_Ab_Bi_8x4
|
||||
ggml-opencl_transpose_16
|
||||
ggml-opencl_transpose_32
|
||||
ggml-opencl_transpose_32_16
|
||||
ggml-opencl_im2col
|
||||
add
|
||||
clamp
|
||||
cpy
|
||||
cvt
|
||||
diag_mask_inf
|
||||
gelu
|
||||
gemv_noshuffle_general
|
||||
gemv_noshuffle
|
||||
get_rows
|
||||
im2col_f32
|
||||
im2col_f16
|
||||
mul_mat_Ab_Bi_8x4
|
||||
mul_mv_f16_f16
|
||||
mul_mv_f16_f32_1row
|
||||
mul_mv_f16_f32_l4
|
||||
mul_mv_f16_f32
|
||||
mul_mv_f32_f32
|
||||
mul_mv_q4_0_f32
|
||||
mul_mv_q4_0_f32_v
|
||||
mul_mv_q4_0_f32_8x_flat
|
||||
mul_mv_q4_0_f32_1d_8x_flat
|
||||
mul_mv_q4_0_f32_1d_16x_flat
|
||||
mul_mv_q6_k
|
||||
mul
|
||||
norm
|
||||
relu
|
||||
rms_norm
|
||||
rope
|
||||
scale
|
||||
silu
|
||||
softmax_4_f32
|
||||
softmax_4_f16
|
||||
softmax_f32
|
||||
softmax_f16
|
||||
transpose
|
||||
)
|
||||
|
||||
foreach (K ${GGML_OPENCL_KERNELS})
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,83 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// add
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// general-purpose kernel for addition of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
|
||||
// cons: not very efficient
|
||||
kernel void kernel_add(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int i13 = i03 % ne13;
|
||||
int i12 = i02 % ne12;
|
||||
int i11 = i01 % ne11;
|
||||
|
||||
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
|
||||
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) + *((global float *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
}
|
||||
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_add_row(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float4 * dst,
|
||||
ulong offsetd,
|
||||
int ne
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
// This performs better than using %.
|
||||
uint gid = get_global_id(0);
|
||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] + src1[idx1];
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// clamp
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_clamp(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
float min,
|
||||
float max
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = src0[get_global_id(0)] < min ?
|
||||
min :
|
||||
(src0[get_global_id(0)] > max ? max : src0[get_global_id(0)]);
|
||||
}
|
||||
@@ -0,0 +1,184 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// cpy
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
kernel void kernel_cpy_f16_f16(
|
||||
global half * src0,
|
||||
ulong offset0,
|
||||
global half * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global half*)((global char*)src0 + offset0);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
int i3 = n / (ne2*ne1*ne0);
|
||||
int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||
int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
global const half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_f16_f32(
|
||||
global half * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
|
||||
src0 = (global half*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
int i3 = n / (ne2*ne1*ne0);
|
||||
int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||
int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
global half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_f32_f16(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global half * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
int i3 = n / (ne2*ne1*ne0);
|
||||
int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||
int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_f32_f32(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
int i3 = n / (ne2*ne1*ne0);
|
||||
int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||
int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
+39
-27
@@ -1,39 +1,20 @@
|
||||
//------------------------------------------------------------------------------
|
||||
// This file is contains additional kernels for data conversion.
|
||||
// This file is contains kernels for data conversion.
|
||||
// These kernels are used when loading the model, so its performance is less
|
||||
// important.
|
||||
//------------------------------------------------------------------------------
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#elif defined(cl_amd_fp16)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
|
||||
#else
|
||||
#error "Half precision floating point not supportedby OpenCL implementation on your device."
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_subgroups
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#elif defined(cl_intel_subgroups)
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#error "Subgroup not supported on your device."
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
// Always use subgroup size of 32 on Intel.
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
// Always use subgroups size of 64 on Adreno.
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#else
|
||||
// TODO: do not know how to choose subgroup size on other GPUs.
|
||||
#error "Selecting subgroup size is not supported on your device."
|
||||
#endif
|
||||
|
||||
#define QK4_0 32
|
||||
@@ -66,13 +47,44 @@ struct block_q4_0
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// mul_vec_q_n_f32_flat_noshuffle
|
||||
//
|
||||
// This variation uses flat arrays (struct of arrays, SOA) representation for
|
||||
// quant tensors. It also uses non shuffled bit order for weights.
|
||||
//
|
||||
// The shuffled version is kept in the original file because moving it here
|
||||
// seems to result in worse performance for adreno.
|
||||
// kernel_convert_block_q4_0
|
||||
// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
|
||||
// This kernel does not deshuffle the bits.
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_convert_block_q4_0(
|
||||
global struct block_q4_0 * src0,
|
||||
global uchar * dst_q,
|
||||
global half * dst_d
|
||||
) {
|
||||
global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
|
||||
for (int i = 0; i < QK4_0/2; ++i) {
|
||||
q[i] = b->qs[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q4_0(
|
||||
global uchar * src_q,
|
||||
global half * src_d,
|
||||
global struct block_q4_0 * dst
|
||||
) {
|
||||
global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
for (int i = 0; i < QK4_0/2; ++i) {
|
||||
b->qs[i] = q[i];
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q4_0_noshuffle
|
||||
// Flatten q4_0 weights and unshuffle the bits
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
kernel void kernel_convert_block_q4_0_noshuffle(
|
||||
@@ -0,0 +1,58 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// diag_mask_inf kernels
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_diag_mask_inf(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int n_past
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i02 = get_global_id(2);
|
||||
int i01 = get_global_id(1);
|
||||
int i00 = get_global_id(0);
|
||||
|
||||
if (i00 > n_past + i01) {
|
||||
dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
|
||||
} else {
|
||||
dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_diag_mask_inf_8(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int n_past
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
int i = 2*get_global_id(0);
|
||||
|
||||
dst[i+0] = src0[i+0];
|
||||
dst[i+1] = src0[i+1];
|
||||
int i4 = 4*i;
|
||||
int i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
|
||||
int i01 = i4/(ne00); i4 -= i01*ne00;
|
||||
int i00 = i4;
|
||||
for (int k = 3; k >= 0; --k) {
|
||||
if (i00 + 4 + k <= n_past + i01) {
|
||||
break;
|
||||
}
|
||||
(&dst[i+1])[k] = -INFINITY;
|
||||
if (i00 + k > n_past + i01) {
|
||||
(&dst[i])[k] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// gelu
|
||||
//------------------------------------------------------------------------------
|
||||
#define GELU_COEF_A 0.044715f
|
||||
#define GELU_QUICK_COEF -1.702f
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
|
||||
|
||||
kernel void kernel_gelu(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
float x = src0[get_global_id(0)];
|
||||
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_4(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
float4 x = src0[get_global_id(0)];
|
||||
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_quick(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
float x = src0[get_global_id(0)];
|
||||
dst[get_global_id(0)] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_quick_4(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
float4 x = src0[get_global_id(0)];
|
||||
dst[get_global_id(0)] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
|
||||
}
|
||||
@@ -0,0 +1,163 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef short int16_t;
|
||||
typedef ushort uint16_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
#define QK4_0 32
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_0
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q4_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// dequantize_q4_0_f32, dequantize_q4_0_f16
|
||||
//------------------------------------------------------------------------------
|
||||
void dequantize_q4_0_f32(global struct block_q4_0 * xb, short il, float16 * reg) {
|
||||
global ushort * qs = ((global ushort *)xb + 1);
|
||||
float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
float d2 = d1 / 256.f;
|
||||
float md = -8.h * xb->d;
|
||||
ushort mask0 = il ? 0x00F0 : 0x000F;
|
||||
ushort mask1 = mask0 << 8;
|
||||
|
||||
reg->s0 = d1 * (qs[0] & mask0) + md;
|
||||
reg->s1 = d2 * (qs[0] & mask1) + md;
|
||||
|
||||
reg->s2 = d1 * (qs[1] & mask0) + md;
|
||||
reg->s3 = d2 * (qs[1] & mask1) + md;
|
||||
|
||||
reg->s4 = d1 * (qs[2] & mask0) + md;
|
||||
reg->s5 = d2 * (qs[2] & mask1) + md;
|
||||
|
||||
reg->s6 = d1 * (qs[3] & mask0) + md;
|
||||
reg->s7 = d2 * (qs[3] & mask1) + md;
|
||||
|
||||
reg->s8 = d1 * (qs[4] & mask0) + md;
|
||||
reg->s9 = d2 * (qs[4] & mask1) + md;
|
||||
|
||||
reg->sa = d1 * (qs[5] & mask0) + md;
|
||||
reg->sb = d2 * (qs[5] & mask1) + md;
|
||||
|
||||
reg->sc = d1 * (qs[6] & mask0) + md;
|
||||
reg->sd = d2 * (qs[6] & mask1) + md;
|
||||
|
||||
reg->se = d1 * (qs[7] & mask0) + md;
|
||||
reg->sf = d2 * (qs[7] & mask1) + md;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// get_rows
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_get_rows_f32(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global int * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
int ne10,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb1,
|
||||
ulong nb2
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i10 = get_group_id(0);
|
||||
int i11 = get_group_id(1);
|
||||
|
||||
int r = ((global int *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
int i02 = i11;
|
||||
|
||||
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
|
||||
((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
|
||||
((global float *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_f16(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global int * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
int ne10,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb1,
|
||||
ulong nb2
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int i10 = get_group_id(0);
|
||||
int i11 = get_group_id(1);
|
||||
|
||||
int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
int i02 = i11;
|
||||
|
||||
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
|
||||
((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
|
||||
((global half *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_q4_0(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global int * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
int ne10,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb1,
|
||||
ulong nb2
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global int*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
const int NL = 2;
|
||||
|
||||
int i10 = get_group_id(0);
|
||||
int i11 = get_group_id(1);
|
||||
|
||||
int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
int i02 = i11;
|
||||
|
||||
for (int ind = get_local_id(0); ind < ne00/16; ind += get_local_size(0)) {
|
||||
float16 temp;
|
||||
dequantize_q4_0_f32(
|
||||
((global struct block_q4_0 *) ((global char *) src0 + r*nb01 + i02*nb02)) + ind/NL, ind%NL, &temp);
|
||||
*(((global float16 *) ((global char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,146 +0,0 @@
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#elif defined(cl_amd_fp16)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
|
||||
#else
|
||||
#error "Half precision floating point not supportedby OpenCL implementation on your device."
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_subgroups
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#elif defined(cl_intel_subgroups)
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#error "Subgroup not supported on your device."
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
// Always use subgroup size of 32 on Intel.
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
// Always use subgroups size of 64 on Adreno.
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#else
|
||||
// TODO: do not know how to choose subgroup size on other GPUs.
|
||||
#error "Selecting subgroup size is not supported on your device."
|
||||
#endif
|
||||
|
||||
kernel void kernel_im2col_f32(
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
ulong batch_offset,
|
||||
ulong delta_offset,
|
||||
long IW,
|
||||
long IH,
|
||||
long IC,
|
||||
long OW,
|
||||
long OH,
|
||||
long KW,
|
||||
long KH,
|
||||
long pelements,
|
||||
long CHW,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1
|
||||
) {
|
||||
// threadIdx.x + blockIdx.x * blockDim.x
|
||||
long i = get_global_id(0);
|
||||
if (i >= pelements) {
|
||||
return;
|
||||
}
|
||||
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
long ksize = OW * (KH > 1 ? KW : 1);
|
||||
long kx = i / ksize;
|
||||
long kd = kx * ksize;
|
||||
long ky = (i - kd) / OW;
|
||||
long ix = i % OW;
|
||||
|
||||
long oh = get_group_id(1);
|
||||
long batch = get_group_id(2) / IC;
|
||||
long ic = get_group_id(2) % IC;
|
||||
|
||||
long iiw = ix * s0 + kx * d0 - p0;
|
||||
long iih = oh * s1 + ky * d1 - p1;
|
||||
|
||||
long offset_dst =
|
||||
((batch * OH + oh) * OW + ix) * CHW +
|
||||
(ic * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
long offset_src = ic * delta_offset + batch * batch_offset;
|
||||
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_im2col_f16(
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global half * dst,
|
||||
ulong offsetd,
|
||||
ulong batch_offset,
|
||||
ulong delta_offset,
|
||||
long IW,
|
||||
long IH,
|
||||
long IC,
|
||||
long OW,
|
||||
long OH,
|
||||
long KW,
|
||||
long KH,
|
||||
long pelements,
|
||||
long CHW,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1
|
||||
) {
|
||||
long i = get_global_id(0);
|
||||
|
||||
if (i >= pelements) {
|
||||
return;
|
||||
}
|
||||
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
long ksize = OW * (KH > 1 ? KW : 1);
|
||||
long kx = i / ksize;
|
||||
long kd = kx * ksize;
|
||||
long ky = (i - kd) / OW;
|
||||
long ix = i % OW;
|
||||
|
||||
long oh = get_group_id(1);
|
||||
long batch = get_group_id(2) / IC;
|
||||
long ic = get_group_id(2) % IC;
|
||||
|
||||
long iiw = ix * s0 + kx * d0 - p0;
|
||||
long iih = oh * s1 + ky * d1 - p1;
|
||||
|
||||
long offset_dst =
|
||||
((batch * OH + oh) * OW + ix) * CHW +
|
||||
(ic * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
long offset_src = ic * delta_offset + batch * batch_offset;
|
||||
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,26 +0,0 @@
|
||||
// 16-bit transpose, loading/storing a 4x4 tile of elements
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
kernel void kernel_transpose_16(
|
||||
__read_only image1d_buffer_t input,
|
||||
__write_only image1d_buffer_t output,
|
||||
const uint rows,
|
||||
const uint cols
|
||||
) {
|
||||
|
||||
const int i = get_global_id(0);
|
||||
const int j = get_global_id(1);
|
||||
const int i_2 = i<<2;
|
||||
const int j_2 = j<<2;
|
||||
|
||||
half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
|
||||
half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
|
||||
half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
|
||||
half4 temp3 = read_imageh(input, (j_2+3)*cols+i);
|
||||
|
||||
write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
|
||||
write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
|
||||
write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
|
||||
write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
// 32-bit transpose, loading/storing a 4x4 tile of elements
|
||||
|
||||
kernel void kernel_transpose_32(
|
||||
__read_only image1d_buffer_t input,
|
||||
__write_only image1d_buffer_t output,
|
||||
const uint rows,
|
||||
const uint cols
|
||||
) {
|
||||
|
||||
const int i = get_global_id(0);
|
||||
const int j = get_global_id(1);
|
||||
const int i_2 = i<<2;
|
||||
const int j_2 = j<<2;
|
||||
|
||||
float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
|
||||
float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
|
||||
float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
|
||||
float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
|
||||
|
||||
write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
|
||||
write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
|
||||
write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
|
||||
write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
|
||||
|
||||
}
|
||||
@@ -1,35 +0,0 @@
|
||||
// 32-bit transpose, loading/storing a 4x4 tile of elements
|
||||
// Only used for activations
|
||||
// converts to FP16
|
||||
// also adds zero padding for non multiple of 8 prompt lengths
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
|
||||
|
||||
const int i = get_global_id(0);
|
||||
const int j = get_global_id(1);
|
||||
const int i_2 = i<<2;
|
||||
const int j_2 = j<<2;
|
||||
half4 temp0 = {0,0,0,0}; // initialize outputs to 0
|
||||
half4 temp1 = {0,0,0,0};
|
||||
half4 temp2 = {0,0,0,0};
|
||||
half4 temp3 = {0,0,0,0};
|
||||
|
||||
if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
|
||||
temp0 = read_imageh(input, (j_2+0)*cols+i);
|
||||
}
|
||||
if((j_2+1)*cols+i*4+3 < rows*cols*16){
|
||||
temp1 = read_imageh(input, (j_2+1)*cols+i);
|
||||
}
|
||||
if((j_2+2)*cols+i*4+3 < rows*cols*16){
|
||||
temp2 = read_imageh(input, (j_2+2)*cols+i);
|
||||
}
|
||||
if((j_2+3)*cols+i*4+3 < rows*cols*16){
|
||||
temp3 = read_imageh(input, (j_2+3)*cols+i);
|
||||
}
|
||||
|
||||
write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
|
||||
write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
|
||||
write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
|
||||
write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
kernel void kernel_im2col_f16(
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global half * dst,
|
||||
ulong offsetd,
|
||||
ulong batch_offset,
|
||||
ulong delta_offset,
|
||||
long IW,
|
||||
long IH,
|
||||
long IC,
|
||||
long OW,
|
||||
long OH,
|
||||
long KW,
|
||||
long KH,
|
||||
long pelements,
|
||||
long CHW,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1
|
||||
) {
|
||||
long i = get_global_id(0);
|
||||
if (i >= pelements) {
|
||||
return;
|
||||
}
|
||||
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
long ksize = OW * (KH > 1 ? KW : 1);
|
||||
long kx = i / ksize;
|
||||
long kd = kx * ksize;
|
||||
long ky = (i - kd) / OW;
|
||||
long ix = i % OW;
|
||||
|
||||
long oh = get_group_id(1);
|
||||
long batch = get_group_id(2) / IC;
|
||||
long ic = get_group_id(2) % IC;
|
||||
|
||||
long iiw = ix * s0 + kx * d0 - p0;
|
||||
long iih = oh * s1 + ky * d1 - p1;
|
||||
|
||||
long offset_dst =
|
||||
((batch * OH + oh) * OW + ix) * CHW +
|
||||
(ic * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
long offset_src = ic * delta_offset + batch * batch_offset;
|
||||
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
kernel void kernel_im2col_f32(
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
ulong batch_offset,
|
||||
ulong delta_offset,
|
||||
long IW,
|
||||
long IH,
|
||||
long IC,
|
||||
long OW,
|
||||
long OH,
|
||||
long KW,
|
||||
long KH,
|
||||
long pelements,
|
||||
long CHW,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1
|
||||
) {
|
||||
long i = get_global_id(0);
|
||||
if (i >= pelements) {
|
||||
return;
|
||||
}
|
||||
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
long ksize = OW * (KH > 1 ? KW : 1);
|
||||
long kx = i / ksize;
|
||||
long kd = kx * ksize;
|
||||
long ky = (i - kd) / OW;
|
||||
long ix = i % OW;
|
||||
|
||||
long oh = get_group_id(1);
|
||||
long batch = get_group_id(2) / IC;
|
||||
long ic = get_group_id(2) % IC;
|
||||
|
||||
long iiw = ix * s0 + kx * d0 - p0;
|
||||
long iih = oh * s1 + ky * d1 - p1;
|
||||
|
||||
long offset_dst =
|
||||
((batch * OH + oh) * OW + ix) * CHW +
|
||||
(ic * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
long offset_src = ic * delta_offset + batch * batch_offset;
|
||||
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// mul
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_mul(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int i13 = i03 % ne13;
|
||||
int i12 = i02 % ne12;
|
||||
int i11 = i01 % ne11;
|
||||
|
||||
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
|
||||
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) * *((global float *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
}
|
||||
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_mul_row(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float4 * dst,
|
||||
ulong offsetd,
|
||||
int ne
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
// This performs better than using %.
|
||||
uint gid = get_global_id(0);
|
||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] * src1[idx1];
|
||||
}
|
||||
@@ -0,0 +1,118 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define N_F16_F16 4
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mat_f16_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3)
|
||||
{
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int rb = get_group_id(1)*N_F16_F16;
|
||||
int im = get_group_id(2);
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
||||
|
||||
global half * x = (global half *) (src0 + offset_src0);
|
||||
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F16_F16; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
||||
|
||||
global half * y = (global half *) (src1 + offset_src1);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
|
||||
sumf += (half) x[i] * (half) y[i];
|
||||
}
|
||||
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
global half4 * x4 = (global half4 *)x;
|
||||
for (int row = 0; row < N_F16_F16; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
||||
|
||||
global half * y = (global half *) (src1 + offset_src1);
|
||||
global half4 * y4 = (global half4 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
||||
sumf += (half) x4[i].s0 * y4[i].s0;
|
||||
sumf += (half) x4[i].s1 * y4[i].s1;
|
||||
sumf += (half) x4[i].s2 * y4[i].s2;
|
||||
sumf += (half) x4[i].s3 * y4[i].s3;
|
||||
}
|
||||
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) {
|
||||
all_sum += (half) x[i] * y[i];
|
||||
}
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,118 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mat_f16_f32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int rb = get_group_id(1)*N_F16_F32;
|
||||
int im = get_group_id(2);
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
||||
|
||||
global half * x = (global half *) (src0 + offset_src0);
|
||||
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F16_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
||||
|
||||
global float * y = (global float *) (src1 + offset_src1);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
|
||||
sumf += convert_float(x[i]) * y[i];
|
||||
}
|
||||
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
global half4 * x4 = (global half4 *)x;
|
||||
for (int row = 0; row < N_F16_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
||||
|
||||
global float * y = (global float *) (src1 + offset_src1);
|
||||
global float4 * y4 = (global float4 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
||||
sumf += convert_float(x4[i].s0) * y4[i].s0;
|
||||
sumf += convert_float(x4[i].s1) * y4[i].s1;
|
||||
sumf += convert_float(x4[i].s2) * y4[i].s2;
|
||||
sumf += convert_float(x4[i].s3) * y4[i].s3;
|
||||
}
|
||||
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) {
|
||||
all_sum += (float) x[i] * y[i];
|
||||
}
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mat_f16_f32_1row(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = get_group_id(2);
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
||||
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
||||
|
||||
global half * x = (global half *) (src0 + offset_src0);
|
||||
global float * y = (global float *) (src1 + offset_src1);
|
||||
|
||||
float sumf = 0;
|
||||
if (ne00 < 128) {
|
||||
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
|
||||
sumf += (float) x[i] * (float) y[i];
|
||||
}
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
} else {
|
||||
global half4 * x4 = (global half4 *) x;
|
||||
global float4 * y4 = (global float4 *) y;
|
||||
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
||||
sumf += (float) x4[i].s0 * y4[i].s0;
|
||||
sumf += (float) x4[i].s1 * y4[i].s1;
|
||||
sumf += (float) x4[i].s2 * y4[i].s2;
|
||||
sumf += (float) x4[i].s3 * y4[i].s3;
|
||||
}
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) {
|
||||
all_sum += (float) x[i] * y[i];
|
||||
}
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
// Assumes row size (ne00) is a multiple of 4
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mat_f16_f32_l4(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int nrows = ne11;
|
||||
int r0 = get_group_id(0);
|
||||
int im = get_group_id(2);
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
||||
|
||||
global half4 * x4 = (global half4 *) (src0 + offset_src0);
|
||||
|
||||
for (int r1 = 0; r1 < nrows; ++r1) {
|
||||
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
||||
|
||||
global float4 * y4 = (global float4 *) (src1 + offset_src1);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
||||
sumf += convert_float(x4[i].s0) * y4[i].s0;
|
||||
sumf += convert_float(x4[i].s1) * y4[i].s1;
|
||||
sumf += convert_float(x4[i].s2) * y4[i].s2;
|
||||
sumf += convert_float(x4[i].s3) * y4[i].s3;
|
||||
}
|
||||
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,118 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define N_F32_F32 4
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mat_f32_f32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int rb = get_group_id(1)*N_F32_F32;
|
||||
int im = get_group_id(2);
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
|
||||
|
||||
global float * x = (global float *) (src0 + offset_src0);
|
||||
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F32_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
||||
|
||||
global float * y = (global float *) (src1 + offset_src1);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
|
||||
sumf += (float) x[i] * (float) y[i];
|
||||
}
|
||||
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
global float4 * x4 = (global float4 *)x;
|
||||
for (int row = 0; row < N_F32_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
|
||||
|
||||
global float * y = (global float *) (src1 + offset_src1);
|
||||
global float4 * y4 = (global float4 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
|
||||
sumf += (float) x4[i].s0 * y4[i].s0;
|
||||
sumf += (float) x4[i].s1 * y4[i].s1;
|
||||
sumf += (float) x4[i].s2 * y4[i].s2;
|
||||
sumf += (float) x4[i].s3 * y4[i].s3;
|
||||
}
|
||||
|
||||
float all_sum = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) {
|
||||
all_sum += (float) x[i] * y[i];
|
||||
}
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,192 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK4_0 32
|
||||
#define QR4_0 2
|
||||
#define QK4_1 32
|
||||
#define QR4_1 2
|
||||
#define QK5_0 32
|
||||
#define QR5_0 2
|
||||
#define QK5_1 32
|
||||
#define QR5_1 2
|
||||
#define QK8_0 32
|
||||
#define QR8_0 1
|
||||
#define QK_K 256
|
||||
#define K_QUANTS_PER_ITERATION 2
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef short int16_t;
|
||||
typedef ushort uint16_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_0
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q4_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// mul_vec_q_n_f32
|
||||
//------------------------------------------------------------------------------
|
||||
// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
|
||||
// il indicates where the q4 quants begin (0 or QK4_0/4)
|
||||
// we assume that the yl's have been multiplied with the appropriate scale factor
|
||||
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
|
||||
inline float block_q_4_0_dot_y(
|
||||
global struct block_q4_0 * qb_curr,
|
||||
float sumy,
|
||||
private float * yl,
|
||||
int il
|
||||
) {
|
||||
float d = qb_curr->d;
|
||||
float2 acc = 0.f;
|
||||
global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
|
||||
for (int i = 0; i < 8; i+=2) {
|
||||
acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F)
|
||||
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
|
||||
acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
|
||||
+ yl[i + 9] * (qs[i / 2] & 0xF000);
|
||||
}
|
||||
return d * (sumy * -8.f + acc.s0 + acc.s1);
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 4
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
inline void mul_vec_q_n_f32(
|
||||
global void * src0,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
|
||||
const ulong nb = ne00/QK4_0;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = get_group_id(2);
|
||||
|
||||
// (r0 * N_SIMDGROUP + get_sub_group_id()) is essenatially the linear global
|
||||
// id of a SIMD group in the grid.
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[N_DST]={0.f};
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix * QK4_0 + il;
|
||||
|
||||
// each thread in a SIMD group deals with half a block.
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
sumy += yb[i] + yb[i+1];
|
||||
yl[i+0] = yb[i+ 0];
|
||||
yl[i+1] = yb[i+ 1]/256.f;
|
||||
sumy += yb[i+16] + yb[i+17];
|
||||
yl[i+8] = yb[i+16]/16.f;
|
||||
yl[i+9] = yb[i+17]/4096.f;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il);
|
||||
}
|
||||
|
||||
// One thread in a SIMD group (i.e., subgroup) handles a half block,
|
||||
// hence then entire SIMD group handles SIMDWIDTH/2 blocks.
|
||||
// y points to the activation matrix (of type float). Therefore for
|
||||
// one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
|
||||
// SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
|
||||
// floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
|
||||
yb += QK4_0 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
// The above does not work for Adreno - it produces incorrect results for
|
||||
// row = 1, 2, 3 and only row = 0 gives the correct result.
|
||||
// If N_DST is changed, the below array must be initialized accordingly.
|
||||
// This also seems to perform better on Intel.
|
||||
float tot[N_DST] = {
|
||||
sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
|
||||
sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mat_q4_0_f32(
|
||||
global void * src0,
|
||||
ulong offset0,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global void*)((global char*)src0 + offset0);
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
@@ -0,0 +1,307 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK4_0 32
|
||||
#define QR4_0 2
|
||||
#define QK4_1 32
|
||||
#define QR4_1 2
|
||||
#define QK5_0 32
|
||||
#define QR5_0 2
|
||||
#define QK5_1 32
|
||||
#define QR5_1 2
|
||||
#define QK8_0 32
|
||||
#define QR8_0 1
|
||||
#define QK_K 256
|
||||
#define K_QUANTS_PER_ITERATION 2
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef short int16_t;
|
||||
typedef ushort uint16_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_0
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q4_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
inline float mm_block_q_4_0_dot_y_flat(
|
||||
global uchar * x,
|
||||
global half * dh,
|
||||
float sumy,
|
||||
float16 yl,
|
||||
int il
|
||||
) {
|
||||
float d = *dh;
|
||||
global ushort * qs = ((global ushort *)x + il/2);
|
||||
float acc = 0.f;
|
||||
|
||||
acc += yl.s0 * (qs[0] & 0x000F);
|
||||
acc += yl.s1 * (qs[0] & 0x0F00);
|
||||
acc += yl.s8 * (qs[0] & 0x00F0);
|
||||
acc += yl.s9 * (qs[0] & 0xF000);
|
||||
|
||||
acc += yl.s2 * (qs[1] & 0x000F);
|
||||
acc += yl.s3 * (qs[1] & 0x0F00);
|
||||
acc += yl.sa * (qs[1] & 0x00F0);
|
||||
acc += yl.sb * (qs[1] & 0xF000);
|
||||
|
||||
acc += yl.s4 * (qs[2] & 0x000F);
|
||||
acc += yl.s5 * (qs[2] & 0x0F00);
|
||||
acc += yl.sc * (qs[2] & 0x00F0);
|
||||
acc += yl.sd * (qs[2] & 0xF000);
|
||||
|
||||
acc += yl.s6 * (qs[3] & 0x000F);
|
||||
acc += yl.s7 * (qs[3] & 0x0F00);
|
||||
acc += yl.se * (qs[3] & 0x00F0);
|
||||
acc += yl.sf * (qs[3] & 0xF000);
|
||||
|
||||
return d * (sumy * -8.f + acc);
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 16 // each SIMD group works on 8 rows (in weights matrix)
|
||||
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 16
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
//
|
||||
// This variant performs 1d blocking with 16x output.
|
||||
// Eeach simdgroup outputs 16 values on `n0` dim (row in the output matrix).
|
||||
//
|
||||
inline void mul_mat_q_n_f32_1d_16x_flat(
|
||||
global uchar * src0_q,
|
||||
global half * src0_d,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
const int nb = ne00/QK4_0;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = get_group_id(2);
|
||||
|
||||
// (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
|
||||
// a SIMD group in the grid. Each SIMD group produces N_DST values in the
|
||||
// result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
|
||||
// Currently with llama2 7B, im is always 0.
|
||||
// TODO: how to handle im/gqa*(nb*ne0)?
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
// The number of scales is the same as the number of blocks.
|
||||
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
|
||||
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
|
||||
|
||||
global uchar * x = (global uchar *) src0_q + offset0_q;
|
||||
global half * d = (global half *) src0_d + offset0_d;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float16 yl;
|
||||
float16 sumf = (float16)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
|
||||
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix*QK4_0 + il;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0.f;
|
||||
|
||||
sumy += yb[0];
|
||||
sumy += yb[1];
|
||||
sumy += yb[2];
|
||||
sumy += yb[3];
|
||||
sumy += yb[4];
|
||||
sumy += yb[5];
|
||||
sumy += yb[6];
|
||||
sumy += yb[7];
|
||||
|
||||
sumy += yb[16];
|
||||
sumy += yb[17];
|
||||
sumy += yb[18];
|
||||
sumy += yb[19];
|
||||
sumy += yb[20];
|
||||
sumy += yb[21];
|
||||
sumy += yb[22];
|
||||
sumy += yb[23];
|
||||
|
||||
yl.s0 = yb[0];
|
||||
yl.s1 = yb[1]/256.f;
|
||||
|
||||
yl.s2 = yb[2];
|
||||
yl.s3 = yb[3]/256.f;
|
||||
|
||||
yl.s4 = yb[4];
|
||||
yl.s5 = yb[5]/256.f;
|
||||
|
||||
yl.s6 = yb[6];
|
||||
yl.s7 = yb[7]/256.f;
|
||||
|
||||
yl.s8 = yb[16]/16.f;
|
||||
yl.s9 = yb[17]/4096.f;
|
||||
|
||||
yl.sa = yb[18]/16.f;
|
||||
yl.sb = yb[19]/4096.f;
|
||||
|
||||
yl.sc = yb[20]/16.f;
|
||||
yl.sd = yb[21]/4096.f;
|
||||
|
||||
yl.se = yb[22]/16.f;
|
||||
yl.sf = yb[23]/4096.f;
|
||||
|
||||
sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
|
||||
sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
|
||||
sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
|
||||
sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
|
||||
|
||||
sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
|
||||
sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
|
||||
sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
|
||||
sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
|
||||
|
||||
sumf.s8 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 8*nb*QK4_0/2, d + ib + 8*nb, sumy, yl, il);
|
||||
sumf.s9 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 9*nb*QK4_0/2, d + ib + 9*nb, sumy, yl, il);
|
||||
sumf.sa += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 10*nb*QK4_0/2, d + ib + 10*nb, sumy, yl, il);
|
||||
sumf.sb += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 11*nb*QK4_0/2, d + ib + 11*nb, sumy, yl, il);
|
||||
|
||||
sumf.sc += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 12*nb*QK4_0/2, d + ib + 12*nb, sumy, yl, il);
|
||||
sumf.sd += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 13*nb*QK4_0/2, d + ib + 13*nb, sumy, yl, il);
|
||||
sumf.se += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 14*nb*QK4_0/2, d + ib + 14*nb, sumy, yl, il);
|
||||
sumf.sf += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 15*nb*QK4_0/2, d + ib + 15*nb, sumy, yl, il);
|
||||
|
||||
yb += QK4_0 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
float16 tot = (float16)(
|
||||
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
|
||||
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
|
||||
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7),
|
||||
|
||||
sub_group_reduce_add(sumf.s8), sub_group_reduce_add(sumf.s9),
|
||||
sub_group_reduce_add(sumf.sa), sub_group_reduce_add(sumf.sb),
|
||||
sub_group_reduce_add(sumf.sc), sub_group_reduce_add(sumf.sd),
|
||||
sub_group_reduce_add(sumf.se), sub_group_reduce_add(sumf.sf)
|
||||
);
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
}
|
||||
|
||||
if (first_row + 4 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
|
||||
}
|
||||
if (first_row + 5 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
|
||||
}
|
||||
if (first_row + 6 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
|
||||
}
|
||||
if (first_row + 7 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
|
||||
}
|
||||
|
||||
if (first_row + 8 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 8] = tot.s8;
|
||||
}
|
||||
if (first_row + 9 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 9] = tot.s9;
|
||||
}
|
||||
if (first_row + 10 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 10] = tot.sa;
|
||||
}
|
||||
if (first_row + 11 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 11] = tot.sb;
|
||||
}
|
||||
|
||||
if (first_row + 12 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 12] = tot.sc;
|
||||
}
|
||||
if (first_row + 13 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 13] = tot.sd;
|
||||
}
|
||||
if (first_row + 14 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 14] = tot.se;
|
||||
}
|
||||
if (first_row + 15 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 15] = tot.sf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
|
||||
global uchar * src0_q,
|
||||
global half * src0_d,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
mul_mat_q_n_f32_1d_16x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user