mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-14 09:46:43 +02:00
Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e805f0fa99 | |||
| af3ba5d946 | |||
| e1e721094d | |||
| 128dcbd3c9 | |||
| 4d0924a890 | |||
| 8ca511cade | |||
| d71ac90985 | |||
| ce32060198 | |||
| 1cfb5372cf | |||
| d3bac7d584 | |||
| 5cb04dbc16 | |||
| efb7bdbbd0 | |||
| 15606309a0 | |||
| b2b9f025e7 | |||
| dabcc5b471 | |||
| f8e9140cb4 | |||
| d62520eb2c | |||
| 01684139c3 | |||
| e8dc55d006 | |||
| e0085fdf7c | |||
| e6f291d158 | |||
| 4003be0e5f | |||
| fea4fd4ba7 | |||
| 8f8ddfcfad | |||
| 6fb50ebbf0 | |||
| 625a699b54 | |||
| a4b07c057a | |||
| 549a1e6cd5 | |||
| 5f14ee0b0c |
@@ -356,6 +356,8 @@ jobs:
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'vulkan'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -406,7 +408,7 @@ jobs:
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
@@ -451,7 +453,7 @@ jobs:
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
@@ -565,6 +567,31 @@ jobs:
|
||||
path: |
|
||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-latest-cmake-sycl:
|
||||
runs-on: windows-latest
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
|
||||
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install
|
||||
run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
name: EditorConfig Checker
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
@@ -89,3 +89,4 @@ examples/jeopardy/results.txt
|
||||
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
nppBackup
|
||||
|
||||
+7
-7
@@ -423,10 +423,7 @@ if (LLAMA_VULKAN)
|
||||
if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
set(GGML_HEADERS_VULKAN ggml-vulkan.h)
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
|
||||
|
||||
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
|
||||
add_library(ggml-vulkan OBJECT ggml-vulkan.cpp ggml-vulkan.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
@@ -507,7 +504,11 @@ if (LLAMA_SYCL)
|
||||
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
|
||||
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
if (WIN32)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
|
||||
else()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_KOMPUTE)
|
||||
@@ -1008,7 +1009,6 @@ add_library(ggml OBJECT
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
@@ -1090,7 +1090,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_VULKAN}"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
|
||||
@@ -586,8 +586,11 @@ train.o: common/train.cpp common/train.h
|
||||
libllama.so: llama.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
||||
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
||||
|
||||
clean:
|
||||
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
|
||||
#
|
||||
# Examples
|
||||
|
||||
+472
@@ -0,0 +1,472 @@
|
||||
# llama.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Windows](#windows)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Q&A](#q&a)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
|
||||
|
||||
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
|
||||
|
||||
The llama.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Support|Windows 11|
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
### Verified
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770, 730M|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
|
||||
|
||||
### Memory
|
||||
|
||||
The memory is a limitation to run LLM on GPUs.
|
||||
|
||||
When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`.
|
||||
|
||||
For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
|
||||
|
||||
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
cd ..
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run-llama2.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
## Windows
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
||||
|
||||
Note: **The driver is mandatory for compute function**.
|
||||
|
||||
2. Install Visual Studio.
|
||||
|
||||
Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
|
||||
|
||||
3. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Enable oneAPI running environment:
|
||||
|
||||
- In Search, input 'oneAPI'.
|
||||
|
||||
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
|
||||
|
||||
- In Run:
|
||||
|
||||
In CMD:
|
||||
```
|
||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||
```
|
||||
|
||||
c. Check GPU
|
||||
|
||||
In oneAPI command line:
|
||||
|
||||
```
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
|
||||
```
|
||||
|
||||
4. Install cmake & make
|
||||
|
||||
a. Download & install cmake for Windows: https://cmake.org/download/
|
||||
|
||||
b. Download & install make for Windows provided by mingw-w64
|
||||
|
||||
- Download binary package for Windows in https://github.com/niXman/mingw-builds-binaries/releases.
|
||||
|
||||
Like [x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z](https://github.com/niXman/mingw-builds-binaries/releases/download/13.2.0-rt_v11-rev1/x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z).
|
||||
|
||||
- Unzip the binary package. In the **bin** sub-folder and rename **xxx-make.exe** to **make.exe**.
|
||||
|
||||
- Add the **bin** folder path in the Windows system PATH environment.
|
||||
|
||||
### Build locally:
|
||||
|
||||
In oneAPI command line window:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
|
||||
:: build example/main only
|
||||
:: make main
|
||||
|
||||
:: build all binary
|
||||
make -j
|
||||
cd ..
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
.\examples\sycl\win-build-sycl.bat
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
- In Search, input 'oneAPI'.
|
||||
|
||||
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
|
||||
|
||||
- In Run:
|
||||
|
||||
In CMD:
|
||||
```
|
||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
build\bin\ls-sycl-device.exe
|
||||
|
||||
or
|
||||
|
||||
build\bin\main.exe
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
|
||||
Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
set GGML_SYCL_DEVICE=0
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
.\examples\sycl\win-run-llama2.bat
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|
||||
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap** or **--mmap 0**.
|
||||
|
||||
## Q&A
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
- In Windows, no result, not error.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
- Meet compile error.
|
||||
|
||||
Remove folder **build** and try again.
|
||||
|
||||
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
|
||||
|
||||
Please run **sudo sycl-ls**.
|
||||
|
||||
If you see it in result, please add video/render group to your ID:
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Then **relogin**.
|
||||
|
||||
If you do not see it, please check the installation GPU steps again.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support multiple cards.
|
||||
@@ -10,6 +10,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
|
||||
- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
|
||||
- [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
|
||||
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
|
||||
- Collecting Apple Silicon performance stats:
|
||||
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
@@ -604,7 +607,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
|
||||
|
||||
|
||||
### Prepare Data & Run
|
||||
|
||||
-252
@@ -1,252 +0,0 @@
|
||||
# llama.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
|
||||
|
||||
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
|
||||
|
||||
The llama.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_llama2.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|
||||
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
||||
+30
-28
@@ -583,20 +583,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
}
|
||||
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
}
|
||||
} else if (arg == "--main-gpu" || arg == "-mg") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -637,11 +637,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
if (split_arg.size() >= LLAMA_MAX_DEVICES) {
|
||||
if (split_arg.size() >= llama_max_devices()) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||
} else {
|
||||
@@ -989,30 +989,30 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
||||
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
|
||||
if (llama_mlock_supported()) {
|
||||
if (llama_supports_mlock()) {
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported()) {
|
||||
if (llama_supports_mmap()) {
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -ngld N, --n-gpu-layers-draft N\n");
|
||||
printf(" number of layers to store in VRAM for the draft model\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||
printf(" - none: use one GPU only\n");
|
||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||
printf(" - row: split rows across GPUs\n");
|
||||
printf(" -ts SPLIT, --tensor-split SPLIT\n");
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
if (llama_supports_gpu_offload()) {
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -ngld N, --n-gpu-layers-draft N\n");
|
||||
printf(" number of layers to store in VRAM for the draft model\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||
printf(" - none: use one GPU only\n");
|
||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||
printf(" - row: split rows across GPUs\n");
|
||||
printf(" -ts SPLIT, --tensor-split SPLIT\n");
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
}
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
printf(" -gan N, --grp-attn-n N\n");
|
||||
@@ -1520,7 +1520,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||
@@ -1649,7 +1651,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
|
||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||
|
||||
+33
-33
@@ -43,40 +43,40 @@ extern char const *LLAMA_BUILD_TARGET;
|
||||
int32_t get_num_physical_cores();
|
||||
|
||||
struct gpt_params {
|
||||
uint32_t seed = -1; // RNG seed
|
||||
uint32_t seed = -1; // RNG seed
|
||||
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_threads_draft = -1;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
float p_accept = 0.5f; // speculative decoding accept probability
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
|
||||
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
|
||||
// pinging @cebtenzzre
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_threads_draft = -1;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
float p_accept = 0.5f; // speculative decoding accept probability
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
|
||||
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
|
||||
// pinging @cebtenzzre
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
+6
-6
@@ -1363,12 +1363,12 @@ bool consume_common_train_arg(
|
||||
*invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params->n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
if (llama_supports_gpu_offload()) {
|
||||
params->n_gpu_layers = std::stoi(argv[i]);
|
||||
} else {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
}
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
params->print_usage = true;
|
||||
return true;
|
||||
|
||||
@@ -203,6 +203,8 @@ class Model:
|
||||
return CodeShellModel
|
||||
if model_architecture == "OrionForCausalLM":
|
||||
return OrionModel
|
||||
if model_architecture == "InternLM2ForCausalLM":
|
||||
return InternLM2Model
|
||||
return Model
|
||||
|
||||
def _is_model_safetensors(self) -> bool:
|
||||
@@ -254,6 +256,8 @@ class Model:
|
||||
return gguf.MODEL_ARCH.CODESHELL
|
||||
if arch == "OrionForCausalLM":
|
||||
return gguf.MODEL_ARCH.ORION
|
||||
if arch == "InternLM2ForCausalLM":
|
||||
return gguf.MODEL_ARCH.INTERNLM2
|
||||
|
||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||
|
||||
@@ -1344,6 +1348,154 @@ class CodeShellModel(Model):
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
|
||||
class InternLM2Model(Model):
|
||||
def set_vocab(self):
|
||||
# (TODO): Is there a better way?
|
||||
# Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
|
||||
# \x00 specially and convert it into an emoji character to prevent it from being mistakenly
|
||||
# recognized as an empty string in C++.
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
from sentencepiece import sentencepiece_model_pb2 as model
|
||||
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
sentencepiece_model = model.ModelProto()
|
||||
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
||||
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||
|
||||
for token_id in range(vocab_size):
|
||||
piece = tokenizer.id_to_piece(token_id)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(token_id)
|
||||
if text == b"\x00":
|
||||
# (TODO): fixme
|
||||
# Hack here and replace the \x00 characters.
|
||||
print(f"InternLM2 convert token '{text}' to '🐉'!")
|
||||
text = "🐉"
|
||||
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
if tokenizer.is_unknown(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
elif tokenizer.is_control(token_id):
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif tokenizer.is_unused(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.is_byte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
added_tokens_file = self.dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
added_tokens_json = json.load(f)
|
||||
|
||||
for key in added_tokens_json:
|
||||
tokens.append(key.encode("utf-8"))
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_add_space_prefix(add_prefix)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("InternLM2")
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
|
||||
def post_write_tensors(self, tensor_map, name, data_torch):
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
def write_tensors(self):
|
||||
from einops import rearrange
|
||||
|
||||
num_heads = self.hparams.get("num_attention_heads")
|
||||
num_kv_heads = self.hparams.get("num_key_value_heads")
|
||||
hidden_size = self.hparams.get("hidden_size")
|
||||
q_per_kv = num_heads // num_kv_heads
|
||||
head_dim = hidden_size // num_heads
|
||||
num_groups = num_heads // q_per_kv
|
||||
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
model_kv = dict(self.get_tensors())
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
|
||||
for name, data_torch in model_kv.items():
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
if re.match(qkv_pattern, name):
|
||||
bid = re.findall(qkv_pattern, name)[0]
|
||||
qkv = data_torch
|
||||
qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
|
||||
q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
|
||||
q = rearrange(q, " o g n i -> o (g n i)").T
|
||||
k = rearrange(k, " o g n i -> o (g n i)").T
|
||||
v = rearrange(v, " o g n i -> o (g n i)").T
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
|
||||
else:
|
||||
self.post_write_tensors(tensor_map, name, data_torch)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
|
||||
const std::vector<float> t_split(llama_max_devices(), 0.0f);
|
||||
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
model_params.tensor_split = t_split.data();
|
||||
|
||||
@@ -23,19 +23,23 @@ usage: ./llama-bench [options]
|
||||
|
||||
options:
|
||||
-h, --help
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-b, --batch-size <n> (default: 512)
|
||||
--memory-f32 <0|1> (default: 0)
|
||||
-t, --threads <n> (default: 16)
|
||||
-ngl N, --n-gpu-layers <n> (default: 99)
|
||||
-mg i, --main-gpu <i> (default: 0)
|
||||
-mmq, --mul-mat-q <0|1> (default: 1)
|
||||
-ts, --tensor_split <ts0/ts1/..>
|
||||
-r, --repetitions <n> (default: 5)
|
||||
-o, --output <csv|json|md|sql> (default: md)
|
||||
-v, --verbose (default: 0)
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-b, --batch-size <n> (default: 512)
|
||||
-ctk <t>, --cache-type-k <t> (default: f16)
|
||||
-ctv <t>, --cache-type-v <t> (default: f16)
|
||||
-t, --threads <n> (default: 112)
|
||||
-ngl, --n-gpu-layers <n> (default: 99)
|
||||
-sm, --split-mode <none|layer|row> (default: layer)
|
||||
-mg, --main-gpu <i> (default: 0)
|
||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||
-mmp, --mmap <0|1> (default: 1)
|
||||
-mmq, --mul-mat-q <0|1> (default: 1)
|
||||
-ts, --tensor_split <ts0/ts1/..> (default: 0)
|
||||
-r, --repetitions <n> (default: 5)
|
||||
-o, --output <csv|json|md|sql> (default: md)
|
||||
-v, --verbose (default: 0)
|
||||
|
||||
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
||||
```
|
||||
@@ -51,6 +55,10 @@ Each test is repeated the number of times given by `-r`, and the results are ave
|
||||
|
||||
For a description of the other options, see the [main example](../main/README.md).
|
||||
|
||||
Note:
|
||||
|
||||
- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`.
|
||||
|
||||
## Examples
|
||||
|
||||
### Text generation with different models
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml-sycl.h"
|
||||
|
||||
// utils
|
||||
static uint64_t get_time_ns() {
|
||||
@@ -120,6 +121,22 @@ static std::string get_gpu_info() {
|
||||
id += "/";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GGML_USE_SYCL
|
||||
int device_list[GGML_SYCL_MAX_DEVICES];
|
||||
ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
|
||||
|
||||
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
|
||||
if (device_list[i] >0 ){
|
||||
char buf[128];
|
||||
ggml_sycl_get_device_description(i, buf, sizeof(buf));
|
||||
id += buf;
|
||||
id += "/";
|
||||
}
|
||||
}
|
||||
if (id.length() >2 ) {
|
||||
id.pop_back();
|
||||
}
|
||||
#endif
|
||||
// TODO: other backends
|
||||
return id;
|
||||
@@ -160,7 +177,8 @@ struct cmd_params {
|
||||
std::vector<int> main_gpu;
|
||||
std::vector<bool> no_kv_offload;
|
||||
std::vector<bool> mul_mat_q;
|
||||
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<bool> use_mmap;
|
||||
int reps;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
@@ -179,7 +197,8 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* mul_mat_q */ {true},
|
||||
/* tensor_split */ {{}},
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN
|
||||
@@ -201,6 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
@@ -370,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
|
||||
} else if (arg == "-mmp" || arg == "--mmap") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ts" || arg == "--tensor-split") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -380,10 +407,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
const std::regex regex{R"([;/]+)"};
|
||||
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
||||
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
std::vector<float> tensor_split(llama_max_devices());
|
||||
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
tensor_split[i] = std::stof(split_arg[i]);
|
||||
} else {
|
||||
@@ -441,6 +468,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
||||
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||
|
||||
return params;
|
||||
@@ -459,7 +487,8 @@ struct cmd_params_instance {
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool mul_mat_q;
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
std::vector<float> tensor_split;
|
||||
bool use_mmap;
|
||||
|
||||
llama_model_params to_llama_mparams() const {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
@@ -468,6 +497,7 @@ struct cmd_params_instance {
|
||||
mparams.split_mode = split_mode;
|
||||
mparams.main_gpu = main_gpu;
|
||||
mparams.tensor_split = tensor_split.data();
|
||||
mparams.use_mmap = use_mmap;
|
||||
|
||||
return mparams;
|
||||
}
|
||||
@@ -477,6 +507,7 @@ struct cmd_params_instance {
|
||||
n_gpu_layers == other.n_gpu_layers &&
|
||||
split_mode == other.split_mode &&
|
||||
main_gpu == other.main_gpu &&
|
||||
use_mmap == other.use_mmap &&
|
||||
tensor_split == other.tensor_split;
|
||||
}
|
||||
|
||||
@@ -503,6 +534,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & sm : params.split_mode)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & mmp : params.use_mmap)
|
||||
for (const auto & nb : params.n_batch)
|
||||
for (const auto & tk : params.type_k)
|
||||
for (const auto & tv : params.type_v)
|
||||
@@ -527,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .use_mmap = */ mmp,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -549,6 +582,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .use_mmap = */ mmp,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -563,7 +597,9 @@ struct test {
|
||||
static const bool cuda;
|
||||
static const bool opencl;
|
||||
static const bool vulkan;
|
||||
static const bool kompute;
|
||||
static const bool metal;
|
||||
static const bool sycl;
|
||||
static const bool gpu_blas;
|
||||
static const bool blas;
|
||||
static const std::string cpu_info;
|
||||
@@ -581,7 +617,8 @@ struct test {
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool mul_mat_q;
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
std::vector<float> tensor_split;
|
||||
bool use_mmap;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
std::string test_time;
|
||||
@@ -604,6 +641,7 @@ struct test {
|
||||
no_kv_offload = inst.no_kv_offload;
|
||||
mul_mat_q = inst.mul_mat_q;
|
||||
tensor_split = inst.tensor_split;
|
||||
use_mmap = inst.use_mmap;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
// RFC 3339 date-time format
|
||||
@@ -647,28 +685,35 @@ struct test {
|
||||
if (vulkan) {
|
||||
return "Vulkan";
|
||||
}
|
||||
if (kompute) {
|
||||
return "Kompute";
|
||||
}
|
||||
if (metal) {
|
||||
return "Metal";
|
||||
}
|
||||
if (sycl) {
|
||||
return GGML_SYCL_NAME;
|
||||
}
|
||||
if (gpu_blas) {
|
||||
return "GPU BLAS";
|
||||
}
|
||||
if (blas) {
|
||||
return "BLAS";
|
||||
}
|
||||
|
||||
return "CPU";
|
||||
}
|
||||
|
||||
static const std::vector<std::string> & get_fields() {
|
||||
static const std::vector<std::string> fields = {
|
||||
"build_commit", "build_number",
|
||||
"cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
|
||||
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_threads", "type_k", "type_v",
|
||||
"n_gpu_layers", "split_mode",
|
||||
"main_gpu", "no_kv_offload",
|
||||
"mul_mat_q", "tensor_split",
|
||||
"mul_mat_q", "tensor_split", "use_mmap",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts"
|
||||
@@ -686,8 +731,9 @@ struct test {
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
|
||||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
|
||||
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "mul_mat_q" || field == "use_mmap") {
|
||||
return BOOL;
|
||||
}
|
||||
if (field == "avg_ts" || field == "stddev_ts") {
|
||||
@@ -699,7 +745,7 @@ struct test {
|
||||
std::vector<std::string> get_values() const {
|
||||
std::string tensor_split_str;
|
||||
int max_nonzero = 0;
|
||||
for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
|
||||
for (size_t i = 0; i < llama_max_devices(); i++) {
|
||||
if (tensor_split[i] > 0) {
|
||||
max_nonzero = i;
|
||||
}
|
||||
@@ -714,13 +760,14 @@ struct test {
|
||||
}
|
||||
std::vector<std::string> values = {
|
||||
build_commit, std::to_string(build_number),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
|
||||
std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||
std::to_string(main_gpu), std::to_string(no_kv_offload),
|
||||
std::to_string(mul_mat_q), tensor_split_str,
|
||||
std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
|
||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
||||
@@ -743,9 +790,11 @@ const int test::build_number = LLAMA_BUILD_NUMBER;
|
||||
const bool test::cuda = !!ggml_cpu_has_cublas();
|
||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
||||
const bool test::kompute = !!ggml_cpu_has_kompute();
|
||||
const bool test::metal = !!ggml_cpu_has_metal();
|
||||
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
||||
const bool test::blas = !!ggml_cpu_has_blas();
|
||||
const bool test::sycl = !!ggml_cpu_has_sycl();
|
||||
const std::string test::cpu_info = get_cpu_info();
|
||||
const std::string test::gpu_info = get_gpu_info();
|
||||
|
||||
@@ -888,6 +937,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "no_kv_offload") {
|
||||
return "nkvo";
|
||||
}
|
||||
if (field == "use_mmap") {
|
||||
return "mmap";
|
||||
}
|
||||
if (field == "tensor_split") {
|
||||
return "ts";
|
||||
}
|
||||
@@ -931,6 +983,9 @@ struct markdown_printer : public printer {
|
||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||
fields.push_back("tensor_split");
|
||||
}
|
||||
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
||||
fields.push_back("use_mmap");
|
||||
}
|
||||
fields.push_back("test");
|
||||
fields.push_back("t/s");
|
||||
|
||||
|
||||
@@ -111,17 +111,71 @@ llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 m
|
||||
llama_print_timings: total time = 34570.79 ms
|
||||
```
|
||||
|
||||
## Orin compile and run
|
||||
### compile
|
||||
```sh
|
||||
make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
||||
```
|
||||
|
||||
### run on Orin
|
||||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
./llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
--image /data/local/tmp/demo.jpeg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" \
|
||||
--n-gpu-layers 999
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
|
||||
encode_image_with_clip: image encoded in 296.62 ms by CLIP ( 2.06 ms per image patch)
|
||||
|
||||
Susan Wise Bauer
|
||||
|
||||
llama_print_timings: load time = 1067.64 ms
|
||||
llama_print_timings: sample time = 1.53 ms / 6 runs ( 0.25 ms per token, 3934.43 tokens per second)
|
||||
llama_print_timings: prompt eval time = 306.84 ms / 246 tokens ( 1.25 ms per token, 801.72 tokens per second)
|
||||
llama_print_timings: eval time = 91.50 ms / 6 runs ( 15.25 ms per token, 65.58 tokens per second)
|
||||
llama_print_timings: total time = 1352.63 ms / 252 tokens
|
||||
```
|
||||
|
||||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
./llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
|
||||
--n-gpu-layers 999
|
||||
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 302.15 ms by CLIP ( 2.10 ms per image patch)
|
||||
|
||||
The image features a cat lying in the grass.
|
||||
|
||||
llama_print_timings: load time = 1057.07 ms
|
||||
llama_print_timings: sample time = 3.27 ms / 11 runs ( 0.30 ms per token, 3360.83 tokens per second)
|
||||
llama_print_timings: prompt eval time = 213.60 ms / 232 tokens ( 0.92 ms per token, 1086.14 tokens per second)
|
||||
llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 ms per token, 66.01 tokens per second)
|
||||
llama_print_timings: total time = 1365.47 ms / 243 tokens
|
||||
```
|
||||
|
||||
## Minor shortcomings
|
||||
The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
|
||||
|
||||
## TODO
|
||||
|
||||
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
|
||||
- [x] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
|
||||
- [ ] Optimize LDP projector performance
|
||||
|
||||
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
|
||||
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
|
||||
- [ ] run MobileVLM on `Jetson Orin`
|
||||
- [x] run MobileVLM on `Jetson Orin`
|
||||
- [ ] Support more model variants, such as `MobileVLM-3B`.
|
||||
|
||||
|
||||
|
||||
@@ -48,6 +48,7 @@ chat_completion() {
|
||||
top_p: 0.9,
|
||||
n_keep: $n_keep,
|
||||
n_predict: 256,
|
||||
cache_prompt: true,
|
||||
stop: ["\n### Human:"],
|
||||
stream: true
|
||||
}')"
|
||||
|
||||
+73
-64
@@ -185,7 +185,7 @@ struct llama_client_slot
|
||||
llama_sampling_context *ctx_sampling = nullptr;
|
||||
|
||||
int32_t ga_i = 0; // group-attention state
|
||||
int32_t ga_n = 1;// group-attention factor
|
||||
int32_t ga_n = 1; // group-attention factor
|
||||
int32_t ga_w = 512; // group-attention width
|
||||
|
||||
int32_t n_past_se = 0; // self-extend
|
||||
@@ -219,7 +219,8 @@ struct llama_client_slot
|
||||
sent_token_probs_index = 0;
|
||||
infill = false;
|
||||
ga_i = 0;
|
||||
n_past_se = 0;
|
||||
n_past_se = 0;
|
||||
|
||||
generated_token_probs.clear();
|
||||
|
||||
for (slot_image & img : images)
|
||||
@@ -1227,7 +1228,7 @@ struct llama_server_context
|
||||
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
|
||||
for (int i = 0; i < (int) append_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true);
|
||||
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
}
|
||||
@@ -1295,6 +1296,8 @@ struct llama_server_context
|
||||
for (llama_client_slot &slot : slots)
|
||||
{
|
||||
slot.cache_tokens.clear();
|
||||
slot.n_past = 0;
|
||||
slot.n_past_se = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1364,26 +1367,26 @@ struct llama_server_context
|
||||
kv_cache_clear();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
task_server task;
|
||||
task.type = TASK_TYPE_NEXT_RESPONSE;
|
||||
task.target_id = -1;
|
||||
queue_tasks.post(task);
|
||||
}
|
||||
|
||||
task_server task;
|
||||
task.type = TASK_TYPE_NEXT_RESPONSE;
|
||||
task.target_id = -1;
|
||||
queue_tasks.post(task);
|
||||
|
||||
for (llama_client_slot &slot : slots)
|
||||
{
|
||||
if (slot.ga_n == 1)
|
||||
{
|
||||
if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
{
|
||||
// Shift context
|
||||
const int n_left = slot.n_past - slot.params.n_keep - 1;
|
||||
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
|
||||
const int n_discard = n_left / 2;
|
||||
|
||||
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
@@ -1429,8 +1432,10 @@ struct llama_server_context
|
||||
slot.i_batch = batch.n_tokens;
|
||||
|
||||
const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
|
||||
// TODO: we always have to take into account the "system_tokens"
|
||||
// this is not great and needs to be improved somehow
|
||||
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
|
||||
@@ -1481,8 +1486,8 @@ struct llama_server_context
|
||||
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.push_back(llama_token_middle(model));
|
||||
prompt_tokens = prefix_tokens;
|
||||
}
|
||||
@@ -1582,8 +1587,8 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", slot.n_past},
|
||||
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
|
||||
{"n_past", slot.n_past},
|
||||
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
|
||||
{"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
|
||||
});
|
||||
|
||||
@@ -1591,10 +1596,13 @@ struct llama_server_context
|
||||
|
||||
// process the prefix of first image
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||
|
||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
int ga_i = slot.ga_i;
|
||||
|
||||
int32_t ga_i = slot.ga_i;
|
||||
int32_t ga_n = slot.ga_n;
|
||||
int32_t ga_w = slot.ga_w;
|
||||
|
||||
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
||||
{
|
||||
if (slot.ga_n != 1)
|
||||
@@ -1606,7 +1614,7 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
|
||||
slot_npast += 1;
|
||||
slot_npast++;
|
||||
}
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
@@ -1666,6 +1674,7 @@ struct llama_server_context
|
||||
slot.n_past_se += n_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch batch_view =
|
||||
{
|
||||
n_tokens,
|
||||
@@ -1780,53 +1789,53 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
if (llama_supports_mlock())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported())
|
||||
if (llama_supports_mmap())
|
||||
{
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
if (llama_supports_gpu_offload()) {
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||
printf(" - none: use one GPU only\n");
|
||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||
printf(" - row: split rows across GPUs\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||
printf(" - none: use one GPU only\n");
|
||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||
printf(" - row: split rows across GPUs\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row)\n");
|
||||
#endif
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
printf(" -a ALIAS, --alias ALIAS\n");
|
||||
printf(" set an alias for the model, will be added as `model` field in completion response\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
|
||||
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
||||
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
|
||||
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
||||
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
|
||||
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf(" set an alias for the model, will be added as `model` field in completion response\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
|
||||
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
||||
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
|
||||
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
||||
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
|
||||
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf("\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||
printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@@ -2057,13 +2066,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
||||
if (llama_supports_gpu_offload()) {
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
} else {
|
||||
LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
||||
"See main README.md for information on enabling GPU BLAS support",
|
||||
{{"n_gpu_layers", params.n_gpu_layers}});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if (arg == "--split-mode" || arg == "-sm")
|
||||
{
|
||||
@@ -2106,9 +2115,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
||||
|
||||
for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
|
||||
for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
|
||||
{
|
||||
if (i_device < split_arg.size())
|
||||
{
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
/*MIT license
|
||||
Copyright (C) 2024 Intel Corporation
|
||||
SPDX-License-Identifier: MIT
|
||||
*/
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
|
||||
:: MIT license
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
mkdir -p build
|
||||
cd build
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
|
||||
:: build example/main only
|
||||
:: make main
|
||||
|
||||
:: build all binary
|
||||
make -j
|
||||
cd ..
|
||||
@@ -0,0 +1,13 @@
|
||||
:: MIT license
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
|
||||
set GGML_SYCL_DEVICE=0
|
||||
rem set GGML_SYCL_DEBUG=1
|
||||
.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
||||
+279
-67
@@ -524,6 +524,8 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
|
||||
#define CUDA_SILU_BLOCK_SIZE 256
|
||||
#define CUDA_TANH_BLOCK_SIZE 256
|
||||
#define CUDA_RELU_BLOCK_SIZE 256
|
||||
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
|
||||
#define CUDA_HARDSWISH_BLOCK_SIZE 256
|
||||
#define CUDA_SQR_BLOCK_SIZE 256
|
||||
#define CUDA_CPY_BLOCK_SIZE 32
|
||||
#define CUDA_SCALE_BLOCK_SIZE 256
|
||||
@@ -540,6 +542,7 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
|
||||
#define CUDA_PAD_BLOCK_SIZE 256
|
||||
#define CUDA_ACC_BLOCK_SIZE 256
|
||||
#define CUDA_IM2COL_BLOCK_SIZE 256
|
||||
#define CUDA_POOL2D_BLOCK_SIZE 256
|
||||
|
||||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
@@ -823,6 +826,24 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
|
||||
dst[i] = fmaxf(x[i], 0);
|
||||
}
|
||||
|
||||
static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
if (i >= k) {
|
||||
@@ -5511,27 +5532,37 @@ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
const half * xi = (const half *) cxi;
|
||||
float * dsti = (float *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// then combine those indices with the corresponding byte offsets to get the total offsets
|
||||
const int i02 = i / (ne00*ne01);
|
||||
const int i01 = (i - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i12 = i / (ne10*ne11);
|
||||
const int i11 = (i - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
@@ -5625,23 +5656,26 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i02 = i / (ne00*ne01);
|
||||
const int i01 = (i - i02*ne01*ne00) / ne00;
|
||||
const int i00 = (i - i02*ne01*ne00 - i01*ne00);
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i12 = i / (ne10*ne11);
|
||||
const int i11 = (i - i12*ne10*ne11) / ne10;
|
||||
const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
@@ -5810,7 +5844,7 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
|
||||
}
|
||||
|
||||
static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
|
||||
const int row = blockIdx.y;
|
||||
const int row = blockIdx.x;
|
||||
const int col = threadIdx.x;
|
||||
|
||||
float sum = 0.0f;
|
||||
@@ -6132,9 +6166,10 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
|
||||
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
||||
}
|
||||
|
||||
static __global__ void im2col_f32_f16(
|
||||
const float * x, half * dst,
|
||||
int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
|
||||
template <typename T>
|
||||
static __global__ void im2col_kernel(
|
||||
const float * x, T * dst, int batch_offset,
|
||||
int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
|
||||
int s0, int s1, int p0, int p1, int d0, int d1) {
|
||||
const int i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (i >= pelements) {
|
||||
@@ -6147,21 +6182,73 @@ static __global__ void im2col_f32_f16(
|
||||
const int ky = (i - kd) / OW;
|
||||
const int ix = i % OW;
|
||||
|
||||
const int oh = blockIdx.y;
|
||||
const int batch = blockIdx.z / IC;
|
||||
const int ic = blockIdx.z % IC;
|
||||
|
||||
const int64_t iiw = ix * s0 + kx * d0 - p0;
|
||||
const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
|
||||
const int64_t iih = oh * s1 + ky * d1 - p1;
|
||||
|
||||
const int64_t offset_dst =
|
||||
(blockIdx.y * OW + ix) * CHW +
|
||||
(blockIdx.z * (KW * KH) + ky * KW + kx);
|
||||
((batch * OH + oh) * OW + ix) * CHW +
|
||||
(ic * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = __float2half(0.0f);
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int64_t offset_src = blockIdx.z * offset_delta;
|
||||
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
|
||||
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
|
||||
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Ti, typename To>
|
||||
static __global__ void pool2d_nchw_kernel(
|
||||
const int ih, const int iw, const int oh, const int ow,
|
||||
const int kh, const int kw, const int sh, const int sw,
|
||||
const int ph, const int pw, const int parallel_elements,
|
||||
const Ti* src, To* dst, const enum ggml_op_pool op) {
|
||||
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx >= parallel_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int I_HW = ih * iw;
|
||||
const int O_HW = oh * ow;
|
||||
const int nc = idx / O_HW;
|
||||
const int cur_oh = idx % O_HW / ow;
|
||||
const int cur_ow = idx % O_HW % ow;
|
||||
const Ti* i_ptr = src + nc * I_HW;
|
||||
To* o_ptr = dst + nc * O_HW;
|
||||
const int start_h = cur_oh * sh - ph;
|
||||
const int bh = max(0, start_h);
|
||||
const int eh = min(ih, start_h + kh);
|
||||
const int start_w = cur_ow * sw - pw;
|
||||
const int bw = max(0, start_w);
|
||||
const int ew = min(iw, start_w + kw);
|
||||
const To scale = 1. / (kh * kw);
|
||||
To res = 0;
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res = 0; break;
|
||||
case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
|
||||
}
|
||||
|
||||
for (int i = bh; i < eh; i += 1) {
|
||||
for (int j = bw; j < ew; j += 1) {
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
Ti cur = __ldg(i_ptr + i * iw + j);
|
||||
#else
|
||||
Ti cur = i_ptr[i * iw + j];
|
||||
#endif
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res += cur * scale; break;
|
||||
case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
o_ptr[cur_oh * ow + cur_ow] = res;
|
||||
}
|
||||
|
||||
template<int qk, int qr, dequantize_kernel_t dq>
|
||||
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
||||
@@ -6375,6 +6462,16 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
||||
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
|
||||
hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
|
||||
hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
||||
leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
|
||||
@@ -7308,69 +7405,82 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
|
||||
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
|
||||
}
|
||||
|
||||
|
||||
static void ggml_cpy_f16_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int num_blocks = ne / QK8_0;
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int num_blocks = ne / QK4_0;
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_1_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int num_blocks = ne / QK4_1;
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
|
||||
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
|
||||
@@ -7449,7 +7559,7 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
|
||||
|
||||
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums(1, nrows, 1);
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
}
|
||||
|
||||
@@ -7561,14 +7671,15 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
|
||||
}
|
||||
}
|
||||
|
||||
static void im2col_f32_f16_cuda(const float* x, half* dst,
|
||||
template <typename T>
|
||||
static void im2col_cuda(const float* x, T* dst,
|
||||
int IW, int IH, int OW, int OH, int KW, int KH, int IC,
|
||||
int offset_delta,
|
||||
int batch, int batch_offset, int offset_delta,
|
||||
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
||||
const int parallel_elements = OW * KW * KH;
|
||||
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
||||
dim3 block_nums(num_blocks, OH, IC);
|
||||
im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
|
||||
dim3 block_nums(num_blocks, OH, batch * IC);
|
||||
im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
|
||||
}
|
||||
|
||||
// buffer pool for cuda
|
||||
@@ -8153,6 +8264,34 @@ static void ggml_cuda_op_relu(
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_hardsigmoid(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_hardswish(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_leaky_relu(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
@@ -8518,9 +8657,9 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||
|
||||
if (src1_convert_f16) {
|
||||
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
||||
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
||||
ne00, 1, sizeof(float), 0, 0,
|
||||
ne00, 1, sizeof(half), 0, 0, stream);
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
|
||||
}
|
||||
#else
|
||||
const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
|
||||
@@ -8784,13 +8923,46 @@ static void ggml_cuda_op_alibi(
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_pool2d(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t * opts = (const int32_t *)dst->op_params;
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
|
||||
const int k0 = opts[1];
|
||||
const int k1 = opts[2];
|
||||
const int s0 = opts[3];
|
||||
const int s1 = opts[4];
|
||||
const int p0 = opts[5];
|
||||
const int p1 = opts[6];
|
||||
|
||||
const int64_t IH = src0->ne[1];
|
||||
const int64_t IW = src0->ne[0];
|
||||
|
||||
const int64_t N = dst->ne[3];
|
||||
const int64_t OC = dst->ne[2];
|
||||
const int64_t OH = dst->ne[1];
|
||||
const int64_t OW = dst->ne[0];
|
||||
|
||||
const int parallel_elements = N * OC * OH * OW;
|
||||
const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
|
||||
dim3 block_nums(num_blocks);
|
||||
pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
|
||||
|
||||
(void) src1;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_im2col(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
@@ -8812,8 +8984,14 @@ static void ggml_cuda_op_im2col(
|
||||
const int64_t OW = dst->ne[1];
|
||||
|
||||
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
||||
const int64_t batch = src1->ne[3];
|
||||
const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
|
||||
|
||||
im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
if(dst->type == GGML_TYPE_F16) {
|
||||
im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
} else {
|
||||
im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
}
|
||||
|
||||
(void) src0;
|
||||
(void) src0_dd;
|
||||
@@ -9409,6 +9587,13 @@ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, g
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
|
||||
}
|
||||
|
||||
static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
|
||||
}
|
||||
|
||||
static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
|
||||
}
|
||||
static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
|
||||
}
|
||||
@@ -10119,19 +10304,25 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
GGML_ASSERT(src0->ne[3] == 1);
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
//GGML_ASSERT(src0->ne[3] == 1);
|
||||
|
||||
const int64_t nb00 = src0->nb[0];
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
const int64_t nb03 = src0->nb[3];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
GGML_ASSERT(src1->ne[3] == 1);
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
//GGML_ASSERT(src1->ne[3] == 1);
|
||||
|
||||
const int64_t nb10 = src1->nb[0];
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb12 = src1->nb[2];
|
||||
const int64_t nb13 = src1->nb[3];
|
||||
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
@@ -10143,17 +10334,19 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
|
||||
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
@@ -10186,6 +10379,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
|
||||
}
|
||||
|
||||
static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
|
||||
}
|
||||
|
||||
static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
|
||||
}
|
||||
@@ -10287,6 +10484,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
|
||||
case GGML_UNARY_OP_RELU:
|
||||
func = ggml_cuda_relu;
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
func = ggml_cuda_hardsigmoid;
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
func = ggml_cuda_hardswish;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -10361,6 +10564,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
|
||||
case GGML_OP_IM2COL:
|
||||
func = ggml_cuda_im2col;
|
||||
break;
|
||||
case GGML_OP_POOL_2D:
|
||||
func = ggml_cuda_pool2d;
|
||||
break;
|
||||
case GGML_OP_SUM_ROWS:
|
||||
func = ggml_cuda_sum_rows;
|
||||
break;
|
||||
@@ -11089,6 +11295,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
return true;
|
||||
@@ -11156,6 +11364,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
@@ -11184,6 +11395,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ALIBI:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
|
||||
@@ -57,6 +57,9 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(voi
|
||||
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
||||
|
||||
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
||||
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
+46
-9
@@ -135,6 +135,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ALIBI_F32,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F16,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||
@@ -168,6 +169,8 @@ struct ggml_metal_context {
|
||||
|
||||
bool support_simdgroup_reduction;
|
||||
bool support_simdgroup_mm;
|
||||
|
||||
bool should_capture_next_compute;
|
||||
};
|
||||
|
||||
// MSL code
|
||||
@@ -354,6 +357,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
|
||||
ctx->should_capture_next_compute = false;
|
||||
|
||||
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
||||
@@ -502,6 +507,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||
@@ -626,6 +632,10 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
case GGML_OP_ALIBI:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
case GGML_OP_POOL_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
return false;
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ARGSORT:
|
||||
@@ -687,6 +697,20 @@ static bool ggml_metal_graph_compute(
|
||||
const int n_cb = ctx->n_cb;
|
||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
const bool should_capture = ctx->should_capture_next_compute;
|
||||
if (should_capture) {
|
||||
ctx->should_capture_next_compute = false;
|
||||
|
||||
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
|
||||
descriptor.captureObject = ctx->queue;
|
||||
|
||||
NSError * error = nil;
|
||||
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
|
||||
GGML_ASSERT(!"capture failed");
|
||||
}
|
||||
}
|
||||
|
||||
id<MTLCommandBuffer> command_buffer_builder[n_cb];
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
@@ -695,6 +719,7 @@ static bool ggml_metal_graph_compute(
|
||||
// enqueue the command buffers in order to specify their execution order
|
||||
[command_buffer enqueue];
|
||||
}
|
||||
|
||||
const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
|
||||
|
||||
dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
|
||||
@@ -741,9 +766,9 @@ static bool ggml_metal_graph_compute(
|
||||
GGML_ASSERT(!"unsupported op");
|
||||
}
|
||||
|
||||
#ifndef GGML_METAL_NDEBUG
|
||||
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
|
||||
#endif
|
||||
if (should_capture) {
|
||||
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
|
||||
}
|
||||
|
||||
const int64_t ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int64_t ne01 = src0 ? src0->ne[1] : 0;
|
||||
@@ -1996,7 +2021,7 @@ static bool ggml_metal_graph_compute(
|
||||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
|
||||
@@ -2004,6 +2029,7 @@ static bool ggml_metal_graph_compute(
|
||||
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
|
||||
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
|
||||
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
|
||||
|
||||
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
|
||||
|
||||
const int32_t N = src1->ne[is_2D ? 3 : 2];
|
||||
@@ -2024,8 +2050,8 @@ static bool ggml_metal_graph_compute(
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
|
||||
default: GGML_ASSERT(false);
|
||||
};
|
||||
@@ -2218,9 +2244,9 @@ static bool ggml_metal_graph_compute(
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef GGML_METAL_NDEBUG
|
||||
[encoder popDebugGroup];
|
||||
#endif
|
||||
if (should_capture) {
|
||||
[encoder popDebugGroup];
|
||||
}
|
||||
}
|
||||
|
||||
[encoder endEncoding];
|
||||
@@ -2242,6 +2268,10 @@ static bool ggml_metal_graph_compute(
|
||||
}
|
||||
}
|
||||
|
||||
if (should_capture) {
|
||||
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2613,6 +2643,13 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
||||
}
|
||||
|
||||
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
ctx->should_capture_next_compute = true;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
|
||||
|
||||
+59
-34
@@ -1775,9 +1775,29 @@ kernel void kernel_rope(
|
||||
template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
|
||||
template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
|
||||
|
||||
kernel void kernel_im2col_f16(
|
||||
typedef void (im2col_t)(
|
||||
device const float * x,
|
||||
device half * dst,
|
||||
device char * dst,
|
||||
constant int32_t & ofs0,
|
||||
constant int32_t & ofs1,
|
||||
constant int32_t & IW,
|
||||
constant int32_t & IH,
|
||||
constant int32_t & CHW,
|
||||
constant int32_t & s0,
|
||||
constant int32_t & s1,
|
||||
constant int32_t & p0,
|
||||
constant int32_t & p1,
|
||||
constant int32_t & d0,
|
||||
constant int32_t & d1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_im2col(
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
constant int32_t & ofs0,
|
||||
constant int32_t & ofs1,
|
||||
constant int32_t & IW,
|
||||
@@ -1800,14 +1820,19 @@ kernel void kernel_im2col_f16(
|
||||
(tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
|
||||
(tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
|
||||
|
||||
device T * pdst = (device T *) (dst);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
pdst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
|
||||
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||
pdst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
|
||||
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
|
||||
|
||||
kernel void kernel_upscale_f32(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
@@ -3688,38 +3713,38 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
|
||||
};
|
||||
|
||||
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3c, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043c1c, 0x04043c2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3c04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3c,
|
||||
0x04142c0c, 0x04142c3c, 0x04143c2c, 0x041c040c, 0x041c043c, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3c04, 0x04240c1c, 0x04241c3c, 0x04242424, 0x04242c3c, 0x04243c1c, 0x04243c2c, 0x042c040c,
|
||||
0x042c043c, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043c0c04, 0x043c0c24, 0x043c0c34,
|
||||
0x043c241c, 0x043c340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243c, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
||||
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
||||
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
||||
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
||||
0x0c143c14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3c0c, 0x0c34042c, 0x0c3c1414,
|
||||
0x0c3c2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
||||
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
||||
0x140c1c04, 0x140c341c, 0x140c343c, 0x140c3c04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3c,
|
||||
0x14141404, 0x14141414, 0x14141c3c, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3c04, 0x141c3c24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143c, 0x142c240c, 0x142c3c24,
|
||||
0x143c040c, 0x143c041c, 0x143c0c34, 0x143c242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043c14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143c14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243c, 0x1c243c14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3c1c1c, 0x1c3c3404, 0x24040424, 0x24040c3c,
|
||||
0x24041c2c, 0x24041c3c, 0x24042c1c, 0x24042c3c, 0x240c3c24, 0x24141404, 0x24141c3c, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043c, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3c04, 0x243c042c, 0x243c0c04, 0x243c0c14, 0x243c1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043c04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143c14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143c, 0x2c243c14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3c1424, 0x2c3c2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3c, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343c041c, 0x343c140c, 0x3c04041c, 0x3c04042c, 0x3c04043c, 0x3c040c04, 0x3c041c14,
|
||||
0x3c042c14, 0x3c0c1434, 0x3c0c2404, 0x3c140c14, 0x3c14242c, 0x3c142c14, 0x3c1c0404, 0x3c1c0c2c,
|
||||
0x3c1c1c1c, 0x3c1c3404, 0x3c24140c, 0x3c24240c, 0x3c2c0404, 0x3c2c0414, 0x3c2c1424, 0x3c341c04,
|
||||
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
||||
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
||||
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
||||
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
||||
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
|
||||
|
||||
+65
-10
@@ -1,7 +1,14 @@
|
||||
/*MIT license
|
||||
Copyright (C) 2024 Intel Corporation
|
||||
SPDX-License-Identifier: MIT
|
||||
*/
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
@@ -330,6 +337,7 @@ namespace dpct
|
||||
}
|
||||
size_t get_global_mem_size() const { return _global_mem_size; }
|
||||
size_t get_local_mem_size() const { return _local_mem_size; }
|
||||
size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; }
|
||||
/// Returns the maximum clock rate of device's global memory in kHz. If
|
||||
/// compiler does not support this API then returns default value 3200000 kHz.
|
||||
unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
|
||||
@@ -391,6 +399,10 @@ namespace dpct
|
||||
{
|
||||
_local_mem_size = local_mem_size;
|
||||
}
|
||||
void set_max_mem_alloc_size(size_t max_mem_alloc_size)
|
||||
{
|
||||
_max_mem_alloc_size = max_mem_alloc_size;
|
||||
}
|
||||
void set_max_work_group_size(int max_work_group_size)
|
||||
{
|
||||
_max_work_group_size = max_work_group_size;
|
||||
@@ -458,6 +470,7 @@ namespace dpct
|
||||
int _max_register_size_per_work_group;
|
||||
size_t _global_mem_size;
|
||||
size_t _local_mem_size;
|
||||
size_t _max_mem_alloc_size;
|
||||
size_t _max_nd_range_size[3];
|
||||
int _max_nd_range_size_i[3];
|
||||
uint32_t _device_id;
|
||||
@@ -509,6 +522,7 @@ namespace dpct
|
||||
dev.get_info<sycl::info::device::max_work_group_size>());
|
||||
prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
|
||||
prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
|
||||
prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>());
|
||||
|
||||
#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
|
||||
if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
|
||||
@@ -637,6 +651,11 @@ namespace dpct
|
||||
return get_device_info().get_global_mem_size();
|
||||
}
|
||||
|
||||
size_t get_max_mem_alloc_size() const
|
||||
{
|
||||
return get_device_info().get_max_mem_alloc_size();
|
||||
}
|
||||
|
||||
/// Get the number of bytes of free and total memory on the SYCL device.
|
||||
/// \param [out] free_memory The number of bytes of free memory on the SYCL device.
|
||||
/// \param [out] total_memory The number of bytes of total memory on the SYCL device.
|
||||
@@ -2921,7 +2940,6 @@ void ggml_sycl_set_main_device(int main_device);
|
||||
void ggml_sycl_set_mul_mat_q(bool mul_mat_q);
|
||||
void ggml_sycl_set_scratch_size(size_t scratch_size);
|
||||
void ggml_sycl_free_scratch(void);
|
||||
int ggml_sycl_get_device_count(void);
|
||||
void ggml_sycl_get_device_description(int device, char * description, size_t description_size);
|
||||
bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||
int ggml_backend_sycl_get_device(ggml_backend_t backend);
|
||||
@@ -11305,10 +11323,10 @@ void ggml_init_sycl() try {
|
||||
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
||||
int64_t total_vram = 0;
|
||||
|
||||
#if defined(GGML_SYCL_FP16)
|
||||
fprintf(stderr, "%s: GGML_SYCL_FP16: yes\n", __func__);
|
||||
#if defined(GGML_SYCL_F16)
|
||||
fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: GGML_SYCL_FP16: no\n", __func__);
|
||||
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -14486,6 +14504,37 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
||||
int max_compute_units = -1;
|
||||
for(int i=0;i<max_len;i++) id_list[i] = 0;
|
||||
|
||||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
|
||||
for(int id=0; id< device_count; id++){
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
if (!device.is_gpu()) continue;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if(max_compute_units < prop.get_max_compute_units()) max_compute_units = prop.get_max_compute_units();
|
||||
}
|
||||
|
||||
for(int id=0;id< device_count;id++){
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
if (!device.is_gpu()) continue;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if(max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1 ){
|
||||
id_list[id] = 1;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
int ggml_sycl_get_device_count() try {
|
||||
int device_count;
|
||||
if (CHECK_TRY_ERROR(device_count =
|
||||
@@ -14500,7 +14549,7 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
void ggml_sycl_get_device_description(int device, char *description,
|
||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
|
||||
size_t description_size) try {
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
@@ -14751,6 +14800,12 @@ static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_ty
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
return dpct::get_current_device().get_max_mem_alloc_size();
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
int64_t row_low = 0;
|
||||
int64_t row_high = ggml_nrows(tensor);
|
||||
@@ -14781,7 +14836,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_sycl_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
|
||||
/* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
|
||||
/* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
|
||||
/* .is_host = */ nullptr,
|
||||
|
||||
+7
-5
@@ -1,7 +1,8 @@
|
||||
/*MIT license
|
||||
Copyright (C) 2024 Intel Corporation
|
||||
SPDX-License-Identifier: MIT
|
||||
*/
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -21,7 +22,8 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||
|
||||
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
|
||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
+759
-1325
File diff suppressed because it is too large
Load Diff
+23
-8
@@ -817,7 +817,7 @@ static void ggml_vk_load_shaders() {
|
||||
// mulmat
|
||||
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 8, 32, 32, 2, 2, 2, vk_device.subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, vk_device.subgroup_size };
|
||||
|
||||
std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
|
||||
std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
|
||||
@@ -2873,8 +2873,12 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
||||
if (op == GGML_OP_CPY) {
|
||||
GGML_ASSERT(!transfer_src0);
|
||||
GGML_ASSERT(!transfer_src1);
|
||||
d_sz = dst->ne[1] * dst->nb[1];
|
||||
x_sz = ggml_nbytes(src0);
|
||||
d_sz = ggml_nbytes(dst);
|
||||
|
||||
if (extra_src0->offset + x_sz >= d_X->size) {
|
||||
x_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
if (extra->offset + d_sz >= d_D->size) {
|
||||
d_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
@@ -2910,12 +2914,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
||||
break;
|
||||
}
|
||||
|
||||
x_sz *= ne02 * ne03;
|
||||
if (y_sz != VK_WHOLE_SIZE) {
|
||||
y_sz *= ne12 * ne13;
|
||||
}
|
||||
if (op != GGML_OP_CPY) {
|
||||
d_sz *= ne02 * ne03;
|
||||
if (x_sz != VK_WHOLE_SIZE) {
|
||||
x_sz *= ne02 * ne03;
|
||||
}
|
||||
if (y_sz != VK_WHOLE_SIZE) {
|
||||
y_sz *= ne12 * ne13;
|
||||
}
|
||||
if (d_sz != VK_WHOLE_SIZE) {
|
||||
d_sz *= ne02 * ne03;
|
||||
}
|
||||
}
|
||||
|
||||
if (!use_src1 && op == GGML_OP_SOFT_MAX) {
|
||||
@@ -4556,8 +4564,15 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
||||
}
|
||||
ggml_vk_preallocate_buffers();
|
||||
|
||||
int last_node = cgraph->n_nodes - 1;
|
||||
|
||||
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
||||
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
|
||||
last_node -= 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
||||
ggml_vk_build_graph(cgraph->nodes[i], i == last_node);
|
||||
}
|
||||
|
||||
ggml_compute_params params = {};
|
||||
|
||||
@@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||
break;
|
||||
}
|
||||
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
|
||||
GGML_ASSERT(false);
|
||||
return NULL;
|
||||
}
|
||||
return aligned_memory;
|
||||
@@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
inline static void * ggml_malloc(size_t size) {
|
||||
if (size == 0) {
|
||||
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * result = malloc(size);
|
||||
if (result == NULL) {
|
||||
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// calloc
|
||||
inline static void * ggml_calloc(size_t num, size_t size) {
|
||||
if (num == 0 || size == 0) {
|
||||
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * result = calloc(num, size);
|
||||
if (result == NULL) {
|
||||
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#define GGML_MALLOC(size) ggml_malloc(size)
|
||||
#define GGML_CALLOC(num, size) ggml_calloc(num, size)
|
||||
|
||||
#define GGML_FREE(ptr) free(ptr)
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
||||
|
||||
@@ -5316,7 +5349,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
int s0,
|
||||
int p0,
|
||||
int d0) {
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
@@ -5394,16 +5427,15 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
|
||||
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
|
||||
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
|
||||
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
||||
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
||||
s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
|
||||
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
||||
|
||||
new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
||||
struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
|
||||
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
|
||||
|
||||
return result;
|
||||
@@ -5424,7 +5456,8 @@ struct ggml_tensor * ggml_im2col(
|
||||
int p1,
|
||||
int d0,
|
||||
int d1,
|
||||
bool is_2D) {
|
||||
bool is_2D,
|
||||
enum ggml_type dst_type) {
|
||||
|
||||
if(is_2D) {
|
||||
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
||||
@@ -5448,7 +5481,7 @@ struct ggml_tensor * ggml_im2col(
|
||||
is_2D ? b->ne[3] : 1,
|
||||
};
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
|
||||
int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
@@ -5473,7 +5506,7 @@ struct ggml_tensor * ggml_conv_2d(
|
||||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
@@ -5599,12 +5632,13 @@ struct ggml_tensor * ggml_pool_2d(
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result;
|
||||
const int64_t ne[3] = {
|
||||
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
|
||||
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
|
||||
a->ne[2],
|
||||
};
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
||||
result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
||||
|
||||
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
@@ -5612,7 +5646,6 @@ struct ggml_tensor * ggml_pool_2d(
|
||||
result->op = GGML_OP_POOL_2D;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -12460,6 +12493,92 @@ static void ggml_compute_forward_conv_transpose_1d(
|
||||
}
|
||||
}
|
||||
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
// src1: image [N, IC, IH, IW]
|
||||
// dst: result [N, OH, OW, IC*KH*KW]
|
||||
static void ggml_compute_forward_im2col_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
|
||||
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
|
||||
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
|
||||
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
|
||||
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
|
||||
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t N = is_2D ? ne13 : ne12;
|
||||
const int64_t IC = is_2D ? ne12 : ne11;
|
||||
const int64_t IH = is_2D ? ne11 : 1;
|
||||
const int64_t IW = ne10;
|
||||
|
||||
const int64_t KH = is_2D ? ne01 : 1;
|
||||
const int64_t KW = ne00;
|
||||
|
||||
const int64_t OH = is_2D ? ne2 : 1;
|
||||
const int64_t OW = ne1;
|
||||
|
||||
int ofs0 = is_2D ? nb13 : nb12;
|
||||
int ofs1 = is_2D ? nb12 : nb11;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
{
|
||||
float * const wdata = (float *) dst->data;
|
||||
|
||||
for (int64_t in = 0; in < N; in++) {
|
||||
for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
|
||||
for (int64_t iow = 0; iow < OW; iow++) {
|
||||
for (int64_t iic = ith; iic < IC; iic += nth) {
|
||||
|
||||
// micro kernel
|
||||
float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
||||
const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
|
||||
|
||||
for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
|
||||
for (int64_t ikw = 0; ikw < KW; ikw++) {
|
||||
const int64_t iiw = iow*s0 + ikw*d0 - p0;
|
||||
const int64_t iih = ioh*s1 + ikh*d1 - p1;
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
|
||||
} else {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
// src1: image [N, IC, IH, IW]
|
||||
// dst: result [N, OH, OW, IC*KH*KW]
|
||||
@@ -12550,14 +12669,14 @@ static void ggml_compute_forward_im2col(
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_im2col_f16(params, src0, src1, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
ggml_compute_forward_im2col_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -12748,8 +12867,8 @@ static void ggml_compute_forward_pool_2d(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src,
|
||||
struct ggml_tensor * dst) {
|
||||
assert(src->type == GGML_TYPE_F32);
|
||||
assert(params->ith == 0);
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(params->ith == 0);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
@@ -15149,13 +15268,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
||||
size = ggml_hash_size(size);
|
||||
struct ggml_hash_set result;
|
||||
result.size = size;
|
||||
result.keys = malloc(sizeof(struct ggml_tensor *) * size);
|
||||
result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
|
||||
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
|
||||
return result;
|
||||
}
|
||||
|
||||
static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
|
||||
free(hash_set.keys);
|
||||
GGML_FREE(hash_set.keys);
|
||||
}
|
||||
|
||||
struct hash_map {
|
||||
@@ -15164,17 +15283,17 @@ struct hash_map {
|
||||
};
|
||||
|
||||
static struct hash_map * ggml_new_hash_map(size_t size) {
|
||||
struct hash_map * result = malloc(sizeof(struct hash_map));
|
||||
struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
|
||||
result->set = ggml_hash_set_new(size);
|
||||
result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
|
||||
result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
|
||||
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
|
||||
return result;
|
||||
}
|
||||
|
||||
static void ggml_hash_map_free(struct hash_map * map) {
|
||||
ggml_hash_set_free(map->set);
|
||||
free(map->vals);
|
||||
free(map);
|
||||
GGML_FREE(map->vals);
|
||||
GGML_FREE(map);
|
||||
}
|
||||
|
||||
// gradient checkpointing
|
||||
@@ -16952,12 +17071,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
struct ggml_cplan cplan;
|
||||
memset(&cplan, 0, sizeof(struct ggml_cplan));
|
||||
|
||||
int max_tasks = 1;
|
||||
|
||||
// thread scheduling for the different operations + work buffer size estimation
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
||||
|
||||
max_tasks = MAX(max_tasks, n_tasks);
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
switch (node->op) {
|
||||
@@ -17124,7 +17247,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
work_size += CACHE_LINE_SIZE*(n_threads - 1);
|
||||
}
|
||||
|
||||
cplan.n_threads = n_threads;
|
||||
cplan.n_threads = MIN(max_tasks, n_threads);
|
||||
cplan.work_size = work_size;
|
||||
cplan.work_data = NULL;
|
||||
|
||||
@@ -19245,6 +19368,25 @@ struct gguf_context {
|
||||
void * data;
|
||||
};
|
||||
|
||||
static size_t gguf_type_size(enum gguf_type type) {
|
||||
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
|
||||
return GGUF_TYPE_SIZE[type];
|
||||
}
|
||||
|
||||
static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
|
||||
GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
|
||||
GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
|
||||
|
||||
for (uint32_t i = 0; i < info->n_dims; ++i) {
|
||||
GGML_ASSERT(info->ne[i] > 0);
|
||||
}
|
||||
|
||||
// prevent overflow for total number of elements
|
||||
GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
|
||||
GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
|
||||
GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
|
||||
}
|
||||
|
||||
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
||||
const size_t n = fread(dst, 1, size, file);
|
||||
*offset += n;
|
||||
@@ -19257,8 +19399,17 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
|
||||
bool ok = true;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
||||
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
|
||||
|
||||
// early exit if string length is invalid, prevents from integer overflow
|
||||
if (p->n == SIZE_MAX) {
|
||||
fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
|
||||
return false;
|
||||
}
|
||||
|
||||
p->data = GGML_CALLOC(p->n + 1, 1);
|
||||
|
||||
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||
|
||||
return ok;
|
||||
}
|
||||
@@ -19330,6 +19481,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// sanity-checks to prevent from integer/buffer overflows
|
||||
|
||||
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
|
||||
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
|
||||
ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||
fclose(file);
|
||||
@@ -19340,7 +19497,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
|
||||
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
@@ -19368,7 +19525,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
|
||||
switch (kv->value.arr.type) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
@@ -19383,21 +19540,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
case GGUF_TYPE_FLOAT64:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
||||
|
||||
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
|
||||
} break;
|
||||
case GGUF_TYPE_STRING:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
|
||||
|
||||
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
default: GGML_ASSERT(false && "invalid type");
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
@@ -19415,7 +19590,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
// read the tensor infos
|
||||
{
|
||||
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
|
||||
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
@@ -19426,12 +19601,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||
|
||||
ok = ok && (info->n_dims <= GGML_MAX_DIMS);
|
||||
|
||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
|
||||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||
|
||||
gguf_tensor_info_sanitize(info);
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
||||
fclose(file);
|
||||
@@ -19585,12 +19766,12 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
||||
if (kv->key.data) {
|
||||
free(kv->key.data);
|
||||
GGML_FREE(kv->key.data);
|
||||
}
|
||||
|
||||
if (kv->type == GGUF_TYPE_STRING) {
|
||||
if (kv->value.str.data) {
|
||||
free(kv->value.str.data);
|
||||
GGML_FREE(kv->value.str.data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19600,16 +19781,16 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
||||
if (str->data) {
|
||||
free(str->data);
|
||||
GGML_FREE(str->data);
|
||||
}
|
||||
}
|
||||
}
|
||||
free(kv->value.arr.data);
|
||||
GGML_FREE(kv->value.arr.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(ctx->kv);
|
||||
GGML_FREE(ctx->kv);
|
||||
}
|
||||
|
||||
if (ctx->infos) {
|
||||
@@ -19617,11 +19798,11 @@ void gguf_free(struct gguf_context * ctx) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
||||
if (info->name.data) {
|
||||
free(info->name.data);
|
||||
GGML_FREE(info->name.data);
|
||||
}
|
||||
}
|
||||
|
||||
free(ctx->infos);
|
||||
GGML_FREE(ctx->infos);
|
||||
}
|
||||
|
||||
GGML_ALIGNED_FREE(ctx);
|
||||
@@ -19922,8 +20103,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
|
||||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = type;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
|
||||
memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
|
||||
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
|
||||
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
|
||||
}
|
||||
|
||||
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
|
||||
@@ -19932,7 +20113,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
|
||||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
|
||||
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
|
||||
for (int i = 0; i < n; i++) {
|
||||
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
|
||||
str->n = strlen(data[i]);
|
||||
@@ -19959,19 +20140,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
|
||||
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
|
||||
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
|
||||
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
|
||||
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
||||
}
|
||||
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
||||
free((void *)data);
|
||||
GGML_FREE((void *)data);
|
||||
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
||||
GGML_ASSERT(false && "nested arrays not supported");
|
||||
} else {
|
||||
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20047,7 +20228,7 @@ struct gguf_buf {
|
||||
|
||||
static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf buf = {
|
||||
/*buf.data =*/ size == 0 ? NULL : malloc(size),
|
||||
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
|
||||
/*buf.size =*/ size,
|
||||
/*buf.offset =*/ 0,
|
||||
};
|
||||
@@ -20057,7 +20238,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
|
||||
static void gguf_buf_free(struct gguf_buf buf) {
|
||||
if (buf.data) {
|
||||
free(buf.data);
|
||||
GGML_FREE(buf.data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20138,7 +20319,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
||||
case GGUF_TYPE_FLOAT64:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
||||
} break;
|
||||
case GGUF_TYPE_STRING:
|
||||
{
|
||||
@@ -20147,10 +20328,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
default: GGML_ASSERT(false && "invalid type");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20382,6 +20563,14 @@ int ggml_cpu_has_vulkan(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_kompute(void) {
|
||||
#if defined(GGML_USE_KOMPUTE)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sycl(void) {
|
||||
#if defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
@@ -20391,7 +20580,8 @@ int ggml_cpu_has_sycl(void) {
|
||||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
||||
ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
|
||||
@@ -1495,7 +1495,8 @@ extern "C" {
|
||||
int p1,
|
||||
int d0,
|
||||
int d1,
|
||||
bool is_2D);
|
||||
bool is_2D,
|
||||
enum ggml_type dst_type);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
struct ggml_context * ctx,
|
||||
@@ -2266,6 +2267,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_cublas (void);
|
||||
GGML_API int ggml_cpu_has_clblast (void);
|
||||
GGML_API int ggml_cpu_has_vulkan (void);
|
||||
GGML_API int ggml_cpu_has_kompute (void);
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
|
||||
@@ -19,8 +19,8 @@ shader_int8_ext = """
|
||||
|
||||
# Type-specific defines
|
||||
shader_f16_defines = """
|
||||
#define QUANT_K 32
|
||||
#define QUANT_R 2
|
||||
#define QUANT_K 1
|
||||
#define QUANT_R 1
|
||||
|
||||
#define A_TYPE float16_t
|
||||
"""
|
||||
@@ -1689,7 +1689,8 @@ void main() {
|
||||
}
|
||||
|
||||
const float xi = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(0.5f*xi*(1.0f + tanh(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))));
|
||||
const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
|
||||
data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
@@ -72,6 +72,7 @@ class Keys:
|
||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
|
||||
HF_JSON = "tokenizer.huggingface.json"
|
||||
RWKV = "tokenizer.rwkv.world"
|
||||
CHAT_TEMPLATE = "tokenizer.chat_template"
|
||||
@@ -102,6 +103,7 @@ class MODEL_ARCH(IntEnum):
|
||||
PLAMO = auto()
|
||||
CODESHELL = auto()
|
||||
ORION = auto()
|
||||
INTERNLM2 = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -153,6 +155,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.PLAMO: "plamo",
|
||||
MODEL_ARCH.CODESHELL: "codeshell",
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -446,6 +449,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.INTERNLM2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@@ -411,6 +411,9 @@ class GGUFWriter:
|
||||
def add_add_eos_token(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
||||
|
||||
def add_add_space_prefix(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
|
||||
|
||||
def add_chat_template(self, value: str) -> None:
|
||||
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ class TensorNameMap:
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
"wte", # gpt2
|
||||
"transformer.embd.wte", # phi2
|
||||
"model.tok_embeddings", # internlm2
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
@@ -42,7 +43,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
|
||||
"output", # llama-pth bloom
|
||||
"output", # llama-pth bloom internlm2
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
),
|
||||
@@ -51,7 +52,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||
"model.norm", # llama-hf baichuan
|
||||
"model.norm", # llama-hf baichuan internlm2
|
||||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
@@ -84,6 +85,7 @@ class TensorNameMap:
|
||||
"h.{bid}.ln_1", # gpt2
|
||||
"transformer.h.{bid}.ln", # phi2
|
||||
"model.layers.layers.{bid}.norm", # plamo
|
||||
"model.layers.{bid}.attention_norm", # internlm2
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -111,6 +113,7 @@ class TensorNameMap:
|
||||
"encoder.layer.{bid}.attention.self.query", # bert
|
||||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
|
||||
"model.layers.{bid}.attention.wq" # internlm2
|
||||
),
|
||||
|
||||
# Attention key
|
||||
@@ -120,6 +123,7 @@ class TensorNameMap:
|
||||
"encoder.layer.{bid}.attention.self.key", # bert
|
||||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
|
||||
"model.layers.{bid}.attention.wk" # internlm2
|
||||
),
|
||||
|
||||
# Attention value
|
||||
@@ -129,6 +133,7 @@ class TensorNameMap:
|
||||
"encoder.layer.{bid}.attention.self.value", # bert
|
||||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
|
||||
"model.layers.{bid}.attention.wv" # internlm2
|
||||
),
|
||||
|
||||
# Attention output
|
||||
@@ -147,6 +152,7 @@ class TensorNameMap:
|
||||
"h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||
"model.layers.{bid}.attention.wo", # internlm2
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
@@ -169,6 +175,7 @@ class TensorNameMap:
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
"h.{bid}.ln_2", # gpt2
|
||||
"model.layers.{bid}.ffn_norm", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
@@ -194,6 +201,7 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@@ -212,6 +220,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
"transformer.h.{bid}.mlp.w2", # qwen
|
||||
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
@@ -236,6 +245,7 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w2", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
|
||||
@@ -204,6 +204,7 @@ enum llm_arch {
|
||||
LLM_ARCH_PLAMO,
|
||||
LLM_ARCH_CODESHELL,
|
||||
LLM_ARCH_ORION,
|
||||
LLM_ARCH_INTERNLM2,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -226,6 +227,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_PLAMO, "plamo" },
|
||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
@@ -278,6 +280,7 @@ enum llm_kv {
|
||||
LLM_KV_TOKENIZER_PAD_ID,
|
||||
LLM_KV_TOKENIZER_ADD_BOS,
|
||||
LLM_KV_TOKENIZER_ADD_EOS,
|
||||
LLM_KV_TOKENIZER_ADD_PREFIX,
|
||||
LLM_KV_TOKENIZER_HF_JSON,
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
};
|
||||
@@ -332,6 +335,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||
};
|
||||
@@ -669,7 +673,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
LLM_ARCH_INTERNLM2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -1377,6 +1397,7 @@ enum e_model {
|
||||
MODEL_13B,
|
||||
MODEL_14B,
|
||||
MODEL_15B,
|
||||
MODEL_20B,
|
||||
MODEL_30B,
|
||||
MODEL_34B,
|
||||
MODEL_40B,
|
||||
@@ -1618,6 +1639,8 @@ struct llama_vocab {
|
||||
id special_suffix_id = 32008;
|
||||
id special_eot_id = 32010;
|
||||
|
||||
bool add_space_prefix = true;
|
||||
|
||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
||||
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
||||
GGML_ASSERT(token_left.find('\n') == std::string::npos);
|
||||
@@ -2713,10 +2736,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XSS - 3.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
@@ -2731,6 +2754,7 @@ static const char * llama_model_type_name(e_model type) {
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_14B: return "14B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_20B: return "20B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_40B: return "40B";
|
||||
@@ -2743,6 +2767,14 @@ static const char * llama_model_type_name(e_model type) {
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
switch (type) {
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
||||
model.arch = ml.get_arch();
|
||||
@@ -3006,6 +3038,15 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 48: model.type = e_model::MODEL_20B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -3057,6 +3098,11 @@ static void llm_load_vocab(
|
||||
vocab.special_unk_id = 0;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
|
||||
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||
if (add_space_prefix_keyidx != -1) {
|
||||
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
} // The default value of add_space_prefix is true.
|
||||
} else if (tokenizer_name == "gpt2") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
||||
|
||||
@@ -3269,7 +3315,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
// hparams
|
||||
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
||||
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
||||
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
||||
@@ -4018,8 +4064,35 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
// layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -4666,126 +4739,6 @@ struct llm_build_context {
|
||||
ctx0 = nullptr;
|
||||
}
|
||||
}
|
||||
struct ggml_cgraph * build_orion() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
// if (model.layers[il].bq) {
|
||||
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
// cb(Qcur, "Qcur", il);
|
||||
// }
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
// if (model.layers[il].bk) {
|
||||
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
// cb(Kcur, "Kcur", il);
|
||||
// }
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
// if (model.layers[il].bv) {
|
||||
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
// cb(Vcur, "Vcur", il);
|
||||
// }
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
|
||||
|
||||
struct ggml_cgraph * build_llama() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
@@ -6589,6 +6542,245 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_orion() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
// if (model.layers[il].bq) {
|
||||
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
// cb(Qcur, "Qcur", il);
|
||||
// }
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
// if (model.layers[il].bk) {
|
||||
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
// cb(Kcur, "Kcur", il);
|
||||
// }
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
// if (model.layers[il].bv) {
|
||||
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
// cb(Vcur, "Vcur", il);
|
||||
// }
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_internlm2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
@@ -6747,6 +6939,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_orion();
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
result = llm.build_internlm2();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -6878,11 +7074,6 @@ static int llama_decode_internal(
|
||||
n_threads = std::min(4, n_threads);
|
||||
}
|
||||
|
||||
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
|
||||
if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
|
||||
n_threads = 1;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
||||
@@ -7694,7 +7885,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
//
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
if (&fragment == &fragment_buffer.front()) {
|
||||
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
||||
if (vocab.add_space_prefix) {
|
||||
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
@@ -10095,18 +10288,45 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t llama_max_devices(void) {
|
||||
return LLAMA_MAX_DEVICES;
|
||||
size_t llama_max_devices(void) {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
#elif defined(GGML_USE_CUBLAS)
|
||||
return GGML_CUDA_MAX_DEVICES;
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
return GGML_SYCL_MAX_DEVICES;
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool llama_mmap_supported(void) {
|
||||
bool llama_supports_mmap(void) {
|
||||
return llama_mmap::SUPPORTED;
|
||||
}
|
||||
|
||||
bool llama_mlock_supported(void) {
|
||||
bool llama_supports_mlock(void) {
|
||||
return llama_mlock::SUPPORTED;
|
||||
}
|
||||
|
||||
bool llama_supports_gpu_offload(void) {
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
// deprecated:
|
||||
bool llama_mmap_supported(void) {
|
||||
return llama_supports_mmap();
|
||||
}
|
||||
|
||||
bool llama_mlock_supported(void) {
|
||||
return llama_supports_mlock();
|
||||
}
|
||||
|
||||
void llama_backend_init(bool numa) {
|
||||
ggml_time_init();
|
||||
|
||||
@@ -10138,8 +10358,8 @@ int64_t llama_time_us(void) {
|
||||
}
|
||||
|
||||
struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
ggml_time_init();
|
||||
|
||||
llama_model * model = new llama_model;
|
||||
@@ -11157,22 +11377,24 @@ struct llama_batch llama_batch_get_one(
|
||||
};
|
||||
}
|
||||
|
||||
struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
|
||||
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
|
||||
llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
|
||||
|
||||
if (embd) {
|
||||
batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
|
||||
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
||||
} else {
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||
}
|
||||
|
||||
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
|
||||
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
|
||||
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
|
||||
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
|
||||
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
|
||||
for (int i = 0; i < n_tokens_alloc; ++i) {
|
||||
batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
|
||||
}
|
||||
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
|
||||
batch.seq_id[n_tokens_alloc] = nullptr;
|
||||
|
||||
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
|
||||
|
||||
return batch;
|
||||
}
|
||||
@@ -11183,7 +11405,7 @@ void llama_batch_free(struct llama_batch batch) {
|
||||
if (batch.pos) free(batch.pos);
|
||||
if (batch.n_seq_id) free(batch.n_seq_id);
|
||||
if (batch.seq_id) {
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
|
||||
free(batch.seq_id[i]);
|
||||
}
|
||||
free(batch.seq_id);
|
||||
|
||||
@@ -3,15 +3,7 @@
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#include "ggml-sycl.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
|
||||
#else
|
||||
#define LLAMA_MAX_DEVICES 1
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
@@ -49,12 +41,6 @@
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 4
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -201,7 +187,7 @@ extern "C" {
|
||||
// LLAMA_SPLIT_LAYER: ignored
|
||||
int32_t main_gpu;
|
||||
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
const float * tensor_split;
|
||||
|
||||
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
@@ -338,9 +324,14 @@ extern "C" {
|
||||
|
||||
LLAMA_API int64_t llama_time_us(void);
|
||||
|
||||
LLAMA_API int32_t llama_max_devices(void);
|
||||
LLAMA_API bool llama_mmap_supported (void);
|
||||
LLAMA_API bool llama_mlock_supported(void);
|
||||
LLAMA_API size_t llama_max_devices(void);
|
||||
|
||||
LLAMA_API bool llama_supports_mmap (void);
|
||||
LLAMA_API bool llama_supports_mlock (void);
|
||||
LLAMA_API bool llama_supports_gpu_offload(void);
|
||||
|
||||
LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
|
||||
LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
|
||||
|
||||
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
||||
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
:: MIT license
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
|
||||
set URL=%1
|
||||
set COMPONENTS=%2
|
||||
|
||||
curl.exe --output %TEMP%\webimage.exe --url %URL% --retry 5 --retry-delay 5
|
||||
start /b /wait %TEMP%\webimage.exe -s -x -f webimage_extracted --log extract.log
|
||||
del %TEMP%\webimage.exe
|
||||
if "%COMPONENTS%"=="" (
|
||||
webimage_extracted\bootstrapper.exe -s --action install --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
|
||||
) else (
|
||||
webimage_extracted\bootstrapper.exe -s --action install --components=%COMPONENTS% --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
|
||||
)
|
||||
set installer_exit_code=%ERRORLEVEL%
|
||||
rd /s/q "webimage_extracted"
|
||||
exit /b %installer_exit_code%
|
||||
@@ -1 +1 @@
|
||||
f2a9472b23cf27e672ed70a2a6eb078f7b060f18
|
||||
475cbad5c1c834e31e26a2283bc1413181644360
|
||||
|
||||
@@ -227,6 +227,14 @@ static std::string var_to_str(ggml_type type) {
|
||||
return ggml_type_name(type);
|
||||
}
|
||||
|
||||
static std::string var_to_str(ggml_op_pool pool) {
|
||||
switch (pool) {
|
||||
case GGML_OP_POOL_AVG: return "avg";
|
||||
case GGML_OP_POOL_MAX: return "max";
|
||||
default: return std::to_string(pool);
|
||||
}
|
||||
}
|
||||
|
||||
#define VARS_TO_STR1(a) VAR_TO_STR(a)
|
||||
#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
|
||||
#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
|
||||
@@ -238,6 +246,7 @@ static std::string var_to_str(ggml_type type) {
|
||||
#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
|
||||
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
|
||||
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
|
||||
#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l)
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
static bool inline _isinf(float f) {
|
||||
@@ -1162,10 +1171,45 @@ struct test_alibi : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_POOL2D
|
||||
struct test_pool2d : public test_case {
|
||||
enum ggml_op_pool pool_type;
|
||||
const ggml_type type_input;
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
// kernel size
|
||||
const int k0;
|
||||
const int k1;
|
||||
// stride
|
||||
const int s0;
|
||||
const int s1;
|
||||
// padding
|
||||
const int p0;
|
||||
const int p1;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
|
||||
}
|
||||
|
||||
test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
|
||||
ggml_type type_input = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
||||
int k0 = 3, int k1 = 3,
|
||||
int s0 = 1, int s1 = 1,
|
||||
int p0 = 1, int p1 = 1)
|
||||
: pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
||||
ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_IM2COL
|
||||
struct test_im2col : public test_case {
|
||||
const ggml_type type_input;
|
||||
const ggml_type type_kernel;
|
||||
const ggml_type dst_type;
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
const std::array<int64_t, 4> ne_kernel;
|
||||
// stride
|
||||
@@ -1181,22 +1225,22 @@ struct test_im2col : public test_case {
|
||||
const bool is_2D;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR11(type_input, type_kernel, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
|
||||
return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
|
||||
}
|
||||
|
||||
test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16,
|
||||
test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
||||
std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
||||
int s0 = 1, int s1 = 1,
|
||||
int p0 = 1, int p1 = 1,
|
||||
int d0 = 1, int d1 = 1,
|
||||
bool is_2D = true)
|
||||
: type_input(type_input), type_kernel(type_kernel), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
|
||||
: type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
|
||||
ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D);
|
||||
ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
@@ -1912,6 +1956,27 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
}
|
||||
|
||||
for (ggml_type type_input : {GGML_TYPE_F32}) {
|
||||
for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
|
||||
for (int k0 : {1, 3}) {
|
||||
for (int k1 : {1, 3}) {
|
||||
for (int s0 : {1, 2}) {
|
||||
for (int s1 : {1, 2}) {
|
||||
for (int p0 : {0, 1}) {
|
||||
for (int p1 : {0, 1}) {
|
||||
test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
|
||||
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
|
||||
@@ -1927,8 +1992,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
|
||||
|
||||
for (ggml_type type : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
|
||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
for (ggml_type type_dst : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_cont());
|
||||
@@ -2047,7 +2114,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_alibi());
|
||||
test_cases.emplace_back(new test_im2col());
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user