mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 16:17:40 +02:00
Compare commits
81 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5e6dad9322 | |||
| 33bdee667e | |||
| af148c9386 | |||
| 66798e42fb | |||
| fb4a0ec083 | |||
| 5ea926dad7 | |||
| 1ee9eea094 | |||
| ff7fb670d0 | |||
| 0e712a5acb | |||
| a0ec17b32e | |||
| 2e82ffa4af | |||
| 80dd7ff22f | |||
| 54ef9cfc72 | |||
| b0cefea58a | |||
| b141e5f6ef | |||
| 4b3a9212b6 | |||
| 505f33274d | |||
| 160687b3ed | |||
| 6423c65aa8 | |||
| 39a334a9aa | |||
| bb38cdd8ba | |||
| f018acba22 | |||
| 46323fa9ef | |||
| 5b359bb1e3 | |||
| e89213492d | |||
| 8fc393f246 | |||
| ec450d3bbf | |||
| 695ad752b2 | |||
| 841f27abdb | |||
| d05b3127bd | |||
| 76c6e7f105 | |||
| a71d81cf8c | |||
| eec4d71737 | |||
| 3b08828674 | |||
| a2c6fd747c | |||
| 97404c4a03 | |||
| 60e17ce23c | |||
| 5107e8cea3 | |||
| 2319126a70 | |||
| 3bcd40b3c5 | |||
| 5c333e0140 | |||
| b11f9ba9b8 | |||
| 94d8cb8be1 | |||
| 1dc04b2dee | |||
| a1eaf6a960 | |||
| b8deef0ec0 | |||
| a9e8a9a030 | |||
| 3407364776 | |||
| d5a409e57f | |||
| 401558b7ba | |||
| 9e0ecfb697 | |||
| 6a066b9978 | |||
| ea02c753eb | |||
| 05697f670b | |||
| f8e58135cf | |||
| 329ed914c9 | |||
| ce027adfb3 | |||
| 284e5b0275 | |||
| e2292aaa17 | |||
| 9f40989351 | |||
| 08828a6d7d | |||
| 1839f69130 | |||
| 9830b6923b | |||
| 42cadc74bd | |||
| 45950415ed | |||
| 1926d6e39d | |||
| b634f8a26f | |||
| 7554aa4655 | |||
| a6744e43e8 | |||
| e991e3127f | |||
| 418f5eef26 | |||
| ba6f62eb79 | |||
| d865d1478c | |||
| 1804adb0cf | |||
| 815fe72adc | |||
| f221d56220 | |||
| e597e50794 | |||
| 85679d37f3 | |||
| 1e9f94994e | |||
| c02e5ab2a6 | |||
| ab3d71f97f |
@@ -24,6 +24,16 @@ insert_final_newline = unset
|
||||
[examples/server/public/*]
|
||||
indent_size = 2
|
||||
|
||||
[examples/server/public/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
|
||||
[examples/server/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
|
||||
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
|
||||
indent_style = tab
|
||||
|
||||
|
||||
@@ -55,7 +55,13 @@ jobs:
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
|
||||
cmake .. \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_CURL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DGGML_RPC=ON \
|
||||
-DBUILD_SHARED_LIBS=OFF
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
@@ -92,7 +98,7 @@ jobs:
|
||||
name: llama-bin-macos-arm64.zip
|
||||
|
||||
macOS-latest-cmake-x64:
|
||||
runs-on: macos-12
|
||||
runs-on: macos-13
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -113,7 +119,12 @@ jobs:
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_CURL=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
-DBUILD_SHARED_LIBS=OFF
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
@@ -569,6 +580,7 @@ jobs:
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -599,6 +611,7 @@ jobs:
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
||||
@@ -48,10 +48,23 @@
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-apple-clang", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] },
|
||||
{ "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
||||
{ "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
libllava.a \
|
||||
llama-baby-llama \
|
||||
llama-batched \
|
||||
llama-batched-bench \
|
||||
llama-bench \
|
||||
@@ -34,6 +33,7 @@ BUILD_TARGETS = \
|
||||
llama-save-load-state \
|
||||
llama-server \
|
||||
llama-simple \
|
||||
llama-simple-chat \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-vdot \
|
||||
@@ -55,7 +55,6 @@ TEST_TARGETS = \
|
||||
tests/test-llama-grammar \
|
||||
tests/test-log \
|
||||
tests/test-model-load-cancel \
|
||||
tests/test-opt \
|
||||
tests/test-quantize-fns \
|
||||
tests/test-quantize-perf \
|
||||
tests/test-rope \
|
||||
@@ -63,6 +62,7 @@ TEST_TARGETS = \
|
||||
tests/test-tokenizer-0 \
|
||||
tests/test-tokenizer-1-bpe \
|
||||
tests/test-tokenizer-1-spm
|
||||
# tests/test-opt \
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
|
||||
@@ -878,6 +878,10 @@ ifdef GGML_METAL
|
||||
MK_CPPFLAGS += -DGGML_USE_METAL
|
||||
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
|
||||
OBJ_GGML += ggml/src/ggml-metal.o
|
||||
|
||||
ifdef GGML_METAL_USE_BF16
|
||||
MK_CPPFLAGS += -DGGML_METAL_USE_BF16
|
||||
endif # GGML_METAL_USE_BF16
|
||||
ifdef GGML_METAL_NDEBUG
|
||||
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
||||
endif
|
||||
@@ -915,6 +919,7 @@ endif # GGML_METAL
|
||||
|
||||
OBJ_GGML += \
|
||||
ggml/src/ggml.o \
|
||||
ggml/src/ggml-cpu.o \
|
||||
ggml/src/ggml-alloc.o \
|
||||
ggml/src/ggml-backend.o \
|
||||
ggml/src/ggml-quants.o \
|
||||
@@ -935,7 +940,6 @@ OBJ_COMMON = \
|
||||
common/console.o \
|
||||
common/ngram-cache.o \
|
||||
common/sampling.o \
|
||||
common/train.o \
|
||||
common/build-info.o \
|
||||
common/json-schema-to-grammar.o
|
||||
|
||||
@@ -1047,6 +1051,12 @@ ggml/src/ggml.o: \
|
||||
ggml/include/ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-cpu.o: \
|
||||
ggml/src/ggml-cpu.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-alloc.o: \
|
||||
ggml/src/ggml-alloc.c \
|
||||
ggml/include/ggml.h \
|
||||
@@ -1212,11 +1222,6 @@ common/json-schema-to-grammar.o: \
|
||||
common/json-schema-to-grammar.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/train.o: \
|
||||
common/train.cpp \
|
||||
common/train.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/ngram-cache.o: \
|
||||
common/ngram-cache.cpp \
|
||||
common/ngram-cache.h
|
||||
@@ -1287,6 +1292,11 @@ llama-simple: examples/simple/simple.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-simple-chat: examples/simple-chat/simple-chat.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-tokenize: examples/tokenize/tokenize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
@@ -1384,11 +1394,6 @@ llama-bench: examples/llama-bench/llama-bench.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-baby-llama: examples/baby-llama/baby-llama.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-export-lora: examples/export-lora/export-lora.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
@@ -1454,22 +1459,13 @@ llama-server: \
|
||||
examples/server/server.cpp \
|
||||
examples/server/utils.hpp \
|
||||
examples/server/httplib.h \
|
||||
examples/server/colorthemes.css.hpp \
|
||||
examples/server/style.css.hpp \
|
||||
examples/server/theme-beeninorder.css.hpp \
|
||||
examples/server/theme-ketivah.css.hpp \
|
||||
examples/server/theme-mangotango.css.hpp \
|
||||
examples/server/theme-playground.css.hpp \
|
||||
examples/server/theme-polarnight.css.hpp \
|
||||
examples/server/theme-snowstorm.css.hpp \
|
||||
examples/server/index.html.hpp \
|
||||
examples/server/index-new.html.hpp \
|
||||
examples/server/index.js.hpp \
|
||||
examples/server/completion.js.hpp \
|
||||
examples/server/system-prompts.js.hpp \
|
||||
examples/server/prompt-formats.js.hpp \
|
||||
examples/server/json-schema-to-grammar.mjs.hpp \
|
||||
examples/server/loading.html.hpp \
|
||||
examples/server/deps_daisyui.min.css.hpp \
|
||||
examples/server/deps_markdown-it.js.hpp \
|
||||
examples/server/deps_tailwindcss.js.hpp \
|
||||
examples/server/deps_vue.esm-browser.js.hpp \
|
||||
common/json.hpp \
|
||||
common/stb_image.h \
|
||||
$(OBJ_ALL)
|
||||
|
||||
+4
-1
@@ -10,6 +10,7 @@ var sources = [
|
||||
"src/unicode.cpp",
|
||||
"src/unicode-data.cpp",
|
||||
"ggml/src/ggml.c",
|
||||
"ggml/src/ggml-cpu.c",
|
||||
"ggml/src/ggml-alloc.c",
|
||||
"ggml/src/ggml-backend.cpp",
|
||||
"ggml/src/ggml-quants.c",
|
||||
@@ -60,13 +61,15 @@ let package = Package(
|
||||
name: "llama",
|
||||
path: ".",
|
||||
exclude: [
|
||||
"build",
|
||||
"cmake",
|
||||
"examples",
|
||||
"scripts",
|
||||
"models",
|
||||
"tests",
|
||||
"CMakeLists.txt",
|
||||
"Makefile"
|
||||
"Makefile",
|
||||
"ggml/src/ggml-metal-embed.metal"
|
||||
],
|
||||
sources: sources,
|
||||
resources: resources,
|
||||
|
||||
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
|
||||
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
|
||||
----
|
||||
@@ -130,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
|
||||
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
|
||||
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
|
||||
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
|
||||
|
||||
@@ -39,7 +39,7 @@ SRC=`pwd`
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
@@ -326,36 +326,36 @@ function gg_run_open_llama_7b_v2 {
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -460,34 +460,34 @@ function gg_run_pythia_1_4b {
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -591,36 +591,36 @@ function gg_run_pythia_2_8b {
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -706,8 +706,8 @@ function gg_run_embd_bge_small {
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -752,7 +752,7 @@ function gg_run_rerank_tiny {
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
|
||||
# for this model, the SEP token is "</s>"
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
|
||||
# sample output
|
||||
# rerank score 0: 0.029
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
set( CMAKE_SYSTEM_NAME Darwin )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-apple-darwin-macho )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
|
||||
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
@@ -66,8 +66,6 @@ add_library(${TARGET} STATIC
|
||||
ngram-cache.h
|
||||
sampling.cpp
|
||||
sampling.h
|
||||
train.cpp
|
||||
train.h
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
||||
@@ -1003,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
|
||||
if (s == "f16") {
|
||||
return GGML_TYPE_F16;
|
||||
}
|
||||
if (s == "bf16") {
|
||||
return GGML_TYPE_BF16;
|
||||
}
|
||||
if (s == "q8_0") {
|
||||
return GGML_TYPE_Q8_0;
|
||||
}
|
||||
@@ -1951,6 +1954,8 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
||||
|
||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
ggml_cpu_init(); // some ARM features are detected at runtime
|
||||
|
||||
const auto & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||
|
||||
+2
-2
@@ -155,7 +155,7 @@ struct common_sampler_params {
|
||||
|
||||
struct common_params {
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 0; // context size
|
||||
int32_t n_ctx = 4096; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
@@ -178,7 +178,7 @@ struct common_params {
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
float defrag_thold = 0.1f; // KV cache defragmentation threshold
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
||||
-1515
File diff suppressed because it is too large
Load Diff
-233
@@ -1,233 +0,0 @@
|
||||
// Various helper functions and utilities for training
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#define LLAMA_TRAIN_MAX_NODES 16384
|
||||
|
||||
typedef std::string mt19937_state;
|
||||
|
||||
struct train_state {
|
||||
struct ggml_opt_context * opt;
|
||||
|
||||
uint64_t train_its;
|
||||
uint64_t train_samples;
|
||||
uint64_t train_tokens;
|
||||
uint64_t train_epochs;
|
||||
|
||||
size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
|
||||
mt19937_state shuffle_rng_state_current;
|
||||
mt19937_state shuffle_rng_state_next;
|
||||
size_t shuffle_sample_count;
|
||||
size_t shuffle_next_sample;
|
||||
};
|
||||
|
||||
struct train_params_common {
|
||||
const char * fn_train_data;
|
||||
const char * fn_checkpoint_in;
|
||||
const char * fn_checkpoint_out;
|
||||
const char * pattern_fn_it;
|
||||
const char * fn_latest;
|
||||
|
||||
bool print_usage;
|
||||
|
||||
int save_every;
|
||||
|
||||
uint32_t seed;
|
||||
|
||||
int n_ctx;
|
||||
int n_threads;
|
||||
int n_batch;
|
||||
int n_gradient_accumulation;
|
||||
int n_epochs;
|
||||
int n_gpu_layers;
|
||||
|
||||
bool custom_n_ctx;
|
||||
|
||||
bool use_flash;
|
||||
bool use_checkpointing;
|
||||
|
||||
std::string sample_start;
|
||||
bool include_sample_start;
|
||||
bool escape;
|
||||
bool overlapping_samples;
|
||||
bool fill_with_next_samples;
|
||||
bool separate_with_eos;
|
||||
bool separate_with_bos;
|
||||
bool sample_random_offsets;
|
||||
|
||||
bool force_reshuffle;
|
||||
|
||||
int warmup;
|
||||
int cos_decay_steps;
|
||||
float cos_decay_restart;
|
||||
float cos_decay_min;
|
||||
bool enable_restart;
|
||||
|
||||
int opt_past;
|
||||
float opt_delta;
|
||||
int opt_max_no_improvement;
|
||||
|
||||
int adam_n_iter;
|
||||
float adam_alpha;
|
||||
float adam_min_alpha;
|
||||
float adam_decay;
|
||||
int adam_decay_min_ndim;
|
||||
float adam_beta1;
|
||||
float adam_beta2;
|
||||
float adam_gclip;
|
||||
float adam_eps_f;
|
||||
};
|
||||
|
||||
typedef void (*save_train_files_callback)(void * data, struct train_state * train);
|
||||
|
||||
struct train_opt_callback_data {
|
||||
struct train_params_common * params;
|
||||
struct train_state * train;
|
||||
save_train_files_callback save_cb;
|
||||
void * save_data;
|
||||
struct llama_context * lctx;
|
||||
int last_save_iter;
|
||||
llama_token * tokens_data;
|
||||
size_t tokens_size;
|
||||
size_t * samples_begin;
|
||||
size_t * samples_size;
|
||||
size_t * shuffled_samples_offs;
|
||||
size_t * shuffled_samples_begin;
|
||||
size_t * shuffled_samples_size;
|
||||
size_t samples_count;
|
||||
struct ggml_tensor * tokens_input;
|
||||
struct ggml_tensor * target_probs;
|
||||
int first_iter;
|
||||
int first_epoch;
|
||||
int iter_at_last_epoch;
|
||||
int64_t last_time;
|
||||
double millis_per_iter;
|
||||
};
|
||||
|
||||
struct train_state * init_train_state();
|
||||
void free_train_state(struct train_state * state);
|
||||
|
||||
struct train_params_common get_default_train_params_common();
|
||||
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
|
||||
|
||||
bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
|
||||
void finish_processing_train_args(struct train_params_common * params);
|
||||
|
||||
struct random_normal_distribution;
|
||||
struct random_uniform_distribution;
|
||||
|
||||
struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
|
||||
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
|
||||
|
||||
void free_random_normal_distribution (struct random_normal_distribution * rnd);
|
||||
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
|
||||
|
||||
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
|
||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
|
||||
|
||||
// generate random float in interval [0,1)
|
||||
float frand();
|
||||
float frand_normal (struct random_normal_distribution * rnd);
|
||||
float frand_uniform(struct random_uniform_distribution * rnd);
|
||||
|
||||
int clamp (const int v, const int min, const int max);
|
||||
float fclamp(const float v, const float min, const float max);
|
||||
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
|
||||
|
||||
size_t tokenize_file(
|
||||
struct llama_context * lctx,
|
||||
const char * filename,
|
||||
const std::string & sample_start,
|
||||
bool include_sample_start,
|
||||
bool overlapping_samples,
|
||||
unsigned context_length,
|
||||
std::vector<llama_token> & out_tokens,
|
||||
std::vector<size_t> & out_samples_begin,
|
||||
std::vector<size_t> & out_samples_size);
|
||||
|
||||
int64_t get_example_targets_batch(
|
||||
struct llama_context * lctx,
|
||||
struct ggml_tensor * tokens_input,
|
||||
struct ggml_tensor * target_probs,
|
||||
int64_t example_id,
|
||||
const size_t * samples_offs,
|
||||
const size_t * samples_begin,
|
||||
const size_t * samples_size,
|
||||
size_t samples_count,
|
||||
const llama_token * train_data,
|
||||
size_t n_train_data,
|
||||
bool separate_with_eos,
|
||||
bool separate_with_bos,
|
||||
bool fill_with_next_samples,
|
||||
bool sample_random_offsets);
|
||||
|
||||
|
||||
void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
|
||||
mt19937_state mt19937_get_state(const std::mt19937& rng);
|
||||
mt19937_state mt19937_seed_to_state(unsigned seed);
|
||||
|
||||
mt19937_state shuffle_samples(
|
||||
const mt19937_state & rng_state,
|
||||
size_t * shuffled_offs,
|
||||
size_t * shuffled_begins,
|
||||
size_t * shuffled_sizes,
|
||||
const size_t * begins,
|
||||
const size_t * sizes,
|
||||
size_t count);
|
||||
|
||||
size_t hash_combine(size_t h1, size_t h2);
|
||||
|
||||
size_t compute_samples_hash(
|
||||
const char* fn,
|
||||
const size_t* samples_begin,
|
||||
const size_t* samples_size,
|
||||
size_t sample_count);
|
||||
|
||||
|
||||
std::string replace_str(const char * s, const char * needle, const char * replacement);
|
||||
|
||||
void print_duration(double milliseconds);
|
||||
|
||||
float cosine_decay(
|
||||
int64_t step,
|
||||
int64_t decay_steps,
|
||||
float minimum);
|
||||
|
||||
float cosine_decay_restart(
|
||||
int64_t step,
|
||||
int64_t decay_steps,
|
||||
float minimum,
|
||||
float restart_step_mult);
|
||||
|
||||
float learning_schedule(
|
||||
int64_t step,
|
||||
int64_t warmup_steps,
|
||||
int64_t decay_steps,
|
||||
float learning_rate,
|
||||
float overall_minimum,
|
||||
float cos_decay_minimum,
|
||||
float cos_decay_restart_step_mult,
|
||||
bool enable_restart);
|
||||
|
||||
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
|
||||
|
||||
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
|
||||
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
|
||||
|
||||
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
|
||||
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
|
||||
|
||||
std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
|
||||
|
||||
void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
|
||||
+14
-19
@@ -72,7 +72,8 @@ class Model:
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
|
||||
if type(self) is Model:
|
||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||
|
||||
@@ -87,7 +88,7 @@ class Model:
|
||||
self.is_safetensors = len(self.part_names) > 0
|
||||
if not self.is_safetensors:
|
||||
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
||||
self.hparams = Model.load_hparams(self.dir_model)
|
||||
self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
self.tensor_names = None
|
||||
@@ -1541,6 +1542,17 @@ class LlamaModel(Model):
|
||||
special_vocab._set_special_token("eot", 32010)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
# Apply to granite small models only
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
@@ -1557,17 +1569,6 @@ class LlamaModel(Model):
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
# Apply to granite small models only
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
@@ -3747,10 +3748,7 @@ class JaisModel(Model):
|
||||
|
||||
# Embeddings scale
|
||||
self.embeddings_scale = 1.0
|
||||
# note: For some JAIS flavors, output is tied to (same as) wte in original model
|
||||
self.output_is_wte = False
|
||||
if 'mup_embeddings_scale' in self.hparams:
|
||||
self.output_is_wte = True # Hack (?)
|
||||
self.embeddings_scale = self.hparams['mup_embeddings_scale']
|
||||
elif 'embeddings_scale' in self.hparams:
|
||||
self.embeddings_scale = self.hparams['embeddings_scale']
|
||||
@@ -3807,10 +3805,7 @@ class JaisModel(Model):
|
||||
|
||||
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
||||
tensors.append((new_name, data_torch * self.embeddings_scale))
|
||||
if self.output_is_wte:
|
||||
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
|
||||
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
|
||||
assert not self.output_is_wte
|
||||
tensors.append((new_name, data_torch * self.width_scale))
|
||||
else:
|
||||
tensors.append((new_name, data_torch))
|
||||
|
||||
+37
-10
@@ -12,6 +12,7 @@ import json
|
||||
from math import prod
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
||||
from transformers import AutoConfig
|
||||
|
||||
import torch
|
||||
|
||||
@@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace:
|
||||
help="only print out what will be done, without writing any new files",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base", type=Path, required=True,
|
||||
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
|
||||
"--base", type=Path,
|
||||
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"lora_path", type=Path,
|
||||
@@ -267,6 +268,12 @@ def parse_args() -> argparse.Namespace:
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
|
||||
# normally, adapter does not come with base model config, we need to load it from AutoConfig
|
||||
config = AutoConfig.from_pretrained(hf_model_id)
|
||||
return config.to_dict()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
@@ -281,7 +288,7 @@ if __name__ == '__main__':
|
||||
|
||||
ftype = ftype_map[args.outtype]
|
||||
|
||||
dir_base_model: Path = args.base
|
||||
dir_base_model: Path | None = args.base
|
||||
dir_lora: Path = args.lora_path
|
||||
lora_config = dir_lora / "adapter_config.json"
|
||||
input_model = dir_lora / "adapter_model.safetensors"
|
||||
@@ -301,9 +308,29 @@ if __name__ == '__main__':
|
||||
input_model = os.path.join(dir_lora, "adapter_model.bin")
|
||||
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
|
||||
|
||||
# load LoRA config
|
||||
with open(lora_config, "r") as f:
|
||||
lparams: dict[str, Any] = json.load(f)
|
||||
|
||||
# load base model
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
if dir_base_model is None:
|
||||
if "base_model_name_or_path" in lparams:
|
||||
model_id = lparams["base_model_name_or_path"]
|
||||
logger.info(f"Loading base model from Hugging Face: {model_id}")
|
||||
try:
|
||||
hparams = load_hparams_from_hf(model_id)
|
||||
except OSError as e:
|
||||
logger.error(f"Failed to load base model config: {e}")
|
||||
logger.error("Please try downloading the base model and add its path to --base")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.error("'base_model_name_or_path' is not found in adapter_config.json")
|
||||
logger.error("Base model config is required. Please download the base model and add its path to --base")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
@@ -323,13 +350,15 @@ if __name__ == '__main__':
|
||||
self.dir_model_card = dir_lora_model
|
||||
self.lora_alpha = float(lora_alpha)
|
||||
|
||||
def set_vocab(self):
|
||||
pass
|
||||
|
||||
def set_type(self):
|
||||
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
|
||||
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
|
||||
super().set_gguf_parameters()
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
|
||||
@@ -350,7 +379,7 @@ if __name__ == '__main__':
|
||||
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
||||
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
|
||||
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
|
||||
logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
|
||||
logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
|
||||
sys.exit(1)
|
||||
|
||||
if base_name in tensor_map:
|
||||
@@ -384,9 +413,6 @@ if __name__ == '__main__':
|
||||
yield (dest_name + ".lora_a", lora_a)
|
||||
yield (dest_name + ".lora_b", lora_b)
|
||||
|
||||
with open(lora_config, "r") as f:
|
||||
lparams: dict[str, Any] = json.load(f)
|
||||
|
||||
alpha: float = lparams["lora_alpha"]
|
||||
|
||||
model_instance = LoraModel(
|
||||
@@ -399,6 +425,7 @@ if __name__ == '__main__':
|
||||
dry_run=args.dry_run,
|
||||
dir_lora_model=dir_lora,
|
||||
lora_alpha=alpha,
|
||||
hparams=hparams,
|
||||
)
|
||||
|
||||
logger.info("Exporting model...")
|
||||
|
||||
@@ -377,7 +377,7 @@ found 2 SYCL devices:
|
||||
|
||||
|Chosen Device ID|Setting|
|
||||
|-|-|
|
||||
|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
|
||||
|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
|
||||
|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|
||||
|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
@@ -49,6 +48,7 @@ else()
|
||||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(tokenize)
|
||||
endif()
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
|
||||
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
|
||||
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
|
||||
|
||||
SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
|
||||
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
|
||||
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
|
||||
'|'\
|
||||
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
|
||||
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
|
||||
|
||||
CTX_SIZE=2048
|
||||
@@ -129,15 +130,12 @@ while read -e line; do
|
||||
|
||||
printf ' '
|
||||
|
||||
# HACK get num tokens from debug message
|
||||
# TODO get both messages in one go
|
||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
|
||||
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
|
||||
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
|
||||
|
||||
if ((n_tokens > CTX_ROTATE_POINT)); then
|
||||
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
|
||||
|
||||
@@ -840,6 +840,8 @@ class OutputFile:
|
||||
self.gguf.add_base_model_version(key, base_model_entry["version"])
|
||||
if "organization" in base_model_entry:
|
||||
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
|
||||
if "description" in base_model_entry:
|
||||
self.gguf.add_base_model_description(key, base_model_entry["description"])
|
||||
if "url" in base_model_entry:
|
||||
self.gguf.add_base_model_url(key, base_model_entry["url"])
|
||||
if "doi" in base_model_entry:
|
||||
@@ -849,12 +851,32 @@ class OutputFile:
|
||||
if "repo_url" in base_model_entry:
|
||||
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
|
||||
|
||||
if metadata.datasets is not None:
|
||||
self.gguf.add_dataset_count(len(metadata.datasets))
|
||||
for key, dataset_entry in enumerate(metadata.datasets):
|
||||
if "name" in dataset_entry:
|
||||
self.gguf.add_dataset_name(key, dataset_entry["name"])
|
||||
if "author" in dataset_entry:
|
||||
self.gguf.add_dataset_author(key, dataset_entry["author"])
|
||||
if "version" in dataset_entry:
|
||||
self.gguf.add_dataset_version(key, dataset_entry["version"])
|
||||
if "organization" in dataset_entry:
|
||||
self.gguf.add_dataset_organization(key, dataset_entry["organization"])
|
||||
if "description" in dataset_entry:
|
||||
self.gguf.add_dataset_description(key, dataset_entry["description"])
|
||||
if "url" in dataset_entry:
|
||||
self.gguf.add_dataset_url(key, dataset_entry["url"])
|
||||
if "doi" in dataset_entry:
|
||||
self.gguf.add_dataset_doi(key, dataset_entry["doi"])
|
||||
if "uuid" in dataset_entry:
|
||||
self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
|
||||
if "repo_url" in dataset_entry:
|
||||
self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
|
||||
|
||||
if metadata.tags is not None:
|
||||
self.gguf.add_tags(metadata.tags)
|
||||
if metadata.languages is not None:
|
||||
self.gguf.add_languages(metadata.languages)
|
||||
if metadata.datasets is not None:
|
||||
self.gguf.add_datasets(metadata.datasets)
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
# Metadata About The Neural Architecture Itself
|
||||
|
||||
@@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
|
||||
if (s == "f16") {
|
||||
return GGML_TYPE_F16;
|
||||
}
|
||||
if (s == "bf16") {
|
||||
return GGML_TYPE_BF16;
|
||||
}
|
||||
if (s == "q8_0") {
|
||||
return GGML_TYPE_Q8_0;
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
@@ -15,22 +15,13 @@ set(TARGET_SRCS
|
||||
httplib.h
|
||||
)
|
||||
set(PUBLIC_ASSETS
|
||||
colorthemes.css
|
||||
style.css
|
||||
theme-beeninorder.css
|
||||
theme-ketivah.css
|
||||
theme-mangotango.css
|
||||
theme-playground.css
|
||||
theme-polarnight.css
|
||||
theme-snowstorm.css
|
||||
index.html
|
||||
index-new.html
|
||||
index.js
|
||||
completion.js
|
||||
system-prompts.js
|
||||
prompt-formats.js
|
||||
json-schema-to-grammar.mjs
|
||||
loading.html
|
||||
deps_daisyui.min.css
|
||||
deps_markdown-it.js
|
||||
deps_tailwindcss.js
|
||||
deps_vue.esm-browser.js
|
||||
)
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
|
||||
+31
-16
@@ -39,7 +39,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
@@ -64,7 +64,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
|
||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
|
||||
@@ -99,25 +99,27 @@ The project is under active development, and we are [looking for feedback and co
|
||||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;typ_p;top_p;min_p;temperature) |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) |
|
||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||
| `--temp N` | temperature (default: 0.8) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
||||
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
|
||||
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
|
||||
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
||||
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
|
||||
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
|
||||
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
|
||||
| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
|
||||
| `--dry-base N` | DRY sampling base value (default: 1.75) |
|
||||
| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) |
|
||||
| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
|
||||
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
|
||||
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
|
||||
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
|
||||
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
|
||||
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
|
||||
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> |
|
||||
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
|
||||
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
|
||||
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
|
||||
@@ -381,6 +383,10 @@ node index.js
|
||||
|
||||
`dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
|
||||
|
||||
`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
|
||||
|
||||
`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
|
||||
|
||||
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
|
||||
|
||||
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
|
||||
@@ -409,7 +415,7 @@ node index.js
|
||||
|
||||
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
|
||||
|
||||
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
|
||||
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
|
||||
|
||||
**Response format**
|
||||
|
||||
@@ -692,7 +698,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
|
||||
|
||||
### GET `/slots`: Returns the current slots processing state
|
||||
|
||||
This endpoint can be disabled with `--no-slots`
|
||||
> [!WARNING]
|
||||
> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
|
||||
|
||||
This endpoint is disabled by default and can be enabled with `--slots`
|
||||
|
||||
If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots.
|
||||
|
||||
@@ -709,6 +718,7 @@ Example:
|
||||
"grammar": "",
|
||||
"id": 0,
|
||||
"ignore_eos": false,
|
||||
"is_processing": false,
|
||||
"logit_bias": [],
|
||||
"min_p": 0.05000000074505806,
|
||||
"mirostat": 0,
|
||||
@@ -741,7 +751,6 @@ Example:
|
||||
"temperature"
|
||||
],
|
||||
"seed": 42,
|
||||
"state": 1,
|
||||
"stop": [
|
||||
"\n"
|
||||
],
|
||||
@@ -755,10 +764,6 @@ Example:
|
||||
]
|
||||
```
|
||||
|
||||
Possible values for `slot[i].state` are:
|
||||
- `0`: SLOT_STATE_IDLE
|
||||
- `1`: SLOT_STATE_PROCESSING
|
||||
|
||||
### GET `/metrics`: Prometheus compatible metrics exporter
|
||||
|
||||
This endpoint is only accessible if `--metrics` is set.
|
||||
@@ -929,6 +934,16 @@ Apart from error types supported by OAI, we also have custom types that are spec
|
||||
}
|
||||
```
|
||||
|
||||
### Legacy completion web UI
|
||||
|
||||
A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
|
||||
|
||||
For example:
|
||||
|
||||
```sh
|
||||
./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy
|
||||
```
|
||||
|
||||
### Extending or building alternative Web Front End
|
||||
|
||||
You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import * as readline from 'node:readline'
|
||||
import { stdin, stdout } from 'node:process'
|
||||
import { readFileSync } from 'node:fs'
|
||||
import { SchemaConverter } from './public/json-schema-to-grammar.mjs'
|
||||
import { SchemaConverter } from './public_legacy/json-schema-to-grammar.mjs'
|
||||
|
||||
const args = process.argv.slice(2);
|
||||
const grammarJsonSchemaFile = args.find(
|
||||
|
||||
+17
-2
@@ -6,5 +6,20 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
PUBLIC=$DIR/public
|
||||
|
||||
echo "download js bundle files"
|
||||
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
|
||||
echo >> $PUBLIC/index.js # add newline
|
||||
|
||||
# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
|
||||
|
||||
curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
|
||||
echo >> $PUBLIC/deps_tailwindcss.js # add newline
|
||||
|
||||
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
|
||||
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
|
||||
echo >> $PUBLIC/deps_daisyui.min.css # add newline
|
||||
|
||||
curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
|
||||
echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
|
||||
|
||||
curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
|
||||
echo >> $PUBLIC/deps_markdown-it.js # add newline
|
||||
|
||||
ls -lah $PUBLIC
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
const paramDefaults = {
|
||||
stream: true,
|
||||
n_predict: 500,
|
||||
temperature: 0.2,
|
||||
stop: ["</s>"]
|
||||
};
|
||||
|
||||
let generation_settings = null;
|
||||
|
||||
export class CompletionError extends Error {
|
||||
constructor(message, name, data) {
|
||||
super(message);
|
||||
this.name = name;
|
||||
}
|
||||
};
|
||||
|
||||
// Completes the prompt as a generator. Recommended for most use cases.
|
||||
//
|
||||
@@ -29,7 +33,7 @@ export async function* llama(prompt, params = {}, config = {}) {
|
||||
|
||||
const completionParams = { ...paramDefaults, ...params, prompt };
|
||||
|
||||
const response = await fetch(`${api_url}/completion`, {
|
||||
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify(completionParams),
|
||||
headers: {
|
||||
@@ -41,6 +45,18 @@ export async function* llama(prompt, params = {}, config = {}) {
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
const status = response.status;
|
||||
if (status !== 200) {
|
||||
try {
|
||||
const body = await response.json();
|
||||
if (body && body.error && body.error.message) {
|
||||
throw new CompletionError(body.error.message, 'ServerError');
|
||||
}
|
||||
} catch (err) {
|
||||
throw new CompletionError(err.message, 'ServerError');
|
||||
}
|
||||
}
|
||||
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
|
||||
@@ -78,7 +94,12 @@ export async function* llama(prompt, params = {}, config = {}) {
|
||||
for (const line of lines) {
|
||||
const match = regex.exec(line);
|
||||
if (match) {
|
||||
result[match[1]] = match[2]
|
||||
result[match[1]] = match[2];
|
||||
if (result.data === '[DONE]') {
|
||||
cont = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// since we know this is llama.cpp, let's just decode the json in data
|
||||
if (result.data) {
|
||||
result.data = JSON.parse(result.data);
|
||||
|
||||
+13
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
+654
-1285
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,209 @@
|
||||
const paramDefaults = {
|
||||
stream: true,
|
||||
n_predict: 500,
|
||||
temperature: 0.2,
|
||||
stop: ["</s>"]
|
||||
};
|
||||
|
||||
let generation_settings = null;
|
||||
|
||||
|
||||
// Completes the prompt as a generator. Recommended for most use cases.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// import { llama } from '/completion.js'
|
||||
//
|
||||
// const request = llama("Tell me a joke", {n_predict: 800})
|
||||
// for await (const chunk of request) {
|
||||
// document.write(chunk.data.content)
|
||||
// }
|
||||
//
|
||||
export async function* llama(prompt, params = {}, config = {}) {
|
||||
let controller = config.controller;
|
||||
const api_url = config.api_url?.replace(/\/+$/, '') || "";
|
||||
|
||||
if (!controller) {
|
||||
controller = new AbortController();
|
||||
}
|
||||
|
||||
const completionParams = { ...paramDefaults, ...params, prompt };
|
||||
|
||||
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify(completionParams),
|
||||
headers: {
|
||||
'Connection': 'keep-alive',
|
||||
'Content-Type': 'application/json',
|
||||
'Accept': 'text/event-stream',
|
||||
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
|
||||
},
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
|
||||
let content = "";
|
||||
let leftover = ""; // Buffer for partially read lines
|
||||
|
||||
try {
|
||||
let cont = true;
|
||||
|
||||
while (cont) {
|
||||
const result = await reader.read();
|
||||
if (result.done) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Add any leftover data to the current chunk of data
|
||||
const text = leftover + decoder.decode(result.value);
|
||||
|
||||
// Check if the last character is a line break
|
||||
const endsWithLineBreak = text.endsWith('\n');
|
||||
|
||||
// Split the text into lines
|
||||
let lines = text.split('\n');
|
||||
|
||||
// If the text doesn't end with a line break, then the last line is incomplete
|
||||
// Store it in leftover to be added to the next chunk of data
|
||||
if (!endsWithLineBreak) {
|
||||
leftover = lines.pop();
|
||||
} else {
|
||||
leftover = ""; // Reset leftover if we have a line break at the end
|
||||
}
|
||||
|
||||
// Parse all sse events and add them to result
|
||||
const regex = /^(\S+):\s(.*)$/gm;
|
||||
for (const line of lines) {
|
||||
const match = regex.exec(line);
|
||||
if (match) {
|
||||
result[match[1]] = match[2];
|
||||
if (result.data === '[DONE]') {
|
||||
cont = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// since we know this is llama.cpp, let's just decode the json in data
|
||||
if (result.data) {
|
||||
result.data = JSON.parse(result.data);
|
||||
content += result.data.content;
|
||||
|
||||
// yield
|
||||
yield result;
|
||||
|
||||
// if we got a stop token from server, we will break here
|
||||
if (result.data.stop) {
|
||||
if (result.data.generation_settings) {
|
||||
generation_settings = result.data.generation_settings;
|
||||
}
|
||||
cont = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (result.error) {
|
||||
try {
|
||||
result.error = JSON.parse(result.error);
|
||||
if (result.error.message.includes('slot unavailable')) {
|
||||
// Throw an error to be caught by upstream callers
|
||||
throw new Error('slot unavailable');
|
||||
} else {
|
||||
console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
|
||||
}
|
||||
} catch(e) {
|
||||
console.error(`llama.cpp error ${result.error}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
if (e.name !== 'AbortError') {
|
||||
console.error("llama error: ", e);
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
finally {
|
||||
controller.abort();
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
// Call llama, return an event target that you can subscribe to
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// import { llamaEventTarget } from '/completion.js'
|
||||
//
|
||||
// const conn = llamaEventTarget(prompt)
|
||||
// conn.addEventListener("message", (chunk) => {
|
||||
// document.write(chunk.detail.content)
|
||||
// })
|
||||
//
|
||||
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
|
||||
const eventTarget = new EventTarget();
|
||||
(async () => {
|
||||
let content = "";
|
||||
for await (const chunk of llama(prompt, params, config)) {
|
||||
if (chunk.data) {
|
||||
content += chunk.data.content;
|
||||
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
|
||||
}
|
||||
if (chunk.data.generation_settings) {
|
||||
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
|
||||
}
|
||||
if (chunk.data.timings) {
|
||||
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
|
||||
}
|
||||
}
|
||||
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
|
||||
})();
|
||||
return eventTarget;
|
||||
}
|
||||
|
||||
// Call llama, return a promise that resolves to the completed text. This does not support streaming
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// llamaPromise(prompt).then((content) => {
|
||||
// document.write(content)
|
||||
// })
|
||||
//
|
||||
// or
|
||||
//
|
||||
// const content = await llamaPromise(prompt)
|
||||
// document.write(content)
|
||||
//
|
||||
export const llamaPromise = (prompt, params = {}, config = {}) => {
|
||||
return new Promise(async (resolve, reject) => {
|
||||
let content = "";
|
||||
try {
|
||||
for await (const chunk of llama(prompt, params, config)) {
|
||||
content += chunk.data.content;
|
||||
}
|
||||
resolve(content);
|
||||
} catch (error) {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
/**
|
||||
* (deprecated)
|
||||
*/
|
||||
export const llamaComplete = async (params, controller, callback) => {
|
||||
for await (const chunk of llama(params.prompt, params, { controller })) {
|
||||
callback(chunk);
|
||||
}
|
||||
}
|
||||
|
||||
// Get the model info from the server. This is useful for getting the context window and so on.
|
||||
export const llamaModelInfo = async (config = {}) => {
|
||||
if (!generation_settings) {
|
||||
const api_url = config.api_url?.replace(/\/+$/, '') || "";
|
||||
const props = await fetch(`${api_url}/props`).then(r => r.json());
|
||||
generation_settings = props.default_generation_settings;
|
||||
}
|
||||
return generation_settings;
|
||||
}
|
||||
|
Before Width: | Height: | Size: 4.0 KiB After Width: | Height: | Size: 4.0 KiB |
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,12 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="refresh" content="5">
|
||||
</head>
|
||||
<body>
|
||||
<div id="loading">
|
||||
The model is loading. Please wait.<br/>
|
||||
The user interface will appear soon.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
+86
-124
@@ -14,22 +14,13 @@
|
||||
#define MIMETYPE_JSON "application/json; charset=utf-8"
|
||||
|
||||
// auto generated files (update with ./deps.sh)
|
||||
#include "colorthemes.css.hpp"
|
||||
#include "style.css.hpp"
|
||||
#include "theme-beeninorder.css.hpp"
|
||||
#include "theme-ketivah.css.hpp"
|
||||
#include "theme-mangotango.css.hpp"
|
||||
#include "theme-playground.css.hpp"
|
||||
#include "theme-polarnight.css.hpp"
|
||||
#include "theme-snowstorm.css.hpp"
|
||||
#include "index.html.hpp"
|
||||
#include "index-new.html.hpp"
|
||||
#include "index.js.hpp"
|
||||
#include "completion.js.hpp"
|
||||
#include "system-prompts.js.hpp"
|
||||
#include "prompt-formats.js.hpp"
|
||||
#include "json-schema-to-grammar.mjs.hpp"
|
||||
#include "loading.html.hpp"
|
||||
#include "deps_daisyui.min.css.hpp"
|
||||
#include "deps_markdown-it.js.hpp"
|
||||
#include "deps_tailwindcss.js.hpp"
|
||||
#include "deps_vue.esm-browser.js.hpp"
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
@@ -247,6 +238,7 @@ struct server_slot {
|
||||
if (is_processing()) {
|
||||
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
|
||||
|
||||
t_last_used = ggml_time_us();
|
||||
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
|
||||
state = SLOT_STATE_IDLE;
|
||||
callback_on_release(id);
|
||||
@@ -377,8 +369,8 @@ struct server_queue {
|
||||
std::condition_variable condition_tasks;
|
||||
|
||||
// callback functions
|
||||
std::function<void(server_task&)> callback_new_task;
|
||||
std::function<void(void)> callback_update_slots;
|
||||
std::function<void(server_task)> callback_new_task;
|
||||
std::function<void(void)> callback_update_slots;
|
||||
|
||||
// Add a new task to the end of the queue
|
||||
int post(server_task task, bool front = false) {
|
||||
@@ -430,7 +422,7 @@ struct server_queue {
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
void on_new_task(std::function<void(server_task &)> callback) {
|
||||
void on_new_task(std::function<void(server_task)> callback) {
|
||||
callback_new_task = std::move(callback);
|
||||
}
|
||||
|
||||
@@ -480,7 +472,7 @@ struct server_queue {
|
||||
lock.unlock();
|
||||
|
||||
QUE_DBG("processing task, id = %d\n", task.id);
|
||||
callback_new_task(task);
|
||||
callback_new_task(std::move(task));
|
||||
}
|
||||
|
||||
// all tasks in the current loop is processed, slots data is now ready
|
||||
@@ -643,17 +635,12 @@ struct server_context {
|
||||
bool load_model(const common_params & params_) {
|
||||
params = params_;
|
||||
|
||||
// reserve one extra sequence (seq_id == 0) for extra features
|
||||
params.n_parallel += 1;
|
||||
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
|
||||
model = llama_init.model;
|
||||
ctx = llama_init.context;
|
||||
loras = llama_init.lora_adapters;
|
||||
|
||||
params.n_parallel -= 1; // but be sneaky about it
|
||||
|
||||
if (model == nullptr) {
|
||||
SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
|
||||
return false;
|
||||
@@ -668,11 +655,16 @@ struct server_context {
|
||||
}
|
||||
|
||||
bool validate_model_chat_template() const {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
|
||||
const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
|
||||
|
||||
return res > 0;
|
||||
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
|
||||
std::string template_key = "tokenizer.chat_template";
|
||||
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
||||
if (res >= 0) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
std::string tmpl = std::string(model_template.data(), model_template.size());
|
||||
int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
return chat_res > 0;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void init() {
|
||||
@@ -725,12 +717,12 @@ struct server_context {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
server_slot * get_available_slot(const std::string & prompt) {
|
||||
server_slot * get_available_slot(const server_task & task) {
|
||||
server_slot * ret = nullptr;
|
||||
|
||||
// find the slot that has at least n% prompt similarity
|
||||
if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
|
||||
int max_lcp_len = 0;
|
||||
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
|
||||
int lcs_len = 0;
|
||||
float similarity = 0;
|
||||
|
||||
for (server_slot & slot : slots) {
|
||||
@@ -740,25 +732,26 @@ struct server_context {
|
||||
}
|
||||
|
||||
// skip the slot if it does not contains cached tokens
|
||||
if (slot.prompt_tokens.empty()) {
|
||||
if (slot.cache_tokens.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// length of the Longest Common Prefix between the current slot's prompt and the input prompt
|
||||
int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
|
||||
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
|
||||
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
|
||||
|
||||
// fraction of the common substring length compared to the current slot's prompt length
|
||||
similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
|
||||
// fraction of the common subsequence length compared to the current slot's prompt length
|
||||
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
|
||||
|
||||
// select the current slot if the criteria match
|
||||
if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
|
||||
max_lcp_len = lcp_len;
|
||||
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
|
||||
lcs_len = cur_lcs_len;
|
||||
similarity = cur_similarity;
|
||||
ret = &slot;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret != nullptr) {
|
||||
SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
|
||||
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1286,16 +1279,16 @@ struct server_context {
|
||||
|
||||
void send_embedding(const server_slot & slot, const llama_batch & batch) {
|
||||
server_task_result res;
|
||||
res.id = slot.id_task;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
res.id = slot.id_task;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
|
||||
const int n_embd = llama_n_embd(model);
|
||||
|
||||
std::vector<float> embd_res(n_embd, 0.0f);
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1330,12 +1323,12 @@ struct server_context {
|
||||
|
||||
void send_rerank(const server_slot & slot, const llama_batch & batch) {
|
||||
server_task_result res;
|
||||
res.id = slot.id_task;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
res.id = slot.id_task;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1508,24 +1501,13 @@ struct server_context {
|
||||
// Functions to process the task
|
||||
//
|
||||
|
||||
void process_single_task(const server_task & task) {
|
||||
void process_single_task(server_task task) {
|
||||
switch (task.type) {
|
||||
case SERVER_TASK_TYPE_INFERENCE:
|
||||
{
|
||||
const int id_slot = json_value(task.data, "id_slot", -1);
|
||||
|
||||
server_slot * slot;
|
||||
|
||||
if (id_slot != -1) {
|
||||
slot = get_slot_by_id(id_slot);
|
||||
} else {
|
||||
std::string prompt;
|
||||
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
|
||||
prompt = json_value(task.data, "prompt", std::string());
|
||||
}
|
||||
|
||||
slot = get_available_slot(prompt);
|
||||
}
|
||||
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
||||
|
||||
if (slot == nullptr) {
|
||||
// if no slot is available, we defer this task for processing later
|
||||
@@ -1575,11 +1557,11 @@ struct server_context {
|
||||
|
||||
for (server_slot & slot : slots) {
|
||||
json slot_data = get_formated_generation(slot);
|
||||
slot_data["id"] = slot.id;
|
||||
slot_data["id_task"] = slot.id_task;
|
||||
slot_data["state"] = slot.state;
|
||||
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
|
||||
slot_data["next_token"] = {
|
||||
slot_data["id"] = slot.id;
|
||||
slot_data["id_task"] = slot.id_task;
|
||||
slot_data["is_processing"] = slot.is_processing();
|
||||
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
|
||||
slot_data["next_token"] = {
|
||||
{"has_next_token", slot.has_next_token},
|
||||
{"has_new_line", slot.has_new_line},
|
||||
{"n_remain", slot.n_remaining},
|
||||
@@ -1590,10 +1572,10 @@ struct server_context {
|
||||
{"stopping_word", slot.stopping_word},
|
||||
};
|
||||
|
||||
if (slot_data["state"] == SLOT_STATE_IDLE) {
|
||||
n_idle_slots++;
|
||||
} else {
|
||||
if (slot.is_processing()) {
|
||||
n_processing_slots++;
|
||||
} else {
|
||||
n_idle_slots++;
|
||||
}
|
||||
|
||||
slots_data.push_back(slot_data);
|
||||
@@ -1655,7 +1637,7 @@ struct server_context {
|
||||
std::string filename = task.data.at("filename");
|
||||
std::string filepath = task.data.at("filepath");
|
||||
|
||||
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
|
||||
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
|
||||
|
||||
const int64_t t_end = ggml_time_us();
|
||||
const double t_save_ms = (t_end - t_start) / 1000.0;
|
||||
@@ -1697,7 +1679,7 @@ struct server_context {
|
||||
|
||||
slot->cache_tokens.resize(slot->n_ctx);
|
||||
size_t token_count = 0;
|
||||
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
|
||||
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
|
||||
if (nread == 0) {
|
||||
slot->cache_tokens.resize(0);
|
||||
send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
|
||||
@@ -1740,7 +1722,7 @@ struct server_context {
|
||||
|
||||
// Erase token cache
|
||||
const size_t n_erased = slot->cache_tokens.size();
|
||||
llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
|
||||
llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
|
||||
slot->cache_tokens.clear();
|
||||
|
||||
server_task_result result;
|
||||
@@ -1817,8 +1799,8 @@ struct server_context {
|
||||
|
||||
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
|
||||
|
||||
if (slot.params.cache_prompt) {
|
||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
|
||||
@@ -1845,7 +1827,7 @@ struct server_context {
|
||||
|
||||
slot.i_batch = batch.n_tokens;
|
||||
|
||||
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
|
||||
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
|
||||
|
||||
slot.n_past += 1;
|
||||
|
||||
@@ -1992,8 +1974,8 @@ struct server_context {
|
||||
|
||||
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
|
||||
llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
|
||||
llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
|
||||
|
||||
for (size_t i = 0; i < n_match; i++) {
|
||||
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
|
||||
@@ -2042,9 +2024,9 @@ struct server_context {
|
||||
}
|
||||
|
||||
// keep only the common part
|
||||
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
|
||||
if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
|
||||
// could not partially delete (likely using a non-Transformer model)
|
||||
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
|
||||
|
||||
// there is no common part left
|
||||
slot.n_past = 0;
|
||||
@@ -2057,7 +2039,7 @@ struct server_context {
|
||||
|
||||
// add prompt tokens for processing in the current batch
|
||||
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
|
||||
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
|
||||
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);
|
||||
|
||||
if (slot.params.cache_prompt) {
|
||||
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
|
||||
@@ -2299,16 +2281,6 @@ int main(int argc, char ** argv) {
|
||||
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
|
||||
|
||||
svr->set_default_headers({{"Server", "llama.cpp"}});
|
||||
|
||||
// CORS preflight
|
||||
svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) {
|
||||
// Access-Control-Allow-Origin is already set by middleware
|
||||
res.set_header("Access-Control-Allow-Credentials", "true");
|
||||
res.set_header("Access-Control-Allow-Methods", "POST");
|
||||
res.set_header("Access-Control-Allow-Headers", "*");
|
||||
return res.set_content("", "text/html"); // blank response, no data
|
||||
});
|
||||
|
||||
svr->set_logger(log_server_request);
|
||||
|
||||
auto res_error = [](httplib::Response & res, const json & error_data) {
|
||||
@@ -2421,6 +2393,14 @@ int main(int argc, char ** argv) {
|
||||
// register server middlewares
|
||||
svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
// If this is OPTIONS request, skip validation because browsers don't include Authorization header
|
||||
if (req.method == "OPTIONS") {
|
||||
res.set_header("Access-Control-Allow-Credentials", "true");
|
||||
res.set_header("Access-Control-Allow-Methods", "GET, POST");
|
||||
res.set_header("Access-Control-Allow-Headers", "*");
|
||||
res.set_content("", "text/html"); // blank response, no data
|
||||
return httplib::Server::HandlerResponse::Handled; // skip further processing
|
||||
}
|
||||
if (!middleware_server_state(req, res)) {
|
||||
return httplib::Server::HandlerResponse::Handled;
|
||||
}
|
||||
@@ -2714,8 +2694,8 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding || ctx_server.params.reranking) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -2820,8 +2800,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// TODO: maybe merge this function with "handle_completions_generic"
|
||||
const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding || ctx_server.params.reranking) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -2946,11 +2926,6 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
// TODO: somehow clean up this checks in the future
|
||||
if (!ctx_server.params.embedding || ctx_server.params.reranking) {
|
||||
res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings` and without `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
const json body = json::parse(req.body);
|
||||
bool is_openai = false;
|
||||
|
||||
@@ -3002,10 +2977,11 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
if (!ctx_server.params.reranking) {
|
||||
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
if (!ctx_server.params.reranking || ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
const json body = json::parse(req.body);
|
||||
|
||||
// TODO: implement
|
||||
@@ -3134,33 +3110,19 @@ int main(int argc, char ** argv) {
|
||||
// register static assets routes
|
||||
if (!params.public_path.empty()) {
|
||||
// Set the base directory for serving static files
|
||||
svr->set_base_dir(params.public_path);
|
||||
}
|
||||
|
||||
if (!params.api_keys.empty()) {
|
||||
// for now, if API key is set, web UI is unusable
|
||||
svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
|
||||
return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
|
||||
});
|
||||
bool is_found = svr->set_mount_point("/", params.public_path);
|
||||
if (!is_found) {
|
||||
LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
// using embedded static files
|
||||
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
|
||||
|
||||
// add new-ui files
|
||||
svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_daisyui.min.css", handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/deps_markdown-it.js", handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_tailwindcss.js", handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
|
||||
}
|
||||
|
||||
// register API routes
|
||||
|
||||
@@ -64,5 +64,5 @@ Feature: Security
|
||||
| localhost | Access-Control-Allow-Origin | localhost |
|
||||
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
|
||||
| origin | Access-Control-Allow-Credentials | true |
|
||||
| web.mydomain.fr | Access-Control-Allow-Methods | POST |
|
||||
| web.mydomain.fr | Access-Control-Allow-Methods | GET, POST |
|
||||
| web.mydomain.fr | Access-Control-Allow-Headers | * |
|
||||
|
||||
@@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
|
||||
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
|
||||
match expected_slot_status_string:
|
||||
case 'idle':
|
||||
expected_slot_status = 0
|
||||
expected_slot_status = False
|
||||
case 'busy':
|
||||
expected_slot_status = 1
|
||||
expected_slot_status = True
|
||||
case _:
|
||||
assert False, "unknown status"
|
||||
|
||||
expected_slots = [{'id': slot_id, 'state': expected_slot_status}
|
||||
expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
|
||||
for slot_id in range(context.n_slots)]
|
||||
await request_slots_status(context, expected_slots)
|
||||
|
||||
@@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
|
||||
if status_code == 503 and status_code == expected_http_status_code:
|
||||
return
|
||||
if status_code == 200 and status_code == expected_http_status_code:
|
||||
n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
|
||||
n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
|
||||
n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
|
||||
n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
|
||||
if ((slots_idle is None or slots_idle == n_slots_idle)
|
||||
and (slots_processing is None or slots_processing == n_slots_processing)):
|
||||
return
|
||||
|
||||
@@ -439,18 +439,60 @@ static std::string gen_chatcmplid() {
|
||||
// other common utils
|
||||
//
|
||||
|
||||
static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
|
||||
static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
|
||||
size_t i;
|
||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
static size_t longest_common_prefix(const std::string & a, const std::string & b) {
|
||||
size_t i;
|
||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||
static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
|
||||
// check for empty sequences
|
||||
if (a.empty() || b.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return i;
|
||||
// get the lengths of the input sequences
|
||||
size_t a_len = a.size();
|
||||
size_t b_len = b.size();
|
||||
|
||||
// initialize the maximum length of the longest common subsequence (LCS)
|
||||
size_t max_length = 0;
|
||||
|
||||
// use two rows instead of a 2D matrix to optimize space
|
||||
std::vector<size_t> prev_row(b_len + 1, 0);
|
||||
std::vector<size_t> curr_row(b_len + 1, 0);
|
||||
|
||||
// iterate through the elements of a
|
||||
for (size_t i = 1; i <= a_len; i++) {
|
||||
// iterate through the elements of b
|
||||
for (size_t j = 1; j <= b_len; j++) {
|
||||
// if elements at the current positions match
|
||||
if (a[i - 1] == b[j - 1]) {
|
||||
// if it's the first element of either sequences, set LCS length to 1
|
||||
if (i == 1 || j == 1) {
|
||||
curr_row[j] = 1;
|
||||
} else {
|
||||
// increment LCS length by 1 compared to the previous element
|
||||
curr_row[j] = prev_row[j - 1] + 1;
|
||||
}
|
||||
|
||||
// update max_length if necessary
|
||||
if (curr_row[j] > max_length) {
|
||||
max_length = curr_row[j];
|
||||
}
|
||||
} else {
|
||||
// reset LCS length if elements don't match
|
||||
curr_row[j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// update the previous row for the next iteration
|
||||
prev_row = curr_row;
|
||||
}
|
||||
|
||||
// return the maximum length of the LCS
|
||||
return max_length;
|
||||
}
|
||||
|
||||
static bool ends_with(const std::string & str, const std::string & suffix) {
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
set(TARGET llama-simple-chat)
|
||||
add_executable(${TARGET} simple-chat.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -0,0 +1,7 @@
|
||||
# llama.cpp/example/simple-chat
|
||||
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.
|
||||
|
||||
```bash
|
||||
./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
|
||||
...
|
||||
@@ -0,0 +1,197 @@
|
||||
#include "llama.h"
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
printf("\nexample usage:\n");
|
||||
printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::string model_path;
|
||||
int ngl = 99;
|
||||
int n_ctx = 2048;
|
||||
|
||||
// parse command line arguments
|
||||
for (int i = 1; i < argc; i++) {
|
||||
try {
|
||||
if (strcmp(argv[i], "-m") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
model_path = argv[++i];
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-c") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
n_ctx = std::stoi(argv[++i]);
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-ngl") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
ngl = std::stoi(argv[++i]);
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} catch (std::exception & e) {
|
||||
fprintf(stderr, "error: %s\n", e.what());
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (model_path.empty()) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// only print errors
|
||||
llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
|
||||
if (level >= GGML_LOG_LEVEL_ERROR) {
|
||||
fprintf(stderr, "%s", text);
|
||||
}
|
||||
}, nullptr);
|
||||
|
||||
// initialize the model
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
|
||||
if (!model) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// initialize the context
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = n_ctx;
|
||||
ctx_params.n_batch = n_ctx;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
if (!ctx) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// initialize the sampler
|
||||
llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
||||
|
||||
// helper function to evaluate a prompt and generate a response
|
||||
auto generate = [&](const std::string & prompt) {
|
||||
std::string response;
|
||||
|
||||
// tokenize the prompt
|
||||
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
|
||||
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
|
||||
GGML_ABORT("failed to tokenize the prompt\n");
|
||||
}
|
||||
|
||||
// prepare a batch for the prompt
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
llama_token new_token_id;
|
||||
while (true) {
|
||||
// check if we have enough space in the context to evaluate this batch
|
||||
int n_ctx = llama_n_ctx(ctx);
|
||||
int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
printf("\033[0m\n");
|
||||
fprintf(stderr, "context size exceeded\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
GGML_ABORT("failed to decode\n");
|
||||
}
|
||||
|
||||
// sample the next token
|
||||
new_token_id = llama_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// convert the token to a string, print it and add it to the response
|
||||
char buf[256];
|
||||
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
|
||||
if (n < 0) {
|
||||
GGML_ABORT("failed to convert token to piece\n");
|
||||
}
|
||||
std::string piece(buf, n);
|
||||
printf("%s", piece.c_str());
|
||||
fflush(stdout);
|
||||
response += piece;
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
}
|
||||
|
||||
return response;
|
||||
};
|
||||
|
||||
std::vector<llama_chat_message> messages;
|
||||
std::vector<char> formatted(llama_n_ctx(ctx));
|
||||
int prev_len = 0;
|
||||
while (true) {
|
||||
// get user input
|
||||
printf("\033[32m> \033[0m");
|
||||
std::string user;
|
||||
std::getline(std::cin, user);
|
||||
|
||||
if (user.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
// add the user input to the message list and format it
|
||||
messages.push_back({"user", strdup(user.c_str())});
|
||||
int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
|
||||
if (new_len > (int)formatted.size()) {
|
||||
formatted.resize(new_len);
|
||||
new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
|
||||
}
|
||||
if (new_len < 0) {
|
||||
fprintf(stderr, "failed to apply the chat template\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// remove previous messages to obtain the prompt to generate the response
|
||||
std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
|
||||
|
||||
// generate a response
|
||||
printf("\033[33m");
|
||||
std::string response = generate(prompt);
|
||||
printf("\n\033[0m");
|
||||
|
||||
// add the response to the messages
|
||||
messages.push_back({"assistant", strdup(response.c_str())});
|
||||
prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
|
||||
if (prev_len < 0) {
|
||||
fprintf(stderr, "failed to apply the chat template\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// free resources
|
||||
for (auto & msg : messages) {
|
||||
free(const_cast<char *>(msg.content));
|
||||
}
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -12,7 +12,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
||||
struct seq_draft {
|
||||
@@ -188,6 +188,8 @@ int main(int argc, char ** argv) {
|
||||
// draft sequence data
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.sparams.top_k = std::max(10, params.sparams.top_k);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
// allocate llama_sampler for each draft sequence
|
||||
drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
|
||||
@@ -267,11 +269,12 @@ int main(int argc, char ** argv) {
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
||||
p_tgt = dist_tgt.data[i].p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < dist_dft.size; i++) {
|
||||
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
|
||||
p_dft = dist_dft.data[i].p;
|
||||
}
|
||||
if (p_tgt && p_dft) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -345,6 +348,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<float> probs(dist_tgt.size);
|
||||
for (size_t i = 0; i < dist_tgt.size; ++i) {
|
||||
probs[i] = dist_tgt.data[i].p;
|
||||
LOG_DBG(" - %d: %f\n", dist_tgt.data[i].id, dist_tgt.data[i].p);
|
||||
}
|
||||
|
||||
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||
@@ -448,10 +452,13 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (drafts[0].smpl) {
|
||||
common_sampler_free(drafts[0].smpl);
|
||||
}
|
||||
drafts[0].smpl = common_sampler_clone(smpl);
|
||||
// TODO: this needs better fix - we want the draft samplers to have different parameters from the target sampler
|
||||
// so we should not copy the target sampler
|
||||
//if (drafts[0].smpl) {
|
||||
// common_sampler_free(drafts[0].smpl);
|
||||
//}
|
||||
//drafts[0].smpl = common_sampler_clone(smpl);
|
||||
common_sampler_reset(drafts[0].smpl);
|
||||
|
||||
int n_seq_cur = 1;
|
||||
int n_past_cur = n_past_dft;
|
||||
@@ -539,6 +546,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const int s = sa[is];
|
||||
|
||||
// only collect very high-confidence draft tokens
|
||||
if (cur_p->data[is].p < 0.90) {
|
||||
drafts[s].drafting = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
common_sampler_accept(drafts[s].smpl, id, true);
|
||||
|
||||
drafts[s].tokens.push_back(id);
|
||||
@@ -576,6 +589,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
// don't waste time on small batches
|
||||
if (batch_tgt.n_tokens < 5) {
|
||||
batch_tgt.n_tokens = 1;
|
||||
drafts[0].tokens.resize(batch_tgt.n_tokens);
|
||||
}
|
||||
|
||||
// evaluate the target model on the drafted tokens
|
||||
{
|
||||
llama_kv_cache_seq_keep(ctx_tgt, 0);
|
||||
|
||||
Generated
+10
-10
@@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1727826117,
|
||||
"narHash": "sha256-K5ZLCyfO/Zj9mPFldf3iwS6oZStJcU4tSpiXTMYaaL0=",
|
||||
"lastModified": 1730504689,
|
||||
"narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "3d04084d54bedc3d6b8b736c70ef449225c361b1",
|
||||
"rev": "506278e768c2a08bec68eb62932193e341f55c90",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1729665710,
|
||||
"narHash": "sha256-AlcmCXJZPIlO5dmFzV3V2XF6x/OpNWUV8Y/FMPGd8Z4=",
|
||||
"lastModified": 1730785428,
|
||||
"narHash": "sha256-Zwl8YgTVJTEum+L+0zVAWvXAGbWAuXHax3KzuejaDyo=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "2768c7d042a37de65bb1b5b3268fc987e534c49d",
|
||||
"rev": "4aa36568d413aca0ea84a1684d2d46f55dbabad7",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -36,14 +36,14 @@
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1727825735,
|
||||
"narHash": "sha256-0xHYkMkeLVQAMa7gvkddbPqpxph+hDzdu1XdGPJR+Os=",
|
||||
"lastModified": 1730504152,
|
||||
"narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
|
||||
+2
-1
@@ -153,6 +153,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
|
||||
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
||||
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
||||
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
||||
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
||||
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
|
||||
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
|
||||
@@ -218,12 +219,12 @@ include(CMakePackageConfigHelpers)
|
||||
# all public headers
|
||||
set(GGML_PUBLIC_HEADERS
|
||||
include/ggml.h
|
||||
include/ggml-cpu.h
|
||||
include/ggml-alloc.h
|
||||
include/ggml-backend.h
|
||||
include/ggml-blas.h
|
||||
include/ggml-cann.h
|
||||
include/ggml-cuda.h
|
||||
include/ggml.h
|
||||
include/ggml-kompute.h
|
||||
include/ggml-metal.h
|
||||
include/ggml-rpc.h
|
||||
|
||||
@@ -305,27 +305,10 @@ extern "C" {
|
||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
// Create a backend buffer from an existing pointer
|
||||
// CPU buffer types are always available
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef __cplusplus
|
||||
#error "This header is for C++ only"
|
||||
#endif
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include <memory>
|
||||
|
||||
// Smart pointers for ggml types
|
||||
|
||||
// ggml
|
||||
|
||||
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
|
||||
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
|
||||
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
|
||||
|
||||
// ggml-alloc
|
||||
|
||||
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
|
||||
|
||||
// ggml-backend
|
||||
|
||||
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
|
||||
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
|
||||
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
|
||||
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
|
||||
@@ -0,0 +1,150 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
struct ggml_threadpool * threadpool;
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
// numa strategies
|
||||
enum ggml_numa_strategy {
|
||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||
GGML_NUMA_STRATEGY_COUNT
|
||||
};
|
||||
|
||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
// TODO: move to backend interface
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
// get the sve vector length in bytes
|
||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
||||
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
|
||||
typedef void (*ggml_from_float_to_mat_t)
|
||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
struct ggml_type_traits_cpu {
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
};
|
||||
|
||||
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||
|
||||
GGML_API void ggml_cpu_init(void);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
+30
-148
@@ -217,7 +217,6 @@
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_PARAMS 2048
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_SRC 10
|
||||
#define GGML_MAX_N_THREADS 512
|
||||
#define GGML_MAX_OP_PARAMS 64
|
||||
@@ -510,7 +509,7 @@ extern "C" {
|
||||
GGML_OP_WIN_UNPART,
|
||||
GGML_OP_GET_REL_POS,
|
||||
GGML_OP_ADD_REL_POS,
|
||||
GGML_OP_RWKV_WKV,
|
||||
GGML_OP_RWKV_WKV6,
|
||||
|
||||
GGML_OP_UNARY,
|
||||
|
||||
@@ -559,10 +558,10 @@ extern "C" {
|
||||
|
||||
enum ggml_log_level {
|
||||
GGML_LOG_LEVEL_NONE = 0,
|
||||
GGML_LOG_LEVEL_INFO = 1,
|
||||
GGML_LOG_LEVEL_WARN = 2,
|
||||
GGML_LOG_LEVEL_ERROR = 3,
|
||||
GGML_LOG_LEVEL_DEBUG = 4,
|
||||
GGML_LOG_LEVEL_DEBUG = 1,
|
||||
GGML_LOG_LEVEL_INFO = 2,
|
||||
GGML_LOG_LEVEL_WARN = 3,
|
||||
GGML_LOG_LEVEL_ERROR = 4,
|
||||
GGML_LOG_LEVEL_CONT = 5, // continue previous log
|
||||
};
|
||||
|
||||
@@ -574,6 +573,13 @@ extern "C" {
|
||||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||
};
|
||||
|
||||
struct ggml_init_params {
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
@@ -619,66 +625,6 @@ extern "C" {
|
||||
// If it returns true, the computation is aborted
|
||||
typedef bool (*ggml_abort_callback)(void * data);
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
struct ggml_threadpool * threadpool;
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
// scratch buffer
|
||||
struct ggml_scratch {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
void * data;
|
||||
};
|
||||
|
||||
struct ggml_init_params {
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
// numa strategies
|
||||
enum ggml_numa_strategy {
|
||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||
GGML_NUMA_STRATEGY_COUNT
|
||||
};
|
||||
|
||||
//
|
||||
// GUID
|
||||
@@ -701,9 +647,6 @@ extern "C" {
|
||||
// accepts a UTF-8 path, even on Windows
|
||||
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
||||
|
||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||
|
||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
@@ -760,12 +703,12 @@ extern "C" {
|
||||
|
||||
// main
|
||||
|
||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
GGML_API void ggml_free(struct ggml_context * ctx);
|
||||
GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
|
||||
GGML_API void ggml_reset(struct ggml_context * ctx);
|
||||
GGML_API void ggml_free (struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
|
||||
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
||||
|
||||
@@ -805,8 +748,7 @@ extern "C" {
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
||||
@@ -816,35 +758,25 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
// Converts a flat index into coordinates
|
||||
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
||||
|
||||
// Tensor flags
|
||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// operations on tensors with backpropagation
|
||||
//
|
||||
@@ -1814,6 +1746,9 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec);
|
||||
|
||||
GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
|
||||
const struct ggml_tensor * a);
|
||||
|
||||
// TODO: needs to be adapted to ggml_flash_attn_ext
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1887,7 +1822,7 @@ extern "C" {
|
||||
struct ggml_tensor * pw,
|
||||
struct ggml_tensor * ph);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
|
||||
GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
@@ -2060,9 +1995,6 @@ extern "C" {
|
||||
// automatic differentiation
|
||||
//
|
||||
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
|
||||
|
||||
@@ -2094,27 +2026,6 @@ extern "C" {
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||
|
||||
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
||||
@@ -2285,6 +2196,8 @@ extern "C" {
|
||||
} lbfgs;
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
||||
|
||||
// optimize the function defined by the tensor f
|
||||
@@ -2316,12 +2229,6 @@ extern "C" {
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data);
|
||||
|
||||
//
|
||||
// tensor flags
|
||||
//
|
||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// quantization
|
||||
//
|
||||
@@ -2490,8 +2397,6 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||
GGML_API int ggml_cpu_has_amx_int8 (void);
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
@@ -2508,17 +2413,9 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_rpc (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
GGML_API int ggml_cpu_has_cann (void);
|
||||
GGML_API int ggml_cpu_has_llamafile (void);
|
||||
|
||||
// get the sve vector length in bytes
|
||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
||||
|
||||
//
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
@@ -2527,14 +2424,6 @@ extern "C" {
|
||||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_to_mat_t)
|
||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
struct ggml_type_traits {
|
||||
const char * type_name;
|
||||
@@ -2545,13 +2434,6 @@ extern "C" {
|
||||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_t from_float_ref;
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
};
|
||||
|
||||
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
||||
|
||||
+16
-4
@@ -58,6 +58,10 @@ if (GGML_METAL)
|
||||
add_compile_definitions(GGML_METAL_NDEBUG)
|
||||
endif()
|
||||
|
||||
if (GGML_METAL_USE_BF16)
|
||||
add_compile_definitions(GGML_METAL_USE_BF16)
|
||||
endif()
|
||||
|
||||
# copy ggml-common.h and ggml-metal.metal to bin directory
|
||||
configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
@@ -1261,8 +1265,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
message(STATUS "PowerPC detected")
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
|
||||
OUTPUT_VARIABLE POWER10_M)
|
||||
string(FIND ${POWER10_M} "POWER10" substring_index)
|
||||
if(${substring_index} GREATER_EQUAL 0)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
||||
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
||||
@@ -1366,9 +1375,12 @@ endif()
|
||||
|
||||
add_library(ggml
|
||||
../include/ggml.h
|
||||
../include/ggml-cpu.h
|
||||
../include/ggml-alloc.h
|
||||
../include/ggml-backend.h
|
||||
../include/ggml-cpp.h
|
||||
ggml.c
|
||||
ggml-cpu.c
|
||||
ggml-alloc.c
|
||||
ggml-backend.cpp
|
||||
ggml-quants.c
|
||||
@@ -1393,7 +1405,7 @@ if (EMSCRIPTEN)
|
||||
endif()
|
||||
|
||||
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
|
||||
target_include_directories(ggml PUBLIC ../include)
|
||||
target_include_directories(ggml PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
||||
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
|
||||
target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
|
||||
target_compile_features (ggml PRIVATE c_std_11) # don't bump
|
||||
@@ -1402,7 +1414,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
|
||||
|
||||
find_library(MATH_LIBRARY m)
|
||||
if (MATH_LIBRARY)
|
||||
if (NOT WIN32 OR NOT GGML_SYCL)
|
||||
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
+628
-611
File diff suppressed because it is too large
Load Diff
@@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
||||
|
||||
return buffer;
|
||||
|
||||
+13834
File diff suppressed because it is too large
Load Diff
+17
-6
@@ -36,7 +36,7 @@
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
#include "ggml-cuda/unary.cuh"
|
||||
#include "ggml-cuda/upscale.cuh"
|
||||
#include "ggml-cuda/rwkv-wkv.cuh"
|
||||
#include "ggml-cuda/wkv6.cuh"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
||||
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
|
||||
if (err != cudaErrorPeerAccessAlreadyEnabled) {
|
||||
CUDA_CHECK(err);
|
||||
} else {
|
||||
// reset the error
|
||||
cudaGetLastError();
|
||||
}
|
||||
} else {
|
||||
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
||||
if (err != cudaErrorPeerAccessNotEnabled) {
|
||||
CUDA_CHECK(err);
|
||||
} else {
|
||||
// reset the error
|
||||
cudaGetLastError();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2313,8 +2319,8 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
ggml_cuda_cross_entropy_loss(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_RWKV_WKV:
|
||||
ggml_cuda_op_rwkv_wkv(ctx, dst);
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
ggml_cuda_op_rwkv_wkv6(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
ggml_cuda_cross_entropy_loss_back(ctx, dst);
|
||||
@@ -3107,18 +3113,20 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
|
||||
break;
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
@@ -3145,12 +3153,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_RWKV_WKV:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT: {
|
||||
#ifndef FLASH_ATTN_AVAILABLE
|
||||
return false;
|
||||
#endif
|
||||
if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
|
||||
const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
|
||||
const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
|
||||
|
||||
CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
|
||||
|
||||
|
||||
@@ -13,9 +13,9 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
if (precision != GGML_PREC_DEFAULT) {
|
||||
if (prec != GGML_PREC_DEFAULT) {
|
||||
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
||||
constexpr int cols_per_block = 16;
|
||||
switch (Q->ne[0]) {
|
||||
@@ -301,11 +301,11 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (cc >= CC_OFFSET_AMD) {
|
||||
if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
@@ -332,7 +332,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
||||
if (precision == GGML_PREC_DEFAULT) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
return;
|
||||
} else if(Q->ne[0] <= 128) {
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_WKV_BLOCK_SIZE 64
|
||||
|
||||
void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "common.cuh"
|
||||
#include "rwkv-wkv.cuh"
|
||||
#include "wkv6.cuh"
|
||||
|
||||
static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) {
|
||||
const int tid = threadIdx.x;
|
||||
@@ -64,7 +64,7 @@ static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const float * k_d = (const float *)dst->src[0]->data;
|
||||
const float * v_d = (const float *)dst->src[1]->data;
|
||||
const float * r_d = (const float *)dst->src[2]->data;
|
||||
@@ -83,7 +83,7 @@ void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(C % H == 0);
|
||||
GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE);
|
||||
GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE); // The current cuda kernel is designed for RWKV6, HEAD_SIZE == 64
|
||||
|
||||
rwkv_wkv_f32<<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_WKV_BLOCK_SIZE 64
|
||||
|
||||
void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -36,6 +37,20 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static inline int ggml_up32(int n) {
|
||||
return (n + 31) & ~31;
|
||||
}
|
||||
|
||||
//static inline int ggml_up64(int n) {
|
||||
// return (n + 63) & ~63;
|
||||
//}
|
||||
|
||||
static inline int ggml_up(int n, int m) {
|
||||
// assert m is a power of 2
|
||||
GGML_ASSERT((m & (m - 1)) == 0);
|
||||
return (n + m - 1) & ~(m - 1);
|
||||
}
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
@@ -51,6 +66,74 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
|
||||
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
||||
|
||||
#define GGML_DEBUG 0
|
||||
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 5)
|
||||
#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_5(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 10)
|
||||
#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_10(...)
|
||||
#endif
|
||||
|
||||
// tensor params
|
||||
|
||||
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
||||
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
|
||||
assert(params_size <= GGML_MAX_OP_PARAMS);
|
||||
memcpy(tensor->op_params, params, params_size);
|
||||
}
|
||||
|
||||
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||
return ((const int32_t *)(tensor->op_params))[i];
|
||||
}
|
||||
|
||||
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
||||
return ((const float *)(tensor->op_params))[i];
|
||||
}
|
||||
|
||||
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||
((int32_t *)(tensor->op_params))[i] = value;
|
||||
}
|
||||
|
||||
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
||||
((float *)(tensor->op_params))[i] = value;
|
||||
}
|
||||
|
||||
struct ggml_map_custom1_op_params {
|
||||
ggml_custom1_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_map_custom2_op_params {
|
||||
ggml_custom2_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_map_custom3_op_params {
|
||||
ggml_custom3_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
@@ -204,6 +287,10 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
void * ggml_aligned_malloc(size_t size);
|
||||
void ggml_aligned_free(void * ptr, size_t size);
|
||||
|
||||
// TODO: move to threading file
|
||||
void ggml_critical_section_start(void);
|
||||
void ggml_critical_section_end(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
+558
-194
File diff suppressed because it is too large
Load Diff
+1126
-798
File diff suppressed because it is too large
Load Diff
+38
-51
@@ -4,7 +4,7 @@
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
@@ -9104,10 +9104,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
#elif defined __AVX__
|
||||
|
||||
const __m128i m4 = _mm_set1_epi8(0xF);
|
||||
const __m128i m3 = _mm_set1_epi8(3);
|
||||
const __m128i m32s = _mm_set1_epi8(32);
|
||||
const __m128i m2 = _mm_set1_epi8(2);
|
||||
const __m128i m15 = _mm_set1_epi8(15);
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
@@ -9119,12 +9117,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
// handle the q6_k -32 offset separately using bsums
|
||||
const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
|
||||
const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
|
||||
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
|
||||
const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
|
||||
const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
|
||||
const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
|
||||
const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
|
||||
|
||||
__m128i sumi_0 = _mm_setzero_si128();
|
||||
__m128i sumi_1 = _mm_setzero_si128();
|
||||
|
||||
__m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
|
||||
int is = 0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
|
||||
const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
|
||||
@@ -9132,26 +9138,26 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
|
||||
const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
|
||||
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
|
||||
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
|
||||
const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
|
||||
const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
|
||||
const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
|
||||
const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
|
||||
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
|
||||
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
|
||||
const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
|
||||
const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
|
||||
const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
|
||||
const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
|
||||
|
||||
const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
||||
const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
||||
const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
||||
const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
||||
|
||||
const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0);
|
||||
const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1);
|
||||
const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
|
||||
const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
|
||||
const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4);
|
||||
const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5);
|
||||
const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
|
||||
const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
|
||||
const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
|
||||
const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
|
||||
const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
|
||||
const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
|
||||
const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
|
||||
const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
|
||||
const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
|
||||
const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
|
||||
|
||||
const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
||||
const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
||||
@@ -9162,15 +9168,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||
const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
||||
const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
||||
|
||||
__m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0);
|
||||
__m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1);
|
||||
__m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
|
||||
__m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
|
||||
__m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4);
|
||||
__m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5);
|
||||
__m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
|
||||
__m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
|
||||
|
||||
__m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
|
||||
__m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
|
||||
__m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
|
||||
@@ -9180,32 +9177,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||
__m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
|
||||
__m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
|
||||
|
||||
p16_0 = _mm_sub_epi16(p16_0, q8s_0);
|
||||
p16_1 = _mm_sub_epi16(p16_1, q8s_1);
|
||||
p16_2 = _mm_sub_epi16(p16_2, q8s_2);
|
||||
p16_3 = _mm_sub_epi16(p16_3, q8s_3);
|
||||
p16_4 = _mm_sub_epi16(p16_4, q8s_4);
|
||||
p16_5 = _mm_sub_epi16(p16_5, q8s_5);
|
||||
p16_6 = _mm_sub_epi16(p16_6, q8s_6);
|
||||
p16_7 = _mm_sub_epi16(p16_7, q8s_7);
|
||||
|
||||
const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
|
||||
shuffle = _mm_add_epi8(shuffle, m2);
|
||||
const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
|
||||
shuffle = _mm_add_epi8(shuffle, m2);
|
||||
const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
|
||||
shuffle = _mm_add_epi8(shuffle, m2);
|
||||
const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
|
||||
shuffle = _mm_add_epi8(shuffle, m2);
|
||||
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
|
||||
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
|
||||
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
|
||||
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
|
||||
is += 4;
|
||||
|
||||
p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
|
||||
p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
|
||||
p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
|
||||
p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
|
||||
p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
|
||||
p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
|
||||
p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
|
||||
p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
|
||||
p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
|
||||
p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
|
||||
p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
|
||||
p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
|
||||
|
||||
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
|
||||
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
|
||||
@@ -9214,8 +9199,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
}
|
||||
|
||||
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
|
||||
sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
|
||||
sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
|
||||
const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
@@ -1296,13 +1296,6 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
UNUSED(dev);
|
||||
UNUSED(max_tensor_size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
UNUSED(dev);
|
||||
UNUSED(op);
|
||||
@@ -1328,7 +1321,7 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
||||
/* .init_backend = */ ggml_backend_rpc_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_rpc_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
|
||||
+179
-946
File diff suppressed because it is too large
Load Diff
@@ -26,5 +26,8 @@
|
||||
#include "softmax.hpp"
|
||||
#include "tsembd.hpp"
|
||||
#include "im2col.hpp"
|
||||
#include "wkv6.hpp"
|
||||
#include "outprod.hpp"
|
||||
#include "element_wise.hpp"
|
||||
|
||||
#endif // GGML_SYCL_BACKEND_HPP
|
||||
|
||||
@@ -62,3 +62,43 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
|
||||
}
|
||||
return sycl_down_blk_size;
|
||||
}
|
||||
|
||||
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const ggml_sycl_op_flatten_t op) try {
|
||||
const int64_t nrows0 = ggml_nrows(src0);
|
||||
|
||||
const bool use_src1 = src1 != nullptr;
|
||||
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
||||
|
||||
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
// dd = data device
|
||||
float * src0_ddf = (float *) src0->data;
|
||||
float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
|
||||
ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
|
||||
ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
|
||||
ggml_sycl_pool_alloc<float> dst_f(ctx.pool());
|
||||
|
||||
ggml_sycl_set_device(ctx.device);
|
||||
queue_ptr main_stream = ctx.stream();
|
||||
// GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
|
||||
// ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
|
||||
|
||||
// do the computation
|
||||
op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
|
||||
// print_ggml_tensor("tensor", dst);
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
@@ -404,4 +404,262 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
|
||||
|
||||
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
|
||||
|
||||
typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1,
|
||||
ggml_tensor *dst, const float *src0_dd,
|
||||
const float *src1_dd, float *dst_dd,
|
||||
const queue_ptr &main_stream);
|
||||
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1));
|
||||
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) /
|
||||
ne3;
|
||||
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
|
||||
item_ct1.get_local_id(0)) %
|
||||
ne3;
|
||||
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i_src0;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
for (int i0 = i0s; i0 < ne0;
|
||||
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
}
|
||||
|
||||
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
const int i3 = i/(ne2*ne1*ne0);
|
||||
const int i2 = (i/(ne1*ne0)) % ne2;
|
||||
const int i1 = (i/ne0) % ne1;
|
||||
const int i0 = i % ne0;
|
||||
|
||||
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = i1 % ne11;
|
||||
const int i12 = i2 % ne12;
|
||||
const int i13 = i3 % ne13;
|
||||
|
||||
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i_src0;
|
||||
|
||||
const src0_t * src0_row = src0 + i_src0;
|
||||
const src1_t * src1_row = src1 + i_src1;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
}
|
||||
|
||||
|
||||
template<float (*bin_op)(const float, const float)>
|
||||
struct bin_bcast_sycl {
|
||||
template <typename src0_t, typename src1_t, typename dst_t>
|
||||
void operator()(ggml_backend_sycl_context & ctx,
|
||||
const struct ggml_tensor *src0,
|
||||
const struct ggml_tensor *src1, struct ggml_tensor *dst,
|
||||
const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
|
||||
queue_ptr stream) {
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
int nr0 = ne10/ne0;
|
||||
int nr1 = ne11/ne1;
|
||||
int nr2 = ne12/ne2;
|
||||
int nr3 = ne13/ne3;
|
||||
|
||||
int nr[4] = { nr0, nr1, nr2, nr3 };
|
||||
|
||||
// collapse dimensions until first broadcast dimension
|
||||
int64_t cne0[] = {ne0, ne1, ne2, ne3};
|
||||
int64_t cne1[] = {ne10, ne11, ne12, ne13};
|
||||
size_t cnb0[] = {nb0, nb1, nb2, nb3};
|
||||
size_t cnb1[] = {nb10, nb11, nb12, nb13};
|
||||
auto collapse = [](int64_t cne[]) {
|
||||
cne[0] *= cne[1];
|
||||
cne[1] = cne[2];
|
||||
cne[2] = cne[3];
|
||||
cne[3] = 1;
|
||||
};
|
||||
|
||||
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
|
||||
cnb[1] *= cne[1];
|
||||
cnb[2] *= cne[2];
|
||||
cnb[3] *= cne[3];
|
||||
};
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
}
|
||||
if (i > 0) {
|
||||
collapse_nb(cnb0, cne0);
|
||||
collapse_nb(cnb1, cne1);
|
||||
collapse(cne0);
|
||||
collapse(cne1);
|
||||
}
|
||||
}
|
||||
{
|
||||
int64_t ne0 = cne0[0];
|
||||
int64_t ne1 = cne0[1];
|
||||
int64_t ne2 = cne0[2];
|
||||
int64_t ne3 = cne0[3];
|
||||
|
||||
int64_t ne10 = cne1[0];
|
||||
int64_t ne11 = cne1[1];
|
||||
int64_t ne12 = cne1[2];
|
||||
int64_t ne13 = cne1[3];
|
||||
|
||||
size_t nb0 = cnb0[0];
|
||||
size_t nb1 = cnb0[1];
|
||||
size_t nb2 = cnb0[2];
|
||||
size_t nb3 = cnb0[3];
|
||||
|
||||
size_t nb10 = cnb1[0];
|
||||
size_t nb11 = cnb1[1];
|
||||
size_t nb12 = cnb1[2];
|
||||
size_t nb13 = cnb1[3];
|
||||
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
|
||||
size_t s10 = nb10 / sizeof(src1_t);
|
||||
size_t s11 = nb11 / sizeof(src1_t);
|
||||
size_t s12 = nb12 / sizeof(src1_t);
|
||||
size_t s13 = nb13 / sizeof(src1_t);
|
||||
|
||||
GGML_ASSERT(s0 == 1);
|
||||
GGML_ASSERT(s10 == 1);
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
||||
|
||||
sycl::range<3> block_dims(1, 1, 1);
|
||||
block_dims[2] = std::min<unsigned int>(hne0, block_size);
|
||||
block_dims[1] = std::min<unsigned int>(
|
||||
ne1, block_size / (unsigned int)block_dims[2]);
|
||||
block_dims[0] = std::min(
|
||||
std::min<unsigned int>(
|
||||
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
|
||||
(unsigned int)block_dims[1]),
|
||||
64U);
|
||||
|
||||
sycl::range<3> block_nums(
|
||||
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
|
||||
(ne1 + block_dims[1] - 1) / block_dims[1],
|
||||
(hne0 + block_dims[2] - 1) / block_dims[2]);
|
||||
|
||||
if (block_nums[0] > 65535) {
|
||||
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
|
||||
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
|
||||
sycl::range<3>(1, 1, block_size),
|
||||
sycl::range<3>(1, 1, block_size)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast_unravel<bin_op>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
|
||||
s13, item_ct1);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
DPCT1049:16: The work-group size passed to the SYCL kernel may
|
||||
exceed the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if
|
||||
needed.
|
||||
*/
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
||||
ne2, ne3, ne10, ne11, ne12, ne13,
|
||||
s1, s2, s3, s11, s12, s13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <class op>
|
||||
inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd,
|
||||
float *dst_dd,
|
||||
const queue_ptr &main_stream) {
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
|
||||
(sycl::half *)dst_dd, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
|
||||
op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
|
||||
op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
|
||||
main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const ggml_sycl_op_flatten_t op);
|
||||
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
||||
@@ -106,6 +106,7 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
||||
concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
|
||||
});
|
||||
break;
|
||||
// dim >=2 will be dispatched to the default path
|
||||
default:
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,76 @@
|
||||
#ifndef GGML_SYCL_ELEMENTWISE_HPP
|
||||
#define GGML_SYCL_ELEMENTWISE_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
GGML_UNUSED(a);
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_add(const float a, const float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_sub(const float a, const float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_mul(const float a, const float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_div(const float a, const float b) {
|
||||
return a / b;
|
||||
}
|
||||
|
||||
|
||||
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_ELEMENTWISE_HPP
|
||||
@@ -8,7 +8,6 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
|
||||
|
||||
const int nthreads = item_ct1.get_local_range(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
assert(nwarps % WARP_SIZE == 0);
|
||||
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
@@ -55,7 +54,6 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
|
||||
int end = start + group_size;
|
||||
const int nthreads = item_ct1.get_local_range(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
assert(nwarps % WARP_SIZE == 0);
|
||||
start += item_ct1.get_local_id(2);
|
||||
int nreduce = nwarps / WARP_SIZE;
|
||||
|
||||
@@ -144,7 +142,6 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int nthreads = item_ct1.get_local_range(2);
|
||||
const int nwarps = nthreads / WARP_SIZE;
|
||||
assert(nwarps % WARP_SIZE == 0);
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
@@ -202,6 +199,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
}
|
||||
else {
|
||||
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
||||
assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
|
||||
@@ -244,6 +242,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
||||
}
|
||||
else {
|
||||
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
||||
assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
|
||||
@@ -290,6 +289,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
}
|
||||
else {
|
||||
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
||||
assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||
/*
|
||||
DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <oneapi/mkl.hpp>
|
||||
#include "outprod.hpp"
|
||||
|
||||
|
||||
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||
const ggml_tensor* src1, ggml_tensor* dst) {
|
||||
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// Get SYCL queue
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
|
||||
// Dimension checks
|
||||
GGML_ASSERT(ne01 == ne11); // Inner dimensions must match
|
||||
GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows
|
||||
GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols
|
||||
|
||||
// Get data pointers
|
||||
const float* src0_d = (const float*)src0->data;
|
||||
const float* src1_d = (const float*)src1->data;
|
||||
float* dst_d = (float*)dst->data;
|
||||
|
||||
// GEMM parameters
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
|
||||
// Handle transposition of src1
|
||||
const bool src1_T = ggml_is_transposed(src1);
|
||||
const oneapi::mkl::transpose src1_op =
|
||||
src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
|
||||
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
|
||||
|
||||
try {
|
||||
// Perform matrix multiplication using oneMKL GEMM
|
||||
oneapi::mkl::blas::column_major::gemm(*stream,
|
||||
oneapi::mkl::transpose::nontrans, src1_op,
|
||||
ne0, ne1, ne01,
|
||||
alpha,
|
||||
src0_d, ne00,
|
||||
src1_d, ldb,
|
||||
beta,
|
||||
dst_d, ne0);
|
||||
}
|
||||
catch (sycl::exception const& exc) {
|
||||
std::cerr << exc.what() << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
#ifndef GGML_SYCL_OUTPROD_HPP
|
||||
#define GGML_SYCL_OUTPROD_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||
const ggml_tensor* src1, ggml_tensor* dst);
|
||||
|
||||
|
||||
#endif // GGML_SYCL_OUTPROD_HPP
|
||||
|
||||
@@ -25,6 +25,11 @@
|
||||
#define SYCL_RELU_BLOCK_SIZE 256
|
||||
#define SYCL_HARDSIGMOID_BLOCK_SIZE 256
|
||||
#define SYCL_HARDSWISH_BLOCK_SIZE 256
|
||||
#define SYCL_EXP_BLOCK_SIZE 256
|
||||
#define SYCL_NEG_BLOCK_SIZE 256
|
||||
#define SYCL_SIGMOID_BLOCK_SIZE 256
|
||||
#define SYCL_SQRT_BLOCK_SIZE 256
|
||||
#define SYCL_SIN_BLOCK_SIZE 256
|
||||
#define SYCL_SQR_BLOCK_SIZE 256
|
||||
#define SYCL_CPY_BLOCK_SIZE 32
|
||||
#define SYCL_SCALE_BLOCK_SIZE 256
|
||||
@@ -41,6 +46,7 @@
|
||||
#define SYCL_ACC_BLOCK_SIZE 256
|
||||
#define SYCL_IM2COL_BLOCK_SIZE 256
|
||||
#define SYCL_POOL2D_BLOCK_SIZE 256
|
||||
#define SYCL_ARGMAX_BLOCK_SIZE 256
|
||||
#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
|
||||
#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
|
||||
|
||||
|
||||
@@ -0,0 +1,138 @@
|
||||
#include <sycl/sycl.hpp>
|
||||
#include "wkv6.hpp"
|
||||
|
||||
constexpr int WKV_BLOCK_SIZE = 64; // Matching CUDA_WKV_BLOCK_SIZE
|
||||
|
||||
// Helper function for the main kernel
|
||||
static void rwkv_wkv_f32_kernel(
|
||||
const int B, const int T, const int C, const int H,
|
||||
const float* k, const float* v, const float* r,
|
||||
const float* tf, const float* td, const float* s,
|
||||
float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) {
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int bid = item_ct1.get_group(2);
|
||||
|
||||
const int head_size = WKV_BLOCK_SIZE;
|
||||
const int batch_i = bid / H;
|
||||
const int head_i = bid % H;
|
||||
const int state_size = C * head_size;
|
||||
const int n_seq_tokens = T / B;
|
||||
|
||||
// Set up shared memory pointers
|
||||
float* _k = shared_mem;
|
||||
float* _r = _k + head_size;
|
||||
float* _tf = _r + head_size;
|
||||
float* _td = _tf + head_size;
|
||||
|
||||
// Local state array
|
||||
float state[WKV_BLOCK_SIZE];
|
||||
|
||||
// Load initial state
|
||||
#pragma unroll
|
||||
for (int i = 0; i < head_size; i++) {
|
||||
state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
|
||||
}
|
||||
|
||||
// Sync threads before shared memory operations
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
// Load time-mixing parameters
|
||||
_tf[tid] = tf[head_i * head_size + tid];
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
// Main sequence processing loop
|
||||
for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
|
||||
t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid;
|
||||
t += C) {
|
||||
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
// Load current timestep data to shared memory
|
||||
_k[tid] = k[t];
|
||||
_r[tid] = r[t];
|
||||
_td[tid] = td[t];
|
||||
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
const float _v = v[t];
|
||||
float y = 0;
|
||||
|
||||
// Process in chunks of 4 for better vectorization
|
||||
sycl::float4 k4, r4, tf4, td4, s4, kv4;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < head_size; j += 4) {
|
||||
// Load data in vec4 chunks
|
||||
k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
|
||||
r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
|
||||
tf4 = sycl::float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
|
||||
td4 = sycl::float4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
|
||||
s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]);
|
||||
|
||||
// Compute key-value product
|
||||
sycl::float4 kv4 = k4 * _v;
|
||||
|
||||
// Accumulate weighted sum
|
||||
y += sycl::dot(r4, tf4 * kv4 + s4);
|
||||
|
||||
// Update state
|
||||
s4 = s4 * td4 + kv4;
|
||||
|
||||
// Store updated state
|
||||
state[j] = s4.x();
|
||||
state[j+1] = s4.y();
|
||||
state[j+2] = s4.z();
|
||||
state[j+3] = s4.w();
|
||||
}
|
||||
|
||||
dst[t] = y;
|
||||
}
|
||||
|
||||
// Save final state
|
||||
#pragma unroll
|
||||
for (int i = 0; i < head_size; i++) {
|
||||
dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||
const ggml_tensor* src1, ggml_tensor* dst) {
|
||||
|
||||
const float* k_d = (const float*)dst->src[0]->data;
|
||||
const float* v_d = (const float*)dst->src[1]->data;
|
||||
const float* r_d = (const float*)dst->src[2]->data;
|
||||
const float* tf_d = (const float*)dst->src[3]->data;
|
||||
const float* td_d = (const float*)dst->src[4]->data;
|
||||
const float* s_d = (const float*)dst->src[5]->data;
|
||||
float* dst_d = (float*)dst->data;
|
||||
|
||||
const int64_t B = dst->src[5]->ne[1];
|
||||
const int64_t T = dst->src[0]->ne[3];
|
||||
const int64_t C = dst->ne[0];
|
||||
const int64_t H = dst->src[0]->ne[2];
|
||||
|
||||
GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(C % H == 0);
|
||||
GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64
|
||||
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
|
||||
// Calculate execution configuration
|
||||
const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td
|
||||
sycl::range<3> block_dims(1, 1, C / H);
|
||||
sycl::range<3> grid_dims(1, 1, B * H);
|
||||
|
||||
// Submit kernel
|
||||
stream->submit([&](sycl::handler& cgh) {
|
||||
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
rwkv_wkv_f32_kernel(
|
||||
B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
|
||||
item_ct1, shared_mem_acc.get_pointer()
|
||||
);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
#ifndef GGML_SYCL_WKV6_HPP
|
||||
#define GGML_SYCL_WKV6_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor * dst);
|
||||
|
||||
|
||||
#endif // GGML_SYCL_WKV6_HPP
|
||||
+221
-482
File diff suppressed because it is too large
Load Diff
+145
-15275
File diff suppressed because it is too large
Load Diff
@@ -106,6 +106,10 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
|
||||
inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
|
||||
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
|
||||
#if defined(__MMA__)
|
||||
typedef vector unsigned char vec_t;
|
||||
typedef __vector_quad acc_t;
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// VECTORIZED FUSED MULTIPLY ADD
|
||||
|
||||
@@ -1026,6 +1030,600 @@ class tinyBLAS_Q0_AVX {
|
||||
};
|
||||
#endif // __AVX__
|
||||
|
||||
//PPC Implementation
|
||||
#if defined(__MMA__)
|
||||
|
||||
#define SAVE_ACC(ACC, ii, jj) \
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC); \
|
||||
for (int I = 0; I < 4; I++) { \
|
||||
for (int J = 0; J < 4; J++) { \
|
||||
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \
|
||||
} \
|
||||
} \
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_PPC {
|
||||
public:
|
||||
tinyBLAS_PPC(int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
mnpack(0, m, 0, n);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
|
||||
|
||||
void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) {
|
||||
int64_t i, j;
|
||||
float *aoffset = NULL, *boffset = NULL;
|
||||
float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
|
||||
aoffset = const_cast<float*>(a);
|
||||
boffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||
vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2];
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
|
||||
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
|
||||
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
|
||||
C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5);
|
||||
C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6);
|
||||
C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7);
|
||||
C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8);
|
||||
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||
__builtin_vsx_disassemble_pair(c4, &C4);
|
||||
__builtin_vsx_disassemble_pair(c5, &C5);
|
||||
__builtin_vsx_disassemble_pair(c6, &C6);
|
||||
__builtin_vsx_disassemble_pair(c7, &C7);
|
||||
__builtin_vsx_disassemble_pair(c8, &C8);
|
||||
|
||||
t1 = vec_mergeh(c1[0], c2[0]);
|
||||
t2 = vec_mergeh(c3[0], c4[0]);
|
||||
t3 = vec_mergeh(c5[0], c6[0]);
|
||||
t4 = vec_mergeh(c7[0], c8[0]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
t8 = vec_xxpermdi(t3, t4, 3);
|
||||
vec_xst(t5, 0, boffset);
|
||||
vec_xst(t6, 0, boffset+4);
|
||||
vec_xst(t7, 0, boffset+8);
|
||||
vec_xst(t8, 0, boffset+12);
|
||||
|
||||
t1 = vec_mergel(c1[0], c2[0]);
|
||||
t2 = vec_mergel(c3[0], c4[0]);
|
||||
t3 = vec_mergel(c5[0], c6[0]);
|
||||
t4 = vec_mergel(c7[0], c8[0]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
t8 = vec_xxpermdi(t3, t4, 3);
|
||||
vec_xst(t5, 0, boffset+16);
|
||||
vec_xst(t6, 0, boffset+20);
|
||||
vec_xst(t7, 0, boffset+24);
|
||||
vec_xst(t8, 0, boffset+28);
|
||||
|
||||
t1 = vec_mergeh(c1[1], c2[1]);
|
||||
t2 = vec_mergeh(c3[1], c4[1]);
|
||||
t3 = vec_mergeh(c5[1], c6[1]);
|
||||
t4 = vec_mergeh(c7[1], c8[1]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
t8 = vec_xxpermdi(t3, t4, 3);
|
||||
vec_xst(t5, 0, boffset+32);
|
||||
vec_xst(t6, 0, boffset+36);
|
||||
vec_xst(t7, 0, boffset+40);
|
||||
vec_xst(t8, 0, boffset+44);
|
||||
|
||||
t1 = vec_mergel(c1[1], c2[1]);
|
||||
t2 = vec_mergel(c3[1], c4[1]);
|
||||
t3 = vec_mergel(c5[1], c6[1]);
|
||||
t4 = vec_mergel(c7[1], c8[1]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
t8 = vec_xxpermdi(t3, t4, 3);
|
||||
vec_xst(t5, 0, boffset+48);
|
||||
vec_xst(t6, 0, boffset+52);
|
||||
vec_xst(t7, 0, boffset+56);
|
||||
vec_xst(t8, 0, boffset+60);
|
||||
|
||||
aoffset1 += 8*lda;
|
||||
aoffset2 += 8*lda;
|
||||
aoffset3 += 8*lda;
|
||||
aoffset4 += 8*lda;
|
||||
boffset += 64;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4, c5, c6, c7, c8;
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
c4 = vec_xl(0, aoffset4);
|
||||
c5 = vec_xl(0, aoffset5);
|
||||
c6 = vec_xl(0, aoffset6);
|
||||
c7 = vec_xl(0, aoffset7);
|
||||
c8 = vec_xl(0, aoffset8);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t3 = vec_mergeh(c5, c6);
|
||||
t4 = vec_mergeh(c7, c8);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
t8 = vec_xxpermdi(t3, t4, 3);
|
||||
vec_xst(t5, 0, boffset);
|
||||
vec_xst(t6, 0, boffset+4);
|
||||
vec_xst(t7, 0, boffset+8);
|
||||
vec_xst(t8, 0, boffset+12);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t3 = vec_mergel(c5, c6);
|
||||
t4 = vec_mergel(c7, c8);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
t8 = vec_xxpermdi(t3, t4, 3);
|
||||
vec_xst(t5, 0, boffset+16);
|
||||
vec_xst(t6, 0, boffset+20);
|
||||
vec_xst(t7, 0, boffset+24);
|
||||
vec_xst(t8, 0, boffset+28);
|
||||
}
|
||||
j--;
|
||||
} while(j > 0);
|
||||
}
|
||||
|
||||
if (rows & 4) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset += 4 * lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
__vector_pair C1, C2, C3, C4;
|
||||
vector float c1[2], c2[2], c3[2], c4[2];
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
|
||||
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
|
||||
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
|
||||
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||
__builtin_vsx_disassemble_pair(c4, &C4);
|
||||
|
||||
t1 = vec_mergeh(c1[0], c2[0]);
|
||||
t2 = vec_mergeh(c3[0], c4[0]);
|
||||
t3 = vec_mergel(c1[0], c2[0]);
|
||||
t4 = vec_mergel(c3[0], c4[0]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t1, t2, 3);
|
||||
t7 = vec_xxpermdi(t3, t4, 0);
|
||||
t8 = vec_xxpermdi(t3, t4, 3);
|
||||
vec_xst(t5, 0, boffset);
|
||||
vec_xst(t6, 0, boffset+4);
|
||||
vec_xst(t7, 0, boffset+8);
|
||||
vec_xst(t8, 0, boffset+12);
|
||||
|
||||
t1 = vec_mergeh(c1[1], c2[1]);
|
||||
t2 = vec_mergeh(c3[1], c4[1]);
|
||||
t3 = vec_mergel(c1[1], c2[1]);
|
||||
t4 = vec_mergel(c3[1], c4[1]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t1, t2, 3);
|
||||
t7 = vec_xxpermdi(t3, t4, 0);
|
||||
t8 = vec_xxpermdi(t3, t4, 3);
|
||||
vec_xst(t5, 0, boffset+16);
|
||||
vec_xst(t6, 0, boffset+20);
|
||||
vec_xst(t7, 0, boffset+24);
|
||||
vec_xst(t8, 0, boffset+28);
|
||||
|
||||
aoffset1 += 8*lda;
|
||||
aoffset2 += 8*lda;
|
||||
aoffset3 += 8*lda;
|
||||
aoffset4 += 8*lda;
|
||||
boffset += 32;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4;
|
||||
vector float t1, t2, t3, t4;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
c4 = vec_xl(0, aoffset4);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset);
|
||||
vec_xst(t4, 0, boffset+4);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset+8);
|
||||
vec_xst(t4, 0, boffset+12);
|
||||
}
|
||||
}
|
||||
if (rows & 3) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4 = {0};
|
||||
vector float t1, t2, t3, t4;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset);
|
||||
vec_xst(t4, 0, boffset+4);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset+8);
|
||||
vec_xst(t4, 0, boffset+12);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void KERNEL_4x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[4], vec_B[4], vec_C[4];
|
||||
acc_t acc_0;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
for (int l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
}
|
||||
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[4], vec_B[8], vec_C[4];
|
||||
acc_t acc_0, acc_1;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int64_t l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, vec_A[1], (vec_t)vec_B[3]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[2], (vec_t)vec_B[4]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, vec_A[2], (vec_t)vec_B[5]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[3], (vec_t)vec_B[6]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, vec_A[3], (vec_t)vec_B[7]);
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii, jj+4);
|
||||
}
|
||||
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[4], vec_C[4];
|
||||
acc_t acc_0, acc_1;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int64_t l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[3], vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[4], vec_B[2]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[5], vec_B[2]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[6], vec_B[3]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[7], vec_B[3]);
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii+4, jj);
|
||||
}
|
||||
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[16], vec_C[4];
|
||||
acc_t acc_0, acc_1, acc_2, acc_3;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
|
||||
for(int x = 0; x < 16; x+=2) {
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
|
||||
__builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
|
||||
}
|
||||
}
|
||||
SAVE_ACC(&acc_0, ii, jj);
|
||||
SAVE_ACC(&acc_1, ii, jj+4);
|
||||
SAVE_ACC(&acc_2, ii+4, jj);
|
||||
SAVE_ACC(&acc_3, ii+4, jj+4);
|
||||
}
|
||||
|
||||
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t mc, nc, mp, np;
|
||||
int m_rem = MIN(m - m0, 16);
|
||||
int n_rem = MIN(n - n0, 16);
|
||||
if (m_rem >= 16 && n_rem >= 8) {
|
||||
mc = 8;
|
||||
nc = 8;
|
||||
gemm<8,8>(m0, m, n0, n);
|
||||
} else if(m_rem >= 8 && n_rem >= 16) {
|
||||
mc = 8;
|
||||
nc = 8;
|
||||
gemm<8,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 8 && n_rem >= 8) {
|
||||
mc = 8;
|
||||
nc = 8;
|
||||
gemm<8,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 4 && n_rem >= 8) {
|
||||
mc = 4;
|
||||
nc = 8;
|
||||
gemm<4,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 8 && n_rem >= 4) {
|
||||
mc = 8;
|
||||
nc = 4;
|
||||
gemm<8,4>(m0, m, n0, n);
|
||||
} else if (m_rem >= 4 && n_rem >= 4) {
|
||||
mc = 4;
|
||||
nc = 4;
|
||||
gemm<4,4>(m0, m, n0, n);
|
||||
} else if ((m_rem < 4) && (n_rem > 4)) {
|
||||
nc = 4;
|
||||
switch(m_rem) {
|
||||
case 1:
|
||||
mc = 1;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 2:
|
||||
mc = 2;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 3:
|
||||
mc = 3;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else if ((m_rem > 4) && (n_rem < 4)) {
|
||||
mc = 4;
|
||||
switch(n_rem) {
|
||||
case 1:
|
||||
nc = 1;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 2:
|
||||
nc = 2;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 3:
|
||||
nc = 3;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
switch((m_rem << 4) | n_rem) {
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x41:
|
||||
mc = 4;
|
||||
nc = 1;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x34:
|
||||
mc = 3;
|
||||
nc = 4;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x33:
|
||||
mc = 3;
|
||||
nc = 3;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x32:
|
||||
mc = 3;
|
||||
nc = 2;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x31:
|
||||
mc = 3;
|
||||
nc = 1;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x23:
|
||||
mc = 2;
|
||||
nc = 3;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x22:
|
||||
mc = 2;
|
||||
nc = 2;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x21:
|
||||
mc = 2;
|
||||
nc = 1;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x14:
|
||||
mc = 1;
|
||||
nc = 4;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x13:
|
||||
mc = 1;
|
||||
nc = 3;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x12:
|
||||
mc = 1;
|
||||
nc = 2;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
case 0x11:
|
||||
mc = 1;
|
||||
nc = 1;
|
||||
gemm_small(m0, m, n0, n, mc, nc);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
mp = m0 + (m - m0) / mc * mc;
|
||||
np = n0 + (n - n0) / nc * nc;
|
||||
mnpack(mp, m, n0, np);
|
||||
mnpack(m0, m, np, n);
|
||||
}
|
||||
|
||||
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
vec_t vec_C[4];
|
||||
acc_t acc_0;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
vec_t vec_A[4], vec_B[4];
|
||||
for (int l=0; l<k; l+=4) {
|
||||
if (RN >= 4 && RM == 1) {
|
||||
float* a = const_cast<float*>(A+(ii)*lda+l);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
vec_A[0] = (vec_t)vec_xl(0,a);
|
||||
vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
|
||||
vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
|
||||
vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
|
||||
} else {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
|
||||
}
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (RM == 4 && RN == 4) {
|
||||
kernel = &tinyBLAS_PPC::KERNEL_4x4;
|
||||
} else if (RM == 4 && RN == 8) {
|
||||
kernel = &tinyBLAS_PPC::KERNEL_4x8;
|
||||
} else if (RM == 8 && RN == 4) {
|
||||
kernel = &tinyBLAS_PPC::KERNEL_8x4;
|
||||
} else if (RM == 8 && RN == 8) {
|
||||
kernel = &tinyBLAS_PPC::KERNEL_8x8;
|
||||
}
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
(this->*kernel)(ii, jj);
|
||||
}
|
||||
}
|
||||
|
||||
const TA *const A;
|
||||
const TB *const B;
|
||||
TC *C;
|
||||
TA *At;
|
||||
TB *Bt;
|
||||
const int64_t k;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
#endif
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
@@ -1114,6 +1712,16 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__MMA__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
tinyBLAS_PPC<float, float, float> tb{
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.x;
|
||||
if (idx >= p.ne) {
|
||||
@@ -15,10 +17,13 @@ void main() {
|
||||
const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
|
||||
const uint ox = src1_i % p.nb01;
|
||||
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
|
||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
|
||||
} else {
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]));
|
||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,14 +1,29 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
const uint num_threads = 256;
|
||||
|
||||
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||
const uint num_iter = 2;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
if (idx >= p.ne) {
|
||||
continue;
|
||||
}
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
|
||||
|
||||
idx += num_threads;
|
||||
}
|
||||
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user