mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
35 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f6ea7a093c | |||
| 0efec57787 | |||
| 7acfd4e8d5 | |||
| 97bdd26eee | |||
| 4db8f60fe7 | |||
| 8fac431b06 | |||
| f17f39ff9c | |||
| 9104bc20ed | |||
| fc690b018e | |||
| 16bdfa42ac | |||
| 3dfda05956 | |||
| bda62d7999 | |||
| 090fca7a07 | |||
| aaab2419ea | |||
| 73cf442e7b | |||
| e236528e76 | |||
| fa79495bb4 | |||
| 17eb6aa8a9 | |||
| c917b67f06 | |||
| 4e24cffd8c | |||
| 6af51c0d96 | |||
| f53226245f | |||
| c3ebcfa148 | |||
| 8a4441ea1a | |||
| 5aefbce27a | |||
| 71c1121d11 | |||
| 370b1f7e7a | |||
| b549a1bbef | |||
| 368645698a | |||
| b078c619aa | |||
| 808aba3916 | |||
| a977c11544 | |||
| 9a55ffe6fb | |||
| 7a221b672e | |||
| 278d0e1846 |
@@ -18,6 +18,7 @@
|
||||
vulkan-headers,
|
||||
vulkan-loader,
|
||||
curl,
|
||||
shaderc,
|
||||
useBlas ? builtins.all (x: !x) [
|
||||
useCuda
|
||||
useMetalKit
|
||||
@@ -146,6 +147,7 @@ let
|
||||
vulkanBuildInputs = [
|
||||
vulkan-headers
|
||||
vulkan-loader
|
||||
shaderc
|
||||
];
|
||||
in
|
||||
|
||||
|
||||
+1
-1
@@ -8,7 +8,7 @@ arg1="$1"
|
||||
shift
|
||||
|
||||
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
||||
python3 ./convert-hf-to-gguf.py "$@"
|
||||
python3 ./convert_hf_to_gguf.py "$@"
|
||||
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
./llama-quantize "$@"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
|
||||
@@ -355,8 +355,10 @@ jobs:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libvulkan-dev
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
|
||||
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential vulkan-sdk
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
@@ -61,6 +61,11 @@ llama-batched-swift
|
||||
out/
|
||||
tmp/
|
||||
|
||||
# Deprecated
|
||||
|
||||
/main
|
||||
/server
|
||||
|
||||
# CI
|
||||
|
||||
!.github/workflows/*.yml
|
||||
|
||||
+10
-1
@@ -132,7 +132,16 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
|
||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
|
||||
get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
|
||||
|
||||
# At the moment some compile definitions are placed within the ggml/src
|
||||
# directory but not exported on the `ggml` target. This could be improved by
|
||||
# determining _precisely_ which defines are necessary for the llama-config
|
||||
# package.
|
||||
#
|
||||
get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
|
||||
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
|
||||
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
|
||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
@@ -197,6 +197,10 @@ ifdef GGML_RPC
|
||||
BUILD_TARGETS += rpc-server
|
||||
endif
|
||||
|
||||
ifdef GGML_VULKAN
|
||||
BUILD_TARGETS += vulkan-shaders-gen
|
||||
endif
|
||||
|
||||
default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
@@ -547,11 +551,17 @@ ifdef GGML_OPENBLAS64
|
||||
endif # GGML_OPENBLAS64
|
||||
|
||||
ifdef GGML_BLIS
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
|
||||
MK_LDFLAGS += -lblis -L/usr/local/lib
|
||||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
endif # GGML_BLIS
|
||||
|
||||
ifdef GGML_NVPL
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
|
||||
MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
|
||||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
endif # GGML_NVPL
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
||||
OBJ_GGML += ggml/src/llamafile/sgemm.o
|
||||
@@ -704,8 +714,8 @@ endif # GGML_CUDA
|
||||
|
||||
ifdef GGML_VULKAN
|
||||
MK_CPPFLAGS += -DGGML_USE_VULKAN
|
||||
MK_LDFLAGS += -lvulkan
|
||||
OBJ_GGML += ggml/src/ggml-vulkan.o
|
||||
MK_LDFLAGS += $(shell pkg-config --libs vulkan)
|
||||
OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
|
||||
|
||||
ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
|
||||
@@ -727,10 +737,28 @@ ifdef GGML_VULKAN_RUN_TESTS
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
|
||||
endif
|
||||
|
||||
ggml/src/ggml-vulkan.o: \
|
||||
ggml/src/ggml-vulkan.cpp \
|
||||
ggml/include/ggml-vulkan.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
GLSLC_CMD = glslc
|
||||
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
|
||||
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
|
||||
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
|
||||
_ggml_vk_input_dir = ggml/src/vulkan-shaders
|
||||
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
|
||||
|
||||
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
|
||||
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
|
||||
|
||||
$(_ggml_vk_header): $(_ggml_vk_source)
|
||||
|
||||
$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
|
||||
$(_ggml_vk_genshaders_cmd) \
|
||||
--glslc $(GLSLC_CMD) \
|
||||
--input-dir $(_ggml_vk_input_dir) \
|
||||
--target-hpp $(_ggml_vk_header) \
|
||||
--target-cpp $(_ggml_vk_source)
|
||||
|
||||
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||
|
||||
endif # GGML_VULKAN
|
||||
|
||||
ifdef GGML_HIPBLAS
|
||||
@@ -1110,6 +1138,7 @@ clean:
|
||||
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
||||
rm -rvf $(BUILD_TARGETS)
|
||||
rm -rvf $(TEST_TARGETS)
|
||||
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
|
||||
rm -rvf $(LEGACY_TARGETS_CLEAN)
|
||||
find examples pocs -type f -name "*.o" -delete
|
||||
|
||||
|
||||
@@ -8,6 +8,13 @@ set(GGML_CUDA @GGML_CUDA@)
|
||||
set(GGML_METAL @GGML_METAL@)
|
||||
set(GGML_HIPBLAS @GGML_HIPBLAS@)
|
||||
set(GGML_ACCELERATE @GGML_ACCELERATE@)
|
||||
set(GGML_VULKAN @GGML_VULKAN@)
|
||||
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
|
||||
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
|
||||
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
|
||||
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
|
||||
set(GGML_SYCL @GGML_SYCL@)
|
||||
set(GGML_OPENMP @GGML_OPENMP@)
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
@@ -37,18 +44,36 @@ if (GGML_METAL)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN)
|
||||
find_package(Vulkan REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_HIPBLAS)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP REQUIRED)
|
||||
endif()
|
||||
|
||||
|
||||
find_library(ggml_LIBRARY ggml
|
||||
REQUIRED
|
||||
HINTS ${LLAMA_LIB_DIR})
|
||||
|
||||
find_library(llama_LIBRARY llama
|
||||
REQUIRED
|
||||
HINTS ${LLAMA_LIB_DIR})
|
||||
|
||||
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
|
||||
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
|
||||
set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
|
||||
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
|
||||
|
||||
add_library(llama UNKNOWN IMPORTED)
|
||||
|
||||
|
||||
+8
-10
@@ -685,7 +685,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
if (arg == "--lora") {
|
||||
CHECK_ARG
|
||||
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
||||
params.use_mmap = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--lora-scaled") {
|
||||
@@ -693,7 +692,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
const char* lora_adapter = argv[i];
|
||||
CHECK_ARG
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
params.use_mmap = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--lora-base") {
|
||||
@@ -797,6 +795,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.cont_batching = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "-nocb" || arg == "--no-cont-batching") {
|
||||
params.cont_batching = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "-fa" || arg == "--flash-attn") {
|
||||
params.flash_attn = true;
|
||||
return true;
|
||||
@@ -1538,6 +1540,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
|
||||
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
|
||||
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
|
||||
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
|
||||
|
||||
options.push_back({ "multi-modality" });
|
||||
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
|
||||
@@ -2084,19 +2087,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
|
||||
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
|
||||
float lora_scale = std::get<1>(params.lora_adapter[i]);
|
||||
int err = llama_model_apply_lora_from_file(model,
|
||||
lora_adapter.c_str(),
|
||||
lora_scale,
|
||||
((i > 0) || params.lora_base.empty())
|
||||
? NULL
|
||||
: params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
|
||||
if (adapter == nullptr) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
}
|
||||
llama_lora_adapter_set(lctx, adapter, lora_scale);
|
||||
}
|
||||
|
||||
if (params.ignore_eos) {
|
||||
|
||||
+125
-77
@@ -148,9 +148,16 @@ class Model:
|
||||
tensor_names_from_parts.update(model_part.keys())
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
|
||||
if self.lazy:
|
||||
data = LazyTorchTensor.from_eager(data)
|
||||
if self.is_safetensors:
|
||||
if self.lazy:
|
||||
data = model_part.get_slice(name)
|
||||
data = LazyTorchTensor.from_safetensors_slice(data)
|
||||
else:
|
||||
data = model_part.get_tensor(name)
|
||||
else:
|
||||
data = model_part[name]
|
||||
if self.lazy:
|
||||
data = LazyTorchTensor.from_eager(data)
|
||||
yield name, data
|
||||
|
||||
# only verify tensor name presence; it doesn't matter if they are not in the right files
|
||||
@@ -373,6 +380,29 @@ class Model:
|
||||
except KeyError:
|
||||
raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
|
||||
|
||||
def does_token_look_special(self, token: str | bytes) -> bool:
|
||||
if isinstance(token, (bytes, bytearray)):
|
||||
token_text = token.decode(encoding="utf-8")
|
||||
elif isinstance(token, memoryview):
|
||||
token_text = token.tobytes().decode(encoding="utf-8")
|
||||
else:
|
||||
token_text = token
|
||||
|
||||
# Some models mark some added tokens which ought to be control tokens as not special.
|
||||
# (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
|
||||
seems_special = token_text in (
|
||||
"<pad>", # deepseek-coder
|
||||
"<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
|
||||
)
|
||||
|
||||
seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
|
||||
seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) # deepseek-coder
|
||||
|
||||
# TODO: should these be marked as UNUSED instead? (maybe not)
|
||||
seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
|
||||
|
||||
return seems_special
|
||||
|
||||
# used for GPT-2 BPE and WordPiece vocabs
|
||||
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
|
||||
tokens: list[str] = []
|
||||
@@ -391,16 +421,18 @@ class Model:
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
token: str = reverse_vocab[i]
|
||||
if token in added_vocab:
|
||||
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
tokens.append(token)
|
||||
|
||||
return tokens, toktypes, tokpre
|
||||
|
||||
@@ -559,7 +591,7 @@ class Model:
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
@@ -609,7 +641,7 @@ class Model:
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
||||
|
||||
for token_id in range(tokenizer.vocab_size()):
|
||||
piece = tokenizer.IdToPiece(token_id)
|
||||
@@ -644,6 +676,25 @@ class Model:
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
|
||||
for token_id, token_data in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
token: str = token_data["content"]
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
||||
assert tokens[token_id] == token.encode("utf-8")
|
||||
if token_data.get("special") or self.does_token_look_special(token):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
else:
|
||||
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
|
||||
scores[token_id] = -1000.0
|
||||
tokens[token_id] = token.encode("utf-8")
|
||||
|
||||
if vocab_size > len(tokens):
|
||||
pad_count = vocab_size - len(tokens)
|
||||
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
||||
@@ -1203,11 +1254,10 @@ class RefactModel(Model):
|
||||
|
||||
# TODO: how to determine special FIM tokens automatically?
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
|
||||
special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
|
||||
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
|
||||
special_vocab._set_special_token("prefix", 1)
|
||||
special_vocab._set_special_token("suffix", 3)
|
||||
special_vocab._set_special_token("middle", 2)
|
||||
special_vocab._set_special_token("fsep", 4) # is this correct?
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
@@ -1267,7 +1317,7 @@ class StableLMModel(Model):
|
||||
if (self.dir_model / "tokenizer.json").is_file():
|
||||
self._set_vocab_gpt2()
|
||||
else:
|
||||
# StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
|
||||
# StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
|
||||
self._set_vocab_qwen()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
@@ -1579,7 +1629,6 @@ class DbrxModel(Model):
|
||||
self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
|
||||
|
||||
self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
|
||||
self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
|
||||
@@ -1873,7 +1922,7 @@ class Phi3MiniModel(Model):
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
||||
|
||||
for token_id in range(tokenizer.vocab_size()):
|
||||
|
||||
@@ -1918,7 +1967,7 @@ class Phi3MiniModel(Model):
|
||||
for token_id, foken_data in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
token = foken_data["content"].encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
||||
assert tokens[token_id] == token
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
@@ -1934,7 +1983,7 @@ class Phi3MiniModel(Model):
|
||||
for foken_data in added_tokens:
|
||||
token_id = int(foken_data["id"])
|
||||
token = foken_data["content"].encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
||||
assert tokens[token_id] == token
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
@@ -2146,7 +2195,7 @@ class InternLM2Model(Model):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
# take care of ununsed raw token
|
||||
if piece.startswith('[UNUSED'):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
@@ -2176,7 +2225,7 @@ class InternLM2Model(Model):
|
||||
if token == chat_eos_token:
|
||||
chat_eos_token_id = token_id
|
||||
token = token.encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
||||
assert(tokens[token_id] == token)
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
@@ -2195,7 +2244,7 @@ class InternLM2Model(Model):
|
||||
if token == chat_eos_token:
|
||||
chat_eos_token_id = token_id
|
||||
token = token.encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
|
||||
assert(tokens[token_id] == token)
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
@@ -2222,13 +2271,6 @@ class InternLM2Model(Model):
|
||||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("InternLM2")
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
@@ -2248,26 +2290,22 @@ class InternLM2Model(Model):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
num_kv_heads = self.hparams["num_key_value_heads"]
|
||||
hidden_size = self.hparams["hidden_size"]
|
||||
n_embd = self.hparams["hidden_size"]
|
||||
q_per_kv = num_heads // num_kv_heads
|
||||
head_dim = hidden_size // num_heads
|
||||
head_dim = n_embd // num_heads
|
||||
num_groups = num_heads // q_per_kv
|
||||
|
||||
qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
|
||||
|
||||
if re.match(qkv_pattern, name):
|
||||
bid = re.findall(qkv_pattern, name)[0]
|
||||
if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
|
||||
qkv = data_torch
|
||||
# qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
|
||||
qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
|
||||
q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
|
||||
|
||||
qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
|
||||
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
|
||||
|
||||
# The model weights of q and k equire additional reshape.
|
||||
# q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
|
||||
q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
|
||||
# k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
|
||||
k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
|
||||
# v = rearrange(v, " o g n i -> o (g n i)").T
|
||||
v = v.reshape((v.shape[0], -1)).T
|
||||
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
||||
k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
||||
v = v.reshape((-1, v.shape[-1]))
|
||||
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
|
||||
@@ -2435,19 +2473,7 @@ class Gemma2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA2
|
||||
|
||||
def set_vocab(self):
|
||||
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
||||
# hack: This is required so that we can properly use start/end-of-turn for chat template
|
||||
for i in range(108):
|
||||
# including <unusedX>, <start_of_turn>, <end_of_turn>
|
||||
toktypes[i] = SentencePieceTokenTypes.CONTROL
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
|
||||
@@ -2474,11 +2500,6 @@ class Gemma2Model(Model):
|
||||
)
|
||||
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
||||
|
||||
# sanity check
|
||||
attn_scalar = self.hparams["query_pre_attn_scalar"]
|
||||
if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
|
||||
raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
@@ -2771,7 +2792,7 @@ class ArcticModel(Model):
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
||||
|
||||
for token_id in range(tokenizer.vocab_size()):
|
||||
|
||||
@@ -3026,7 +3047,7 @@ class T5Model(Model):
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
|
||||
|
||||
for token_id in range(tokenizer.vocab_size()):
|
||||
piece = tokenizer.IdToPiece(token_id)
|
||||
@@ -3244,15 +3265,14 @@ class ChatGLMModel(Model):
|
||||
if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
|
||||
score = tokenizer.tokenizer.sp_model.get_score(token_id)
|
||||
|
||||
if len(piece) == 0:
|
||||
text = f"[PAD{token_id}]".encode("utf-8")
|
||||
|
||||
if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
|
||||
if piece in special_tokens:
|
||||
# show special tokens in prompt
|
||||
toktype = SentencePieceTokenTypes.USER_DEFINED
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif len(piece) == 0:
|
||||
text = f"[PAD{token_id}]".encode("utf-8")
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
else:
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
toktype = SentencePieceTokenTypes.USER_DEFINED
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
@@ -3341,7 +3361,7 @@ class ChatGLMModel(Model):
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
@@ -3411,19 +3431,46 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
torch.float32: np.float32,
|
||||
}
|
||||
|
||||
# used for safetensors slices
|
||||
# ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
|
||||
# TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
|
||||
_dtype_str_map: dict[str, torch.dtype] = {
|
||||
"F64": torch.float64,
|
||||
"F32": torch.float32,
|
||||
"BF16": torch.bfloat16,
|
||||
"F16": torch.float16,
|
||||
# "U64": torch.uint64,
|
||||
"I64": torch.int64,
|
||||
# "U32": torch.uint32,
|
||||
"I32": torch.int32,
|
||||
# "U16": torch.uint16,
|
||||
"I16": torch.int16,
|
||||
"U8": torch.uint8,
|
||||
"I8": torch.int8,
|
||||
"BOOL": torch.bool,
|
||||
"F8_E4M3": torch.float8_e4m3fn,
|
||||
"F8_E5M2": torch.float8_e5m2,
|
||||
}
|
||||
|
||||
def numpy(self) -> gguf.LazyNumpyTensor:
|
||||
dtype = self._dtype_map[self.dtype]
|
||||
return gguf.LazyNumpyTensor(
|
||||
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
||||
lazy=self._lazy,
|
||||
args=(self,),
|
||||
func=(lambda s: s[0].numpy())
|
||||
func=(lambda s: s.numpy())
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
|
||||
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
|
||||
return torch.empty(size=shape, dtype=dtype, device="meta")
|
||||
|
||||
@classmethod
|
||||
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
|
||||
dtype = cls._dtype_str_map[st_slice.get_dtype()]
|
||||
shape: tuple[int, ...] = tuple(st_slice.get_shape())
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
@@ -3434,7 +3481,7 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
if func is torch.Tensor.numpy:
|
||||
return args[0].numpy()
|
||||
|
||||
return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
|
||||
return cls._wrap_fn(func)(*args, **kwargs)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
@@ -3561,6 +3608,7 @@ def main() -> None:
|
||||
small_first_shard=args.no_tensor_first_split)
|
||||
|
||||
logger.info("Set model parameters")
|
||||
model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL)
|
||||
model_instance.set_gguf_parameters()
|
||||
|
||||
logger.info("Set model tokenizer")
|
||||
|
||||
Executable
+374
@@ -0,0 +1,374 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from math import prod
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
||||
# reuse model definitions from convert_hf_to_gguf.py
|
||||
from convert_hf_to_gguf import LazyTorchTensor, Model
|
||||
|
||||
logger = logging.getLogger("lora-to-gguf")
|
||||
|
||||
|
||||
@dataclass
|
||||
class PartialLoraTensor:
|
||||
A: Tensor | None = None
|
||||
B: Tensor | None = None
|
||||
|
||||
|
||||
# magic to support tensor shape modifications and splitting
|
||||
class LoraTorchTensor:
|
||||
_lora_A: Tensor # (n_rank, row_size)
|
||||
_lora_B: Tensor # (col_size, n_rank)
|
||||
_rank: int
|
||||
|
||||
def __init__(self, A: Tensor, B: Tensor):
|
||||
assert len(A.shape) == len(B.shape)
|
||||
assert A.shape[-2] == B.shape[-1]
|
||||
if A.dtype != B.dtype:
|
||||
A = A.to(torch.float32)
|
||||
B = B.to(torch.float32)
|
||||
self._lora_A = A
|
||||
self._lora_B = B
|
||||
self._rank = B.shape[-1]
|
||||
|
||||
def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
|
||||
return (self._lora_A, self._lora_B)
|
||||
|
||||
def __getitem__(
|
||||
self,
|
||||
indices: (
|
||||
SupportsIndex
|
||||
| slice
|
||||
| tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature
|
||||
),
|
||||
) -> LoraTorchTensor:
|
||||
shape = self.shape
|
||||
if isinstance(indices, SupportsIndex):
|
||||
if len(shape) > 2:
|
||||
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
|
||||
else:
|
||||
raise NotImplementedError # can't return a vector
|
||||
elif isinstance(indices, slice):
|
||||
if len(shape) > 2:
|
||||
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
|
||||
else:
|
||||
return LoraTorchTensor(self._lora_A, self._lora_B[indices])
|
||||
elif isinstance(indices, tuple):
|
||||
assert len(indices) > 0
|
||||
if indices[-1] is Ellipsis:
|
||||
return self[indices[:-1]]
|
||||
# expand ellipsis
|
||||
indices = tuple(
|
||||
u
|
||||
for v in (
|
||||
(
|
||||
(slice(None, None) for _ in range(len(indices) - 1))
|
||||
if i is Ellipsis
|
||||
else (i,)
|
||||
)
|
||||
for i in indices
|
||||
)
|
||||
for u in v
|
||||
)
|
||||
|
||||
if len(indices) < len(shape):
|
||||
indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
|
||||
|
||||
# TODO: make sure this is correct
|
||||
indices_A = (
|
||||
*(
|
||||
(
|
||||
j.__index__() % self._lora_A.shape[i]
|
||||
if isinstance(j, SupportsIndex)
|
||||
else slice(None, None)
|
||||
)
|
||||
for i, j in enumerate(indices[:-2])
|
||||
),
|
||||
slice(None, None),
|
||||
indices[-1],
|
||||
)
|
||||
indices_B = indices[:-1]
|
||||
return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
|
||||
else:
|
||||
raise NotImplementedError # unknown indice type
|
||||
|
||||
@property
|
||||
def dtype(self) -> torch.dtype:
|
||||
assert self._lora_A.dtype == self._lora_B.dtype
|
||||
return self._lora_A.dtype
|
||||
|
||||
@property
|
||||
def shape(self) -> tuple[int, ...]:
|
||||
assert len(self._lora_A.shape) == len(self._lora_B.shape)
|
||||
return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
|
||||
|
||||
def size(self, dim=None):
|
||||
assert dim is None
|
||||
return self.shape
|
||||
|
||||
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
|
||||
if isinstance(shape[0], tuple):
|
||||
new_shape: tuple[int, ...] = shape[0]
|
||||
else:
|
||||
new_shape = cast(tuple[int, ...], shape)
|
||||
orig_shape = self.shape
|
||||
if len(new_shape) < 2:
|
||||
raise NotImplementedError # can't become a vector
|
||||
|
||||
# expand -1 in the shape
|
||||
if any(dim == -1 for dim in new_shape):
|
||||
n_elems = prod(orig_shape)
|
||||
n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
|
||||
assert n_elems % n_new_elems == 0
|
||||
new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
|
||||
|
||||
if new_shape[-1] != orig_shape[-1]:
|
||||
raise NotImplementedError # can't reshape the row size trivially
|
||||
|
||||
shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
|
||||
shape_B = (*new_shape[:-1], self._rank)
|
||||
return LoraTorchTensor(
|
||||
self._lora_A.reshape(shape_A),
|
||||
self._lora_B.reshape(shape_B),
|
||||
)
|
||||
|
||||
def reshape_as(self, other: Tensor) -> LoraTorchTensor:
|
||||
return self.reshape(*other.shape)
|
||||
|
||||
def view(self, *size: int) -> LoraTorchTensor:
|
||||
return self.reshape(*size)
|
||||
|
||||
def permute(self, *dims: int) -> LoraTorchTensor:
|
||||
shape = self.shape
|
||||
dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
|
||||
if dims[-1] == -1:
|
||||
# TODO: support higher dimensional A shapes bigger than 1
|
||||
assert all(dim == 1 for dim in self._lora_A.shape[:-2])
|
||||
return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
|
||||
if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
|
||||
return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
|
||||
else:
|
||||
# TODO: compose the above two
|
||||
raise NotImplementedError
|
||||
|
||||
def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
|
||||
shape = self.shape
|
||||
dims = [i for i in range(len(shape))]
|
||||
dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
|
||||
return self.permute(*dims)
|
||||
|
||||
def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
|
||||
return self.transpose(axis0, axis1)
|
||||
|
||||
def to(self, *args, **kwargs):
|
||||
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
|
||||
if func is torch.permute:
|
||||
return type(args[0]).permute(*args, **kwargs)
|
||||
elif func is torch.reshape:
|
||||
return type(args[0]).reshape(*args, **kwargs)
|
||||
elif func is torch.stack:
|
||||
assert isinstance(args[0], Sequence)
|
||||
dim = kwargs.get("dim", 0)
|
||||
assert dim == 0
|
||||
return LoraTorchTensor(
|
||||
torch.stack([a._lora_A for a in args[0]], dim),
|
||||
torch.stack([b._lora_B for b in args[0]], dim),
|
||||
)
|
||||
elif func is torch.cat:
|
||||
assert isinstance(args[0], Sequence)
|
||||
dim = kwargs.get("dim", 0)
|
||||
assert dim == 0
|
||||
if len(args[0][0].shape) > 2:
|
||||
return LoraTorchTensor(
|
||||
torch.cat([a._lora_A for a in args[0]], dim),
|
||||
torch.cat([b._lora_B for b in args[0]], dim),
|
||||
)
|
||||
elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
|
||||
return LoraTorchTensor(
|
||||
args[0][0]._lora_A,
|
||||
torch.cat([b._lora_B for b in args[0]], dim),
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def get_base_tensor_name(lora_tensor_name: str) -> str:
|
||||
base_name = lora_tensor_name.replace("base_model.model.", "")
|
||||
base_name = base_name.replace(".lora_A.weight", ".weight")
|
||||
base_name = base_name.replace(".lora_B.weight", ".weight")
|
||||
return base_name
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
help="model is executed on big endian machine",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-lazy", action="store_true",
|
||||
help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose", action="store_true",
|
||||
help="increase output verbosity",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base", type=Path, required=True,
|
||||
help="directory containing base model file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"lora_path", type=Path,
|
||||
help="directory containing LoRA adapter file",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
}
|
||||
|
||||
ftype = ftype_map[args.outtype]
|
||||
|
||||
dir_base_model: Path = args.base
|
||||
dir_lora: Path = args.lora_path
|
||||
lora_config = dir_lora / "adapter_config.json"
|
||||
input_model = dir_lora / "adapter_model.safetensors"
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
|
||||
|
||||
if os.path.exists(input_model):
|
||||
# lazy import load_file only if lora is in safetensors format.
|
||||
from safetensors.torch import load_file
|
||||
|
||||
lora_model = load_file(input_model, device="cpu")
|
||||
else:
|
||||
input_model = os.path.join(dir_lora, "adapter_model.bin")
|
||||
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
|
||||
|
||||
# load base model
|
||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||
hparams = Model.load_hparams(dir_base_model)
|
||||
with torch.inference_mode():
|
||||
try:
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
except NotImplementedError:
|
||||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
class LoraModel(model_class):
|
||||
model_arch = model_class.model_arch
|
||||
|
||||
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||
tensor_map: dict[str, PartialLoraTensor] = {}
|
||||
|
||||
for name, tensor in lora_model.items():
|
||||
if self.lazy:
|
||||
tensor = LazyTorchTensor.from_eager(tensor)
|
||||
base_name = get_base_tensor_name(name)
|
||||
is_lora_a = ".lora_A.weight" in name
|
||||
is_lora_b = ".lora_B.weight" in name
|
||||
if not is_lora_a and not is_lora_b:
|
||||
if ".base_layer.weight" in name:
|
||||
continue
|
||||
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
||||
sys.exit(1)
|
||||
|
||||
if base_name in tensor_map:
|
||||
if is_lora_a:
|
||||
tensor_map[base_name].A = tensor
|
||||
else:
|
||||
tensor_map[base_name].B = tensor
|
||||
else:
|
||||
if is_lora_a:
|
||||
tensor_map[base_name] = PartialLoraTensor(A=tensor)
|
||||
else:
|
||||
tensor_map[base_name] = PartialLoraTensor(B=tensor)
|
||||
|
||||
for name, tensor in tensor_map.items():
|
||||
assert tensor.A is not None
|
||||
assert tensor.B is not None
|
||||
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
dest = super().modify_tensors(data_torch, name, bid)
|
||||
for dest_name, dest_data in dest:
|
||||
assert isinstance(dest_data, LoraTorchTensor)
|
||||
lora_a, lora_b = dest_data.get_lora_A_B()
|
||||
|
||||
yield (dest_name + ".lora_a", lora_a)
|
||||
yield (dest_name + ".lora_b", lora_b)
|
||||
|
||||
model_instance = LoraModel(
|
||||
dir_base_model,
|
||||
ftype,
|
||||
fname_out,
|
||||
is_big_endian=args.bigendian,
|
||||
use_temp_file=False,
|
||||
eager=args.no_lazy,
|
||||
model_name=None,
|
||||
)
|
||||
|
||||
with open(lora_config, "r") as f:
|
||||
lparams: dict[str, Any] = json.load(f)
|
||||
|
||||
alpha = lparams["lora_alpha"]
|
||||
|
||||
model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER)
|
||||
model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
|
||||
model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
|
||||
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
||||
logger.info("Exporting model...")
|
||||
model_instance.write()
|
||||
logger.info(f"Model successfully exported to {model_instance.fname_out}")
|
||||
@@ -242,6 +242,45 @@ The following compilation options are also available to tweak performance (yes,
|
||||
|
||||
### Vulkan
|
||||
|
||||
**Windows**
|
||||
|
||||
#### w64devkit
|
||||
|
||||
Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
||||
|
||||
Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
|
||||
|
||||
Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
|
||||
```sh
|
||||
SDK_VERSION=1.3.283.0
|
||||
cp /VulkanSDK/$SDK_VERSION/Bin/glslc.exe $W64DEVKIT_HOME/bin/
|
||||
cp /VulkanSDK/$SDK_VERSION/Lib/vulkan-1.lib $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/
|
||||
cp -r /VulkanSDK/$SDK_VERSION/Include/* $W64DEVKIT_HOME/x86_64-w64-mingw32/include/
|
||||
cat > $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/pkgconfig/vulkan.pc <<EOF
|
||||
Name: Vulkan-Loader
|
||||
Description: Vulkan Loader
|
||||
Version: $SDK_VERSION
|
||||
Libs: -lvulkan-1
|
||||
EOF
|
||||
|
||||
```
|
||||
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
|
||||
|
||||
#### MSYS2
|
||||
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
|
||||
```sh
|
||||
pacman -S git \
|
||||
mingw-w64-ucrt-x86_64-gcc \
|
||||
mingw-w64-ucrt-x86_64-cmake \
|
||||
mingw-w64-ucrt-x86_64-vulkan-devel \
|
||||
mingw-w64-ucrt-x86_64-shaderc
|
||||
```
|
||||
Switch into `llama.cpp` directory and build using CMake.
|
||||
```sh
|
||||
cmake -B build -DGGML_VULKAN=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
**With docker**:
|
||||
|
||||
You don't need to install Vulkan SDK. It will be installed inside the container.
|
||||
|
||||
@@ -9,15 +9,15 @@ Adding a model requires few steps:
|
||||
After following these steps, you can open PR.
|
||||
|
||||
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
|
||||
- [main](../examples/main)
|
||||
- [imatrix](../examples/imatrix)
|
||||
- [quantize](../examples/quantize)
|
||||
- [server](../examples/server)
|
||||
- [main](/examples/main/)
|
||||
- [imatrix](/examples/imatrix/)
|
||||
- [quantize](/examples/quantize/)
|
||||
- [server](/examples/server/)
|
||||
|
||||
### 1. Convert the model to GGUF
|
||||
|
||||
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
||||
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
|
||||
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
|
||||
|
||||
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
|
||||
|
||||
@@ -31,7 +31,7 @@ class MyModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.GROK
|
||||
```
|
||||
|
||||
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
|
||||
2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
|
||||
|
||||
Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
|
||||
|
||||
@@ -54,7 +54,7 @@ Example for `falcon` model:
|
||||
|
||||
As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
|
||||
|
||||
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
|
||||
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
|
||||
|
||||
If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
|
||||
|
||||
@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
|
||||
|
||||
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
|
||||
|
||||
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
|
||||
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
|
||||
|
||||
## GGUF specification
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Token generation performance troubleshooting
|
||||
|
||||
## Verifying that the model is running on the GPU with CUDA
|
||||
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
```shell
|
||||
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||
```
|
||||
|
||||
@@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
|
||||
char src1_str[128] = {0};
|
||||
if (src1) {
|
||||
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
}
|
||||
|
||||
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||
|
||||
@@ -347,7 +347,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
||||
char hex_result[17];
|
||||
for (int offset = 0; offset < 8; offset++) {
|
||||
unsigned int shift_bits_by = (8 * (8 - offset - 1));
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
@@ -384,7 +384,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
||||
|
||||
char hex_result[41] = {0};
|
||||
for (int offset = 0; offset < 20; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
@@ -421,7 +421,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
||||
|
||||
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
|
||||
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
@@ -460,7 +460,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
||||
char hex_result[17];
|
||||
for (int offset = 0; offset < 8; offset++) {
|
||||
unsigned int shift_bits_by = (8 * (8 - offset - 1));
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
@@ -490,7 +490,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
||||
|
||||
char hex_result[41];
|
||||
for (int offset = 0; offset < 20; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
@@ -520,7 +520,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
||||
|
||||
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
|
||||
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
if (hash_params.manifest_is_usable) {
|
||||
@@ -552,7 +552,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
|
||||
generate_uuidv5(result, uuid);
|
||||
|
||||
char string_buffer[37] = {0};
|
||||
sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
uuid[0], uuid[1], uuid[2], uuid[3],
|
||||
uuid[4], uuid[5], uuid[6], uuid[7],
|
||||
uuid[8], uuid[9], uuid[10], uuid[11],
|
||||
|
||||
@@ -289,8 +289,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
if (add_bos) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
} else {
|
||||
LOG_TEE("error: input is empty\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
|
||||
@@ -6,7 +6,7 @@ import re
|
||||
from copy import copy
|
||||
from enum import Enum
|
||||
from inspect import getdoc, isclass
|
||||
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin
|
||||
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
|
||||
|
||||
from docstring_parser import parse
|
||||
from pydantic import BaseModel, create_model
|
||||
@@ -53,35 +53,38 @@ class PydanticDataType(Enum):
|
||||
|
||||
|
||||
def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
|
||||
if isclass(pydantic_type) and issubclass(pydantic_type, str):
|
||||
origin_type = get_origin(pydantic_type)
|
||||
origin_type = pydantic_type if origin_type is None else origin_type
|
||||
|
||||
if isclass(origin_type) and issubclass(origin_type, str):
|
||||
return PydanticDataType.STRING.value
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
|
||||
elif isclass(origin_type) and issubclass(origin_type, bool):
|
||||
return PydanticDataType.BOOLEAN.value
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, int):
|
||||
elif isclass(origin_type) and issubclass(origin_type, int):
|
||||
return PydanticDataType.INTEGER.value
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, float):
|
||||
elif isclass(origin_type) and issubclass(origin_type, float):
|
||||
return PydanticDataType.FLOAT.value
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, Enum):
|
||||
elif isclass(origin_type) and issubclass(origin_type, Enum):
|
||||
return PydanticDataType.ENUM.value
|
||||
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
|
||||
return format_model_and_field_name(pydantic_type.__name__)
|
||||
elif get_origin(pydantic_type) is list:
|
||||
elif isclass(origin_type) and issubclass(origin_type, BaseModel):
|
||||
return format_model_and_field_name(origin_type.__name__)
|
||||
elif origin_type is list:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"{map_pydantic_type_to_gbnf(element_type)}-list"
|
||||
elif get_origin(pydantic_type) is set:
|
||||
elif origin_type is set:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"{map_pydantic_type_to_gbnf(element_type)}-set"
|
||||
elif get_origin(pydantic_type) is Union:
|
||||
elif origin_type is Union:
|
||||
union_types = get_args(pydantic_type)
|
||||
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
|
||||
return f"union-{'-or-'.join(union_rules)}"
|
||||
elif get_origin(pydantic_type) is Optional:
|
||||
elif origin_type is Optional:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
|
||||
elif isclass(pydantic_type):
|
||||
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
|
||||
elif get_origin(pydantic_type) is dict:
|
||||
elif isclass(origin_type):
|
||||
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(origin_type.__name__)}"
|
||||
elif origin_type is dict:
|
||||
key_type, value_type = get_args(pydantic_type)
|
||||
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
|
||||
else:
|
||||
@@ -118,7 +121,7 @@ def get_members_structure(cls, rule_name):
|
||||
# Modify this comprehension
|
||||
members = [
|
||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
|
||||
for name, param_type in cls.__annotations__.items()
|
||||
for name, param_type in get_type_hints(cls).items()
|
||||
if name != "self"
|
||||
]
|
||||
|
||||
@@ -297,17 +300,20 @@ def generate_gbnf_rule_for_type(
|
||||
field_name = format_model_and_field_name(field_name)
|
||||
gbnf_type = map_pydantic_type_to_gbnf(field_type)
|
||||
|
||||
if isclass(field_type) and issubclass(field_type, BaseModel):
|
||||
origin_type = get_origin(field_type)
|
||||
origin_type = field_type if origin_type is None else origin_type
|
||||
|
||||
if isclass(origin_type) and issubclass(origin_type, BaseModel):
|
||||
nested_model_name = format_model_and_field_name(field_type.__name__)
|
||||
nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules)
|
||||
rules.extend(nested_model_rules)
|
||||
gbnf_type, rules = nested_model_name, rules
|
||||
elif isclass(field_type) and issubclass(field_type, Enum):
|
||||
elif isclass(origin_type) and issubclass(origin_type, Enum):
|
||||
enum_values = [f'"\\"{e.value}\\""' for e in field_type] # Adding escaped quotes
|
||||
enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
|
||||
rules.append(enum_rule)
|
||||
gbnf_type, rules = model_name + "-" + field_name, rules
|
||||
elif get_origin(field_type) == list: # Array
|
||||
elif origin_type is list: # Array
|
||||
element_type = get_args(field_type)[0]
|
||||
element_rule_name, additional_rules = generate_gbnf_rule_for_type(
|
||||
model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
|
||||
@@ -317,7 +323,7 @@ def generate_gbnf_rule_for_type(
|
||||
rules.append(array_rule)
|
||||
gbnf_type, rules = model_name + "-" + field_name, rules
|
||||
|
||||
elif get_origin(field_type) == set or field_type == set: # Array
|
||||
elif origin_type is set: # Array
|
||||
element_type = get_args(field_type)[0]
|
||||
element_rule_name, additional_rules = generate_gbnf_rule_for_type(
|
||||
model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
|
||||
@@ -371,7 +377,7 @@ def generate_gbnf_rule_for_type(
|
||||
gbnf_type = f"{model_name}-{field_name}-optional"
|
||||
else:
|
||||
gbnf_type = f"{model_name}-{field_name}-union"
|
||||
elif isclass(field_type) and issubclass(field_type, str):
|
||||
elif isclass(origin_type) and issubclass(origin_type, str):
|
||||
if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None:
|
||||
triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False)
|
||||
markdown_string = field_info.json_schema_extra.get("markdown_code_block", False)
|
||||
@@ -387,8 +393,8 @@ def generate_gbnf_rule_for_type(
|
||||
gbnf_type = PydanticDataType.STRING.value
|
||||
|
||||
elif (
|
||||
isclass(field_type)
|
||||
and issubclass(field_type, float)
|
||||
isclass(origin_type)
|
||||
and issubclass(origin_type, float)
|
||||
and field_info
|
||||
and hasattr(field_info, "json_schema_extra")
|
||||
and field_info.json_schema_extra is not None
|
||||
@@ -413,8 +419,8 @@ def generate_gbnf_rule_for_type(
|
||||
)
|
||||
|
||||
elif (
|
||||
isclass(field_type)
|
||||
and issubclass(field_type, int)
|
||||
isclass(origin_type)
|
||||
and issubclass(origin_type, int)
|
||||
and field_info
|
||||
and hasattr(field_info, "json_schema_extra")
|
||||
and field_info.json_schema_extra is not None
|
||||
@@ -462,7 +468,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
|
||||
if not issubclass(model, BaseModel):
|
||||
# For non-Pydantic classes, generate model_fields from __annotations__ or __init__
|
||||
if hasattr(model, "__annotations__") and model.__annotations__:
|
||||
model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} # pyright: ignore[reportGeneralTypeIssues]
|
||||
model_fields = {name: (typ, ...) for name, typ in get_type_hints(model).items()}
|
||||
else:
|
||||
init_signature = inspect.signature(model.__init__)
|
||||
parameters = init_signature.parameters
|
||||
@@ -470,7 +476,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
|
||||
name != "self"}
|
||||
else:
|
||||
# For Pydantic models, use model_fields and check for ellipsis (required fields)
|
||||
model_fields = model.__annotations__
|
||||
model_fields = get_type_hints(model)
|
||||
|
||||
model_rule_parts = []
|
||||
nested_rules = []
|
||||
@@ -706,7 +712,7 @@ def generate_markdown_documentation(
|
||||
else:
|
||||
documentation += f" Fields:\n" # noqa: F541
|
||||
if isclass(model) and issubclass(model, BaseModel):
|
||||
for name, field_type in model.__annotations__.items():
|
||||
for name, field_type in get_type_hints(model).items():
|
||||
# if name == "markdown_code_block":
|
||||
# continue
|
||||
if get_origin(field_type) == list:
|
||||
@@ -754,14 +760,17 @@ def generate_field_markdown(
|
||||
field_info = model.model_fields.get(field_name)
|
||||
field_description = field_info.description if field_info and field_info.description else ""
|
||||
|
||||
if get_origin(field_type) == list:
|
||||
origin_type = get_origin(field_type)
|
||||
origin_type = field_type if origin_type is None else origin_type
|
||||
|
||||
if origin_type == list:
|
||||
element_type = get_args(field_type)[0]
|
||||
field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
|
||||
if field_description != "":
|
||||
field_text += ":\n"
|
||||
else:
|
||||
field_text += "\n"
|
||||
elif get_origin(field_type) == Union:
|
||||
elif origin_type == Union:
|
||||
element_types = get_args(field_type)
|
||||
types = []
|
||||
for element_type in element_types:
|
||||
@@ -792,9 +801,9 @@ def generate_field_markdown(
|
||||
example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
|
||||
field_text += f"{indent} Example: {example_text}\n"
|
||||
|
||||
if isclass(field_type) and issubclass(field_type, BaseModel):
|
||||
if isclass(origin_type) and issubclass(origin_type, BaseModel):
|
||||
field_text += f"{indent} Details:\n"
|
||||
for name, type_ in field_type.__annotations__.items():
|
||||
for name, type_ in get_type_hints(field_type).items():
|
||||
field_text += generate_field_markdown(name, type_, field_type, depth + 2)
|
||||
|
||||
return field_text
|
||||
@@ -855,7 +864,7 @@ def generate_text_documentation(
|
||||
|
||||
if isclass(model) and issubclass(model, BaseModel):
|
||||
documentation_fields = ""
|
||||
for name, field_type in model.__annotations__.items():
|
||||
for name, field_type in get_type_hints(model).items():
|
||||
# if name == "markdown_code_block":
|
||||
# continue
|
||||
if get_origin(field_type) == list:
|
||||
@@ -948,7 +957,7 @@ def generate_field_text(
|
||||
|
||||
if isclass(field_type) and issubclass(field_type, BaseModel):
|
||||
field_text += f"{indent} Details:\n"
|
||||
for name, type_ in field_type.__annotations__.items():
|
||||
for name, type_ in get_type_hints(field_type).items():
|
||||
field_text += generate_field_text(name, type_, field_type, depth + 2)
|
||||
|
||||
return field_text
|
||||
|
||||
@@ -20,6 +20,8 @@ def create_completion(prompt, grammar):
|
||||
response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
|
||||
data = response.json()
|
||||
|
||||
assert data.get("error") is None, data
|
||||
|
||||
print(data["content"])
|
||||
return data["content"]
|
||||
|
||||
|
||||
@@ -154,7 +154,7 @@ static void test_roundtrip_on_chunk(
|
||||
}
|
||||
|
||||
if (use_reference) {
|
||||
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
|
||||
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
|
||||
} else {
|
||||
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
|
||||
}
|
||||
|
||||
@@ -16,44 +16,44 @@ struct quant_option {
|
||||
};
|
||||
|
||||
static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
|
||||
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
|
||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
|
||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
||||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
||||
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
|
||||
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
|
||||
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
|
||||
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
|
||||
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
|
||||
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
|
||||
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
|
||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
|
||||
{ "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
|
||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
|
||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
||||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
||||
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
|
||||
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
|
||||
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
|
||||
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
|
||||
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
|
||||
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
|
||||
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
|
||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
||||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
};
|
||||
|
||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
|
||||
|
||||
+273
-61
@@ -15,69 +15,281 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
||||
|
||||
**Command line options:**
|
||||
## Usage
|
||||
|
||||
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
|
||||
- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
|
||||
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
|
||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
|
||||
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused
|
||||
- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused
|
||||
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`.
|
||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance.
|
||||
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`
|
||||
- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
|
||||
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
|
||||
- `--numa distribute`: Spread execution evenly over all nodes
|
||||
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
|
||||
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
- `--numa`: Attempt optimizations that may help on some NUMA systems.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`
|
||||
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`
|
||||
- `--port`: Set the port to listen. Default: `8080`
|
||||
- `--path`: Path from which to serve static files. Default: disabled
|
||||
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
|
||||
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
|
||||
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
|
||||
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
|
||||
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
|
||||
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled.
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512`
|
||||
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
|
||||
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
|
||||
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
|
||||
- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
|
||||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
|
||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
|
||||
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
|
||||
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
|
||||
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
|
||||
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
|
||||
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
|
||||
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
|
||||
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
|
||||
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
|
||||
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
|
||||
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
|
||||
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
|
||||
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
|
||||
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||
```
|
||||
usage: ./llama-server [options]
|
||||
|
||||
general:
|
||||
|
||||
-h, --help, --usage print usage and exit
|
||||
--version show version and build info
|
||||
-v, --verbose print verbose information
|
||||
--verbosity N set specific verbosity level (default: 0)
|
||||
--verbose-prompt print a verbose prompt before generation (default: false)
|
||||
--no-display-prompt don't print prompt at generation (default: false)
|
||||
-co, --color colorise output to distinguish prompt and user input from generations (default: false)
|
||||
-s, --seed SEED RNG seed (default: -1, use random seed for < 0)
|
||||
-t, --threads N number of threads to use during generation (default: 8)
|
||||
-tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
|
||||
-td, --threads-draft N number of threads to use during generation (default: same as --threads)
|
||||
-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft)
|
||||
--draft N number of tokens to draft for speculative decoding (default: 5)
|
||||
-ps, --p-split N speculative decoding split probability (default: 0.1)
|
||||
-lcs, --lookup-cache-static FNAME
|
||||
path to static lookup cache to use for lookup decoding (not updated by generation)
|
||||
-lcd, --lookup-cache-dynamic FNAME
|
||||
path to dynamic lookup cache to use for lookup decoding (updated by generation)
|
||||
-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
|
||||
-n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
|
||||
-b, --batch-size N logical maximum batch size (default: 2048)
|
||||
-ub, --ubatch-size N physical maximum batch size (default: 512)
|
||||
--keep N number of tokens to keep from the initial prompt (default: 0, -1 = all)
|
||||
--chunks N max number of chunks to process (default: -1, -1 = all)
|
||||
-fa, --flash-attn enable Flash Attention (default: disabled)
|
||||
-p, --prompt PROMPT prompt to start generation with
|
||||
in conversation mode, this will be used as system prompt
|
||||
(default: '')
|
||||
-f, --file FNAME a file containing the prompt (default: none)
|
||||
--in-file FNAME an input file (repeat to specify multiple files)
|
||||
-bf, --binary-file FNAME binary file containing the prompt (default: none)
|
||||
-e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true)
|
||||
--no-escape do not process escape sequences
|
||||
-ptc, --print-token-count N print token count every N tokens (default: -1)
|
||||
--prompt-cache FNAME file to cache prompt state for faster startup (default: none)
|
||||
--prompt-cache-all if specified, saves user input and generations to cache as well
|
||||
not supported with --interactive or other interactive options
|
||||
--prompt-cache-ro if specified, uses the prompt cache but does not update it
|
||||
-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
|
||||
can be specified more than once for multiple prompts
|
||||
-sp, --special special tokens output enabled (default: false)
|
||||
-cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix
|
||||
if suffix/prefix are not specified, default chat template will be used
|
||||
(default: false)
|
||||
-i, --interactive run in interactive mode (default: false)
|
||||
-if, --interactive-first run in interactive mode and wait for input right away (default: false)
|
||||
-mli, --multiline-input allows you to write or paste multiple lines without ending each in '\'
|
||||
--in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string
|
||||
--in-prefix STRING string to prefix user inputs with (default: empty)
|
||||
--in-suffix STRING string to suffix after user inputs with (default: empty)
|
||||
--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
|
||||
|
||||
sampling:
|
||||
|
||||
--samplers SAMPLERS samplers that will be used for generation in the order, separated by ';'
|
||||
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
|
||||
--sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt)
|
||||
--ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
|
||||
--penalize-nl penalize newline tokens (default: false)
|
||||
--temp N temperature (default: 0.8)
|
||||
--top-k N top-k sampling (default: 40, 0 = disabled)
|
||||
--top-p N top-p sampling (default: 0.9, 1.0 = disabled)
|
||||
--min-p N min-p sampling (default: 0.1, 0.0 = disabled)
|
||||
--tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
|
||||
--typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
|
||||
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
|
||||
--repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
|
||||
--presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
|
||||
--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
|
||||
--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled)
|
||||
--dynatemp-exp N dynamic temperature exponent (default: 1.0)
|
||||
--mirostat N use Mirostat sampling.
|
||||
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
|
||||
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
|
||||
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1)
|
||||
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0)
|
||||
-l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
|
||||
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
|
||||
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
|
||||
--cfg-negative-prompt PROMPT
|
||||
negative prompt to use for guidance (default: '')
|
||||
--cfg-negative-prompt-file FNAME
|
||||
negative prompt file to use for guidance
|
||||
--cfg-scale N strength of guidance (default: 1.0, 1.0 = disable)
|
||||
--chat-template JINJA_TEMPLATE
|
||||
set custom jinja chat template (default: template taken from model's metadata)
|
||||
if suffix/prefix are specified, template will be disabled
|
||||
only commonly used templates are accepted:
|
||||
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||
|
||||
grammar:
|
||||
|
||||
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
|
||||
--grammar-file FNAME file to read grammar from
|
||||
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
|
||||
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
|
||||
|
||||
embedding:
|
||||
|
||||
--pooling {none,mean,cls,last}
|
||||
pooling type for embeddings, use model default if unspecified
|
||||
--attention {causal,non-causal}
|
||||
attention type for embeddings, use model default if unspecified
|
||||
|
||||
context hacking:
|
||||
|
||||
--rope-scaling {none,linear,yarn}
|
||||
RoPE frequency scaling method, defaults to linear unless specified by the model
|
||||
--rope-scale N RoPE context scaling factor, expands context by a factor of N
|
||||
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
|
||||
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
|
||||
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)
|
||||
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
|
||||
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
|
||||
--yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0)
|
||||
--yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0)
|
||||
-gan, --grp-attn-n N group-attention factor (default: 1)
|
||||
-gaw, --grp-attn-w N group-attention width (default: 512.0)
|
||||
-dkvc, --dump-kv-cache verbose print of the KV cache
|
||||
-nkvo, --no-kv-offload disable KV offload
|
||||
-ctk, --cache-type-k TYPE KV cache data type for K (default: f16)
|
||||
-ctv, --cache-type-v TYPE KV cache data type for V (default: f16)
|
||||
|
||||
perplexity:
|
||||
|
||||
--all-logits return logits for all tokens in the batch (default: false)
|
||||
--hellaswag compute HellaSwag score over random tasks from datafile supplied with -f
|
||||
--hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400)
|
||||
--winogrande compute Winogrande score over random tasks from datafile supplied with -f
|
||||
--winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0)
|
||||
--multiple-choice compute multiple choice score over random tasks from datafile supplied with -f
|
||||
--multiple-choice-tasks N
|
||||
number of tasks to use when computing the multiple choice score (default: 0)
|
||||
--kl-divergence computes KL-divergence to logits provided via --kl-divergence-base
|
||||
--ppl-stride N stride for perplexity calculation (default: 0)
|
||||
--ppl-output-type {0,1} output type for perplexity calculation (default: 0)
|
||||
|
||||
parallel:
|
||||
|
||||
-dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
|
||||
-np, --parallel N number of parallel sequences to decode (default: 1)
|
||||
-ns, --sequences N number of sequences to decode (default: 1)
|
||||
-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)
|
||||
|
||||
multi-modality:
|
||||
|
||||
--mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md
|
||||
--image FILE path to an image file. use with multimodal models. Specify multiple times for batching
|
||||
|
||||
backend:
|
||||
|
||||
--rpc SERVERS comma separated list of RPC servers
|
||||
--mlock force system to keep model in RAM rather than swapping or compressing
|
||||
--no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)
|
||||
--numa TYPE attempt optimizations that help on some NUMA systems
|
||||
- distribute: spread execution evenly over all nodes
|
||||
- isolate: only spawn threads on CPUs on the node that execution started on
|
||||
- numactl: use the CPU map provided by numactl
|
||||
if run without this previously, it is recommended to drop the system page cache before using this
|
||||
see https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
|
||||
model:
|
||||
|
||||
--check-tensors check model tensor data for invalid values (default: false)
|
||||
--override-kv KEY=TYPE:VALUE
|
||||
advanced option to override model metadata by key. may be specified multiple times.
|
||||
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
|
||||
--lora FNAME apply LoRA adapter (implies --no-mmap)
|
||||
--lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)
|
||||
--lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter
|
||||
--control-vector FNAME add a control vector
|
||||
note: this argument can be repeated to add multiple control vectors
|
||||
--control-vector-scaled FNAME SCALE
|
||||
add a control vector with user defined scaling SCALE
|
||||
note: this argument can be repeated to add multiple scaled control vectors
|
||||
--control-vector-layer-range START END
|
||||
layer range to apply the control vector(s) to, start and end inclusive
|
||||
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
|
||||
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
|
||||
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
|
||||
-mu, --model-url MODEL_URL model download url (default: unused)
|
||||
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
|
||||
-hff, --hf-file FILE Hugging Face model file (default: unused)
|
||||
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable)
|
||||
|
||||
retrieval:
|
||||
|
||||
--context-file FNAME file to load context from (repeat to specify multiple files)
|
||||
--chunk-size N minimum length of embedded text chunks (default: 64)
|
||||
--chunk-separator STRING
|
||||
separator between chunks (default: '
|
||||
')
|
||||
|
||||
passkey:
|
||||
|
||||
--junk N number of times to repeat the junk text (default: 250)
|
||||
--pos N position of the passkey in the junk text (default: -1)
|
||||
|
||||
imatrix:
|
||||
|
||||
-o, --output FNAME output file (default: 'imatrix.dat')
|
||||
--output-frequency N output the imatrix every N iterations (default: 10)
|
||||
--save-frequency N save an imatrix copy every N iterations (default: 0)
|
||||
--process-output collect data for the output tensor (default: false)
|
||||
--no-ppl do not compute perplexity (default: true)
|
||||
--chunk N start processing the input from chunk N (default: 0)
|
||||
|
||||
bench:
|
||||
|
||||
-pps is the prompt shared across parallel sequences (default: false)
|
||||
-npp n0,n1,... number of prompt tokens
|
||||
-ntg n0,n1,... number of text generation tokens
|
||||
-npl n0,n1,... number of parallel prompts
|
||||
|
||||
embedding:
|
||||
|
||||
--embd-normalize normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||
--embd-output-format empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
||||
--embd-separator separator of embendings (default \n) for example "<#sep#>"
|
||||
|
||||
server:
|
||||
|
||||
--host HOST ip address to listen (default: 127.0.0.1)
|
||||
--port PORT port to listen (default: 8080)
|
||||
--path PATH path to serve static files from (default: )
|
||||
--embedding(s) enable embedding endpoint (default: disabled)
|
||||
--api-key KEY API key to use for authentication (default: none)
|
||||
--api-key-file FNAME path to file containing API keys (default: none)
|
||||
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
|
||||
--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate
|
||||
--timeout N server read/write timeout in seconds (default: 600)
|
||||
--threads-http N number of threads used to process HTTP requests (default: -1)
|
||||
--system-prompt-file FNAME
|
||||
set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
|
||||
--log-format {text,json}
|
||||
log output format: json or text (default: json)
|
||||
--metrics enable prometheus compatible metrics endpoint (default: disabled)
|
||||
--no-slots disables slots monitoring endpoint (default: enabled)
|
||||
--slot-save-path PATH path to save slot kv cache (default: disabled)
|
||||
--chat-template JINJA_TEMPLATE
|
||||
set custom jinja chat template (default: template taken from model's metadata)
|
||||
only commonly used templates are accepted:
|
||||
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||
-sps, --slot-prompt-similarity SIMILARITY
|
||||
how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
|
||||
|
||||
|
||||
logging:
|
||||
|
||||
--simple-io use basic IO for better compatibility in subprocesses and limited consoles
|
||||
-ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset)
|
||||
--log-test Run simple logging test
|
||||
--log-disable Disable trace logs
|
||||
--log-enable Enable trace logs
|
||||
--log-file FNAME Specify a log filename (without extension)
|
||||
--log-new Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
|
||||
--log-append Don't truncate the old log file.
|
||||
|
||||
cvector:
|
||||
|
||||
-o, --output FNAME output file (default: 'control_vector.gguf')
|
||||
--positive-file FNAME positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt')
|
||||
--negative-file FNAME negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt')
|
||||
--pca-batch N batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100)
|
||||
--pca-iter N number of iterations used for PCA (default: 1000)
|
||||
--method {pca,mean} dimensionality reduction method to be used (default: pca)
|
||||
```
|
||||
|
||||
**If compiled with `LLAMA_SERVER_SSL=ON`**
|
||||
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
|
||||
- `--ssl-cert-file FNAME`: path to file a PEM-encoded SSL certificate
|
||||
|
||||
## Build
|
||||
|
||||
|
||||
@@ -737,6 +737,8 @@ struct server_context {
|
||||
slot.ga_n = ga_n;
|
||||
slot.ga_w = ga_w;
|
||||
|
||||
slot.sparams = params.sparams;
|
||||
|
||||
slot.reset();
|
||||
|
||||
slots.push_back(slot);
|
||||
@@ -2003,6 +2005,11 @@ struct server_context {
|
||||
int32_t n_batch = llama_n_batch(ctx);
|
||||
int32_t n_ubatch = llama_n_ubatch(ctx);
|
||||
|
||||
// track if this is an embedding or non-embedding batch
|
||||
// if we've added sampled tokens above, we are in non-embedding mode
|
||||
// -1: none, 0: non-embedding, 1: embedding
|
||||
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
|
||||
|
||||
// next, batch any pending prompts without exceeding n_batch
|
||||
if (params.cont_batching || batch.n_tokens == 0) {
|
||||
for (auto & slot : slots) {
|
||||
@@ -2173,6 +2180,14 @@ struct server_context {
|
||||
}
|
||||
}
|
||||
|
||||
// check that we are in the right batch_type, if not defer the slot
|
||||
bool slot_type = slot.embedding ? 1 : 0;
|
||||
if (batch_type == -1) {
|
||||
batch_type = slot_type;
|
||||
} else if (batch_type != slot_type) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep only the common part
|
||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
||||
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
|
||||
@@ -2274,6 +2289,9 @@ struct server_context {
|
||||
{"n_tokens", batch.n_tokens},
|
||||
});
|
||||
|
||||
// make sure we're in the right embedding mode
|
||||
llama_set_embeddings(ctx, batch_type == 1);
|
||||
|
||||
// process the created batch of tokens
|
||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||
@@ -2988,6 +3006,11 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
|
||||
json data = json::parse(req.body);
|
||||
@@ -3083,6 +3106,11 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
|
||||
|
||||
@@ -3155,6 +3183,11 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding) {
|
||||
res_error(res, format_error_response("This server does not support infill. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
|
||||
json data = json::parse(req.body);
|
||||
@@ -3241,13 +3274,8 @@ int main(int argc, char ** argv) {
|
||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||
};
|
||||
|
||||
const auto handle_embeddings = [¶ms, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
if (!params.embedding) {
|
||||
res.status = 501;
|
||||
res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
|
||||
return;
|
||||
}
|
||||
|
||||
const json body = json::parse(req.body);
|
||||
bool is_openai = false;
|
||||
|
||||
@@ -122,8 +122,26 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
|
||||
|
||||
for (size_t i = 0; i < messages.size(); ++i) {
|
||||
const auto & curr_msg = messages[i];
|
||||
std::string role = json_value(curr_msg, "role", std::string(""));
|
||||
std::string content = json_value(curr_msg, "content", std::string(""));
|
||||
|
||||
std::string role = json_value(curr_msg, "role", std::string(""));
|
||||
|
||||
std::string content;
|
||||
if (curr_msg.contains("content")) {
|
||||
if (curr_msg["content"].is_string()) {
|
||||
content = curr_msg["content"].get<std::string>();
|
||||
} else if (curr_msg["content"].is_array()) {
|
||||
for (const auto & part : curr_msg["content"]) {
|
||||
if (part.contains("text")) {
|
||||
content += "\n" + part["text"].get<std::string>();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
|
||||
}
|
||||
|
||||
chat.push_back({role, content});
|
||||
}
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
|
||||
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
|
||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
fprintf(stream, " --show-count print the total number of tokens.\n");
|
||||
}
|
||||
@@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
// variables where to put any arguments we see.
|
||||
bool printing_ids = false;
|
||||
bool no_bos = false;
|
||||
bool no_parse_special = false;
|
||||
bool disable_logging = false;
|
||||
bool show_token_count = false;
|
||||
const char * model_path = NULL;
|
||||
@@ -229,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
else if (arg == "--no-bos") {
|
||||
no_bos = true;
|
||||
}
|
||||
else if (arg == "--no-parse-special") {
|
||||
no_parse_special = true;
|
||||
}
|
||||
else if (arg == "-p" || arg == "--prompt") {
|
||||
if (prompt_set) {
|
||||
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
|
||||
@@ -359,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
|
||||
const bool model_wants_add_bos = llama_should_add_bos_token(model);
|
||||
const bool add_bos = model_wants_add_bos && !no_bos;
|
||||
const bool parse_special = !no_parse_special;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||
tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
|
||||
|
||||
if (printing_ids) {
|
||||
printf("[");
|
||||
|
||||
Generated
+3
-3
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1720031269,
|
||||
"narHash": "sha256-rwz8NJZV+387rnWpTYcXaRNvzUSnnF9aHONoJIYmiUQ=",
|
||||
"lastModified": 1720768451,
|
||||
"narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "9f4128e00b0ae8ec65918efeba59db998750ead6",
|
||||
"rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
src/ggml-vulkan-shaders.hpp
|
||||
src/ggml-vulkan-shaders.cpp
|
||||
@@ -1,220 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
from tempfile import gettempdir
|
||||
|
||||
logger = logging.getLogger("ggml-vk-generate-shaders")
|
||||
|
||||
GLSLC = "glslc"
|
||||
|
||||
type_names = [
|
||||
"f32",
|
||||
"f16",
|
||||
"q4_0",
|
||||
"q4_1",
|
||||
"q5_0",
|
||||
"q5_1",
|
||||
"q8_0",
|
||||
"q2_k",
|
||||
"q3_k",
|
||||
"q4_k",
|
||||
"q5_k",
|
||||
"q6_k",
|
||||
]
|
||||
|
||||
ASYNCIO_CONCURRENCY = 64
|
||||
|
||||
input_dir = "vulkan-shaders"
|
||||
output_dir = gettempdir()
|
||||
|
||||
lock = asyncio.Lock()
|
||||
shader_fnames = []
|
||||
|
||||
|
||||
async def string_to_spv(name, in_fname, defines, fp16=True):
|
||||
name = f"{name}{'_fp32' if not fp16 else ''}"
|
||||
out_fname = os.path.join(output_dir, f"{name}.spv")
|
||||
|
||||
in_path = os.path.join(input_dir, in_fname)
|
||||
|
||||
cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname]
|
||||
|
||||
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
|
||||
|
||||
stdout, stderr = await proc.communicate()
|
||||
|
||||
stdout = stdout.decode()
|
||||
error = stderr.decode()
|
||||
|
||||
if proc.returncode:
|
||||
cmd = " ".join(cmd)
|
||||
logger.error(f"cannot compile {name}\n\n{cmd}\n\n{error}")
|
||||
return
|
||||
|
||||
async with lock:
|
||||
shader_fnames.append((name, out_fname))
|
||||
|
||||
|
||||
def matmul_shaders(tasks, fp16, matmul_id):
|
||||
if fp16:
|
||||
load_vec = "8"
|
||||
aligned_b_type_f32 = "mat2x4"
|
||||
aligned_b_type_f16 = "f16mat2x4"
|
||||
else:
|
||||
load_vec = "4"
|
||||
aligned_b_type_f32 = "vec4"
|
||||
aligned_b_type_f16 = "f16vec4"
|
||||
|
||||
base_dict = {"FLOAT_TYPE": "float" if not fp16 else "float16_t"}
|
||||
shader_name = "matmul"
|
||||
|
||||
if matmul_id:
|
||||
base_dict["MUL_MAT_ID"] = "1"
|
||||
shader_name = "matmul_id"
|
||||
|
||||
if fp16:
|
||||
base_dict["FLOAT16"] = "1"
|
||||
|
||||
# Shaders with f16 B_TYPE
|
||||
tasks.append(string_to_spv(f"{shader_name}_f32_f16", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv(f"{shader_name}_f32_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
|
||||
|
||||
tasks.append(string_to_spv(f"{shader_name}_f16", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv(f"{shader_name}_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
|
||||
|
||||
for tname in type_names:
|
||||
data_a_key = f"DATA_A_{tname.upper()}"
|
||||
load_vec_a = load_vec if tname in ("f32", "f16") else "2"
|
||||
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32", "mul_mm.comp", base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32_aligned", "mul_mm.comp", base_dict | {data_a_key: "2", "LOAD_VEC_A": load_vec_a, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f32, "D_TYPE": "float"}, fp16))
|
||||
|
||||
|
||||
async def main():
|
||||
logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
|
||||
|
||||
tasks = []
|
||||
|
||||
base_dict = {"FLOAT_TYPE": "float"}
|
||||
|
||||
for fp16 in (False, True):
|
||||
# MUL_MAT
|
||||
matmul_shaders(tasks, fp16, False)
|
||||
# MUL_MAT_ID
|
||||
matmul_shaders(tasks, fp16, True)
|
||||
|
||||
for tname in type_names:
|
||||
# mul mat vec
|
||||
data_a_key = f"DATA_A_{tname.upper()}"
|
||||
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"
|
||||
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f32_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f16_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float16_t", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_id_{tname}_f32", shader, base_dict | {"MUL_MAT_ID": "1", data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
# Dequant shaders
|
||||
if tname != "f16":
|
||||
tasks.append(string_to_spv(f"dequant_{tname}", f"dequant_{tname}.comp", base_dict | {data_a_key: "1", "D_TYPE": "float16_t"}))
|
||||
|
||||
# get_rows
|
||||
if not tname.endswith("_k"):
|
||||
shader = "get_rows.comp" if tname in ("f32", "f16") else "get_rows_quant.comp"
|
||||
|
||||
if tname == "f16":
|
||||
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
|
||||
else:
|
||||
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv(f"get_rows_{tname}_f32", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
# Norms
|
||||
tasks.append(string_to_spv("norm_f32", "norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rms_norm_f32", "rms_norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("cpy_f32_f32", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("cpy_f32_f16", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("cpy_f16_f16", "copy.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
|
||||
|
||||
tasks.append(string_to_spv("add_f32", "add.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}))
|
||||
|
||||
tasks.append(string_to_spv("mul_f32", "mul.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("div_f32", "div.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("scale_f32", "scale.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("sqr_f32", "square.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("clamp_f32", "clamp.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("gelu_f32", "gelu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("silu_f32", "silu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("relu_f32", "relu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("soft_max_f32", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("soft_max_f32_f16", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("rope_norm_f32", "rope_norm.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_norm_f16", "rope_norm.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("rope_neox_f32", "rope_neox.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_neox_f16", "rope_neox.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("argsort_f32", "argsort.comp", {"A_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("sum_rows_f32", "sum_rows.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
# Helper to decorate tasks with semaphore acquisition.
|
||||
async def withSemaphore(sem, task):
|
||||
async with sem:
|
||||
return await task
|
||||
|
||||
# Run tasks concurrently guarded by a concurrency limit.
|
||||
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
|
||||
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
|
||||
|
||||
with open("ggml-vulkan-shaders.hpp", "w") as f:
|
||||
f.write("#include <cstdint>\n\n")
|
||||
for name, path in sorted(shader_fnames):
|
||||
|
||||
with open(path, "rb") as spv:
|
||||
counter = 0
|
||||
newline_counter = 0
|
||||
f.write(f"unsigned char {name}_data[] = {{\n")
|
||||
for val in spv.read():
|
||||
f.write(f"0x{val:02x},")
|
||||
newline_counter += 1
|
||||
counter += 1
|
||||
if newline_counter >= 12:
|
||||
newline_counter = 0
|
||||
f.write("\n")
|
||||
f.write("\n};\n")
|
||||
f.write(f"const uint64_t {name}_len = {counter};\n\n")
|
||||
os.remove(path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
|
||||
|
||||
parser.add_argument("--glslc", help="Path to glslc")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
if args.glslc:
|
||||
GLSLC = args.glslc
|
||||
|
||||
asyncio.run(main())
|
||||
+25
-25
@@ -714,9 +714,9 @@ extern "C" {
|
||||
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
|
||||
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
|
||||
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
|
||||
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
|
||||
GGML_DEPRECATED(
|
||||
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
||||
@@ -2410,31 +2410,31 @@ extern "C" {
|
||||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
|
||||
int64_t k, int64_t bx);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_from_float_to_mat_t)
|
||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
typedef struct {
|
||||
const char * type_name;
|
||||
int blck_size;
|
||||
size_t type_size;
|
||||
bool is_quantized;
|
||||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_t from_float_reference;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously;
|
||||
int64_t ncols; // number of columns to process simultaneously;
|
||||
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
|
||||
const char * type_name;
|
||||
int64_t blck_size;
|
||||
int64_t blck_size_interleave; // interleave elements in blocks
|
||||
size_t type_size;
|
||||
bool is_quantized;
|
||||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_t from_float_ref;
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
} ggml_type_traits_t;
|
||||
|
||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
|
||||
+32
-5
@@ -527,14 +527,11 @@ if (GGML_RPC)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN)
|
||||
find_package(Vulkan)
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
|
||||
if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
set(GGML_HEADERS_VULKAN ../include/ggml-vulkan.h)
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
|
||||
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN)
|
||||
|
||||
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
|
||||
@@ -563,7 +560,37 @@ if (GGML_VULKAN)
|
||||
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
|
||||
endif()
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
|
||||
add_subdirectory(vulkan-shaders)
|
||||
|
||||
set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
|
||||
set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
|
||||
set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
|
||||
set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
|
||||
set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
|
||||
|
||||
file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${_ggml_vk_header}
|
||||
${_ggml_vk_source}
|
||||
|
||||
COMMAND ${_ggml_vk_genshaders_cmd}
|
||||
--glslc ${Vulkan_GLSLC_EXECUTABLE}
|
||||
--input-dir ${_ggml_vk_input_dir}
|
||||
--output-dir ${_ggml_vk_output_dir}
|
||||
--target-hpp ${_ggml_vk_header}
|
||||
--target-cpp ${_ggml_vk_source}
|
||||
--no-clean
|
||||
|
||||
DEPENDS ${_ggml_vk_shader_deps}
|
||||
COMMENT "Generate vulkan shaders"
|
||||
)
|
||||
|
||||
set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
else()
|
||||
message(WARNING "Vulkan not found")
|
||||
endif()
|
||||
|
||||
+38
-32
@@ -14,25 +14,27 @@
|
||||
|
||||
#include "ggml-aarch64.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// Functions to create the interleaved data layout formats
|
||||
|
||||
// interleave 4 block_q4_0s in blocks of interleave_blcksize
|
||||
// interleave 4 block_q4_0s in blocks of blck_size_interleave
|
||||
// returns an interleaved block_q4_0x4
|
||||
// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
|
||||
// first, then interleave quants from 4 block_q4_0s in blocks of interleave_blcksize
|
||||
// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
|
||||
//
|
||||
// - in : an array of block_q4_0 pointers
|
||||
// - interleave_blcksize : the block_q4_0 quants bytes are interleaved in blocks of
|
||||
// interleave_blcksize bytes
|
||||
// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
|
||||
// blck_size_interleave bytes
|
||||
// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
|
||||
// from bias offset form to pure sign form (this saves subtract
|
||||
// operations durin unpacking)
|
||||
//
|
||||
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) {
|
||||
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
|
||||
block_q4_0x4 out;
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
@@ -40,9 +42,9 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_b
|
||||
}
|
||||
|
||||
for (int i = 0; i < QK4_0 * 2; i++) {
|
||||
int src_offset = (i / (4 * interleave_blcksize)) * interleave_blcksize;
|
||||
int src_id = (i % (4 * interleave_blcksize)) / interleave_blcksize;
|
||||
src_offset += (i % interleave_blcksize);
|
||||
int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
|
||||
int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
|
||||
src_offset += (i % blck_size_interleave);
|
||||
|
||||
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
|
||||
}
|
||||
@@ -50,11 +52,11 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int interleave_b
|
||||
return out;
|
||||
}
|
||||
|
||||
// interleave 8 block_q4_0s in blocks of interleave_blcksize
|
||||
// interleave 8 block_q4_0s in blocks of blck_size_interleave
|
||||
// returns an interleaved block_q4_0x8
|
||||
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
|
||||
// first, then interleave quants from 8 block_q4_0s in blocks of interleave_blcksize
|
||||
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int interleave_blcksize, unsigned int xor_mask) {
|
||||
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
|
||||
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
|
||||
block_q4_0x8 out;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
@@ -62,9 +64,9 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int interleave_b
|
||||
}
|
||||
|
||||
for (int i = 0; i < QK4_0 * 4; i++) {
|
||||
int src_offset = (i / (8 * interleave_blcksize)) * interleave_blcksize;
|
||||
int src_id = (i % (8 * interleave_blcksize)) / interleave_blcksize;
|
||||
src_offset += (i % interleave_blcksize);
|
||||
int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
|
||||
int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
|
||||
src_offset += (i % blck_size_interleave);
|
||||
|
||||
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
|
||||
}
|
||||
@@ -135,7 +137,7 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k)
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
const int interleave_blcksize = 4;
|
||||
const int blck_size_interleave = 4;
|
||||
float srcv[4][QK8_0];
|
||||
float id[4];
|
||||
|
||||
@@ -155,12 +157,12 @@ void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k)
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK8_0 * 4; j++) {
|
||||
int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize;
|
||||
int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize;
|
||||
src_offset += (j % interleave_blcksize);
|
||||
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
||||
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
||||
src_offset += (j % blck_size_interleave);
|
||||
|
||||
float x0 = srcv[src_id][src_offset] * id[src_id];
|
||||
y[i].qs[j] = roundf(x0);;
|
||||
y[i].qs[j] = roundf(x0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -253,7 +255,7 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
const int interleave_blcksize = 8;
|
||||
const int blck_size_interleave = 8;
|
||||
float srcv[4][QK8_0];
|
||||
float id[4];
|
||||
|
||||
@@ -273,26 +275,30 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK8_0 * 4; j++) {
|
||||
int src_offset = (j / (4 * interleave_blcksize)) * interleave_blcksize;
|
||||
int src_id = (j % (4 * interleave_blcksize)) / interleave_blcksize;
|
||||
src_offset += (j % interleave_blcksize);
|
||||
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
||||
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
||||
src_offset += (j % blck_size_interleave);
|
||||
|
||||
float x0 = srcv[src_id][src_offset] * id[src_id];
|
||||
y[i].qs[j] = roundf(x0);;
|
||||
y[i].qs[j] = roundf(x0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t interleave_blcksize) {
|
||||
void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
|
||||
assert(nrow == 4);
|
||||
UNUSED(nrow);
|
||||
if (interleave_blcksize == 4) quantize_q8_0_4x4(x, vy, n_per_row);
|
||||
else if (interleave_blcksize == 8) quantize_q8_0_4x8(x, vy, n_per_row);
|
||||
else assert(false);
|
||||
if (blck_size_interleave == 4) {
|
||||
quantize_q8_0_4x4(x, vy, n_per_row);
|
||||
} else if (blck_size_interleave == 8) {
|
||||
quantize_q8_0_4x8(x, vy, n_per_row);
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int interleave_blcksize) {
|
||||
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
|
||||
assert(n_per_row % QK4_0 == 0);
|
||||
const int nb = n_per_row / QK4_0;
|
||||
|
||||
@@ -311,15 +317,15 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
|
||||
for (int64_t x = 0; x < nb; x++) {
|
||||
|
||||
for (int i = 0; i < nrows_interleaved; i++ ) {
|
||||
quantize_row_q4_0_reference(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
|
||||
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
|
||||
}
|
||||
|
||||
if (nrows_interleaved == 8) {
|
||||
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, interleave_blcksize, 0x88);
|
||||
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
|
||||
out_ptr = (block_q4_0x8 *) out_ptr + 1;
|
||||
}
|
||||
else if (nrows_interleaved == 4) {
|
||||
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, interleave_blcksize, 0x88);
|
||||
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
|
||||
out_ptr = (block_q4_0x4 *) out_ptr + 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ extern "C" {
|
||||
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize);
|
||||
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
||||
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
@@ -24,14 +24,14 @@ size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT d
|
||||
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// GEMV
|
||||
void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// GEMM
|
||||
void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -394,7 +394,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
|
||||
|
||||
// backend registry
|
||||
|
||||
#define GGML_REG_MAX_BACKENDS 16
|
||||
#define GGML_REG_MAX_BACKENDS 64
|
||||
|
||||
struct ggml_backend_reg {
|
||||
char name[128];
|
||||
|
||||
@@ -8,11 +8,12 @@
|
||||
# include <Accelerate/Accelerate.h>
|
||||
#elif defined(GGML_BLAS_USE_MKL)
|
||||
# include <mkl.h>
|
||||
#elif defined(GGML_BLAS_USE_BLIS)
|
||||
# include <blis.h>
|
||||
#elif defined(GGML_BLAS_USE_NVPL)
|
||||
# include <nvpl_blas.h>
|
||||
#else
|
||||
# include <cblas.h>
|
||||
# ifdef BLIS_ENABLE_CBLAS
|
||||
# include <blis.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
struct ggml_backend_blas_context {
|
||||
@@ -140,10 +141,14 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
|
||||
openblas_set_num_threads(ctx->n_threads);
|
||||
#endif
|
||||
|
||||
#if defined(BLIS_ENABLE_CBLAS)
|
||||
#if defined(GGML_BLAS_USE_BLIS)
|
||||
bli_thread_set_num_threads(ctx->n_threads);
|
||||
#endif
|
||||
|
||||
#if defined(GGML_BLAS_USE_NVPL)
|
||||
nvpl_blas_set_num_threads(ctx->n_threads);
|
||||
#endif
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
const int64_t i03 = i13/r3;
|
||||
|
||||
@@ -1876,7 +1876,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
|
||||
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
||||
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
|
||||
&& src1->ne[1] == 1;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||
|
||||
@@ -104,7 +104,7 @@
|
||||
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaSuccess hipSuccess
|
||||
#define __trap abort
|
||||
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
||||
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
||||
|
||||
@@ -70,6 +70,10 @@ struct mma_int_A_I16K8 {
|
||||
}
|
||||
#endif // defined(INT8_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void load_low(const int * __restrict__ xs0, const int & stride) {
|
||||
((mma_int_A_I16K4 *) x)[0].load(xs0, stride);
|
||||
}
|
||||
};
|
||||
|
||||
struct mma_int_B_J8K4 {
|
||||
|
||||
+727
-638
File diff suppressed because it is too large
Load Diff
@@ -37,47 +37,92 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
||||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
}
|
||||
|
||||
template <bool need_sum>
|
||||
template <mmq_q8_1_ds_layout ds_layout>
|
||||
static __global__ void quantize_mmq_q8_1(
|
||||
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
|
||||
|
||||
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
|
||||
constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;
|
||||
|
||||
const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
|
||||
|
||||
if (ix0 >= kx0_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float4 * x4 = (const float4 *) x;
|
||||
|
||||
const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
|
||||
|
||||
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
|
||||
|
||||
const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
|
||||
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
|
||||
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
|
||||
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
|
||||
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
|
||||
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
|
||||
|
||||
const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
|
||||
float amax = fabsf(xi);
|
||||
// Load 4 floats per thread and calculate max. abs. value between them:
|
||||
const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
float amax = fabsf(xi.x);
|
||||
amax = fmaxf(amax, fabsf(xi.y));
|
||||
amax = fmaxf(amax, fabsf(xi.z));
|
||||
amax = fmaxf(amax, fabsf(xi.w));
|
||||
|
||||
amax = warp_reduce_max(amax);
|
||||
|
||||
float sum;
|
||||
if (need_sum) {
|
||||
sum = warp_reduce_sum(xi);
|
||||
// Exchange max. abs. value between vals_per_scale/4 threads.
|
||||
#pragma unroll
|
||||
for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) {
|
||||
amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
|
||||
}
|
||||
|
||||
const float d = amax / 127;
|
||||
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
|
||||
float sum;
|
||||
if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
|
||||
sum = xi.x + xi.y + xi.z + xi.w;
|
||||
|
||||
y[ib].qs[iqs] = q;
|
||||
// Exchange calculate sum across vals_per_sum/4 threads.
|
||||
#pragma unroll
|
||||
for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) {
|
||||
sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
const float d_inv = 127.0f / amax;
|
||||
char4 q;
|
||||
q.x = roundf(xi.x*d_inv);
|
||||
q.y = roundf(xi.y*d_inv);
|
||||
q.z = roundf(xi.z*d_inv);
|
||||
q.w = roundf(xi.w*d_inv);
|
||||
|
||||
// Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
|
||||
char4 * yqs4 = (char4 *) y[ib].qs;
|
||||
yqs4[iqs/4] = q;
|
||||
|
||||
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6) {
|
||||
if (iqs % 16 != 0 || iqs >= 96) {
|
||||
return;
|
||||
}
|
||||
|
||||
y[ib].d2s6[2 + iqs/16] = sum;
|
||||
|
||||
if (iqs % 64 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float d = 1.0f / d_inv;
|
||||
|
||||
y[ib].d2s6[iqs/64] = d;
|
||||
|
||||
if (iqs % QK8_1 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (need_sum) {
|
||||
y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
|
||||
if (iqs % 32 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float d = 1.0f / d_inv;
|
||||
|
||||
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
|
||||
y[ib].ds4[iqs/32] = make_half2(d, sum);
|
||||
} else {
|
||||
((float *) y[ib].ds)[iqs/QK8_1] = d;
|
||||
y[ib].d4[iqs/32] = d;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,12 +146,24 @@ void quantize_mmq_q8_1_cuda(
|
||||
|
||||
GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
|
||||
|
||||
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
|
||||
const dim3 num_blocks(block_num_x, kx1, channels);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
||||
if (mmq_need_sum(type_x)) {
|
||||
quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
} else {
|
||||
quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
|
||||
switch (mmq_get_q8_1_ds_layout(type_x)) {
|
||||
case MMQ_Q8_1_DS_LAYOUT_D4:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
case MMQ_Q8_1_DS_LAYOUT_DS4:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
case MMQ_Q8_1_DS_LAYOUT_D2S6:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,11 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
|
||||
|
||||
static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
|
||||
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
|
||||
|
||||
typedef void (*quantize_cuda_t)(
|
||||
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
|
||||
|
||||
@@ -189,7 +189,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
||||
}
|
||||
|
||||
#define VDR_Q2_K_Q8_1_MMVQ 1
|
||||
#define VDR_Q2_K_Q8_1_MMQ 2
|
||||
#define VDR_Q2_K_Q8_1_MMQ 4
|
||||
|
||||
// contiguous v/x values
|
||||
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
||||
@@ -219,32 +219,56 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
||||
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
template <int ns8>
|
||||
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {
|
||||
const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8, const half2 * s8) {
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
float sumf_m = 0.0f;
|
||||
float sumf = 0.0f;
|
||||
float sumf_d8 = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
|
||||
const float2 dm2f = __half22float2(dm2[i0/(QI8_1/2)]);
|
||||
int sumi_d = 0;
|
||||
int sumi_m = 0;
|
||||
for (int i0 = 0; i0 < QR2_K*VDR_Q2_K_Q8_1_MMQ; i0 += QI8_1) {
|
||||
const float2 dm2f0 = __half22float2(dm2[i0/(QI8_1/2) + 0]);
|
||||
int sumi_d0 = 0;
|
||||
|
||||
const float2 dm2f1 = __half22float2(dm2[i0/(QI8_1/2) + 1]);
|
||||
int sumi_d1 = 0;
|
||||
|
||||
const int vi0 = v[i0/(QI8_1/2)];
|
||||
#pragma unroll
|
||||
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
||||
const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
|
||||
sumi_d = ggml_cuda_dp4a(vi, u[i], sumi_d); // SIMD dot product
|
||||
sumi_m = ggml_cuda_dp4a(0x01010101, u[i], sumi_m);
|
||||
sumi_d0 = ggml_cuda_dp4a(v[i], u[i], sumi_d0);
|
||||
}
|
||||
sumf_d8 += dm2f0.x * sumi_d0;
|
||||
|
||||
sumf_d += dm2f.x * sumi_d;
|
||||
sumf_m += dm2f.y * sumi_m;
|
||||
#pragma unroll
|
||||
for (int i = i0 + QI8_1/2; i < i0 + QI8_1; ++i) {
|
||||
sumi_d1 = ggml_cuda_dp4a(v[i], u[i], sumi_d1);
|
||||
}
|
||||
sumf_d8 += dm2f1.x * sumi_d1;
|
||||
|
||||
if (i0/QI8_1 < ns8) {
|
||||
const float2 s8f = __half22float2(s8[i0/QI8_1]);
|
||||
sumf -= dm2f0.y*s8f.x;
|
||||
sumf -= dm2f1.y*s8f.y;
|
||||
} else {
|
||||
int sumi_m0 = 0;
|
||||
#pragma unroll
|
||||
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
||||
sumi_m0 = ggml_cuda_dp4a(0x01010101, u[i], sumi_m0);
|
||||
}
|
||||
sumf_d8 -= dm2f0.y * sumi_m0;
|
||||
|
||||
int sumi_m1 = 0;
|
||||
#pragma unroll
|
||||
for (int i = i0 + QI8_1/2; i < i0 + QI8_1; ++i) {
|
||||
sumi_m1 = ggml_cuda_dp4a(0x01010101, u[i], sumi_m1);
|
||||
}
|
||||
sumf_d8 -= dm2f1.y * sumi_m1;
|
||||
}
|
||||
}
|
||||
|
||||
return d8*(sumf_d - sumf_m);
|
||||
return sumf + d8*sumf_d8;
|
||||
}
|
||||
|
||||
#define VDR_Q3_K_Q8_1_MMVQ 1
|
||||
@@ -283,7 +307,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
||||
return d3 * sumf;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
||||
const float & d3, const float & d8) {
|
||||
@@ -296,8 +320,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
||||
|
||||
#pragma unroll
|
||||
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
||||
const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
|
||||
sumi_sc = ggml_cuda_dp4a(vi, u[i], sumi_sc); // SIMD dot product
|
||||
sumi_sc = ggml_cuda_dp4a(v[i], u[i], sumi_sc); // SIMD dot product
|
||||
}
|
||||
|
||||
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
|
||||
@@ -334,7 +357,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
||||
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
||||
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
||||
@@ -397,7 +420,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
||||
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
||||
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
||||
@@ -451,13 +474,16 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
||||
return d*sumf;
|
||||
}
|
||||
|
||||
// contiguous u/y values
|
||||
// contiguous v/x + u/y values
|
||||
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
||||
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
|
||||
const float & d6, const float * __restrict__ d8) {
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
|
||||
const int sc_packed = get_int_b4(sc, 0);
|
||||
const int8_t * sc_reg = (const int8_t *) &sc_packed;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
|
||||
int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
|
||||
@@ -471,7 +497,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
||||
sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
|
||||
}
|
||||
|
||||
sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
|
||||
sumf_d += d8[i0/4] * (sc_reg[i0/2+0]*sumi_d.x + sc_reg[i0/2+1]*sumi_d.y);
|
||||
}
|
||||
|
||||
return d6 * sumf_d;
|
||||
|
||||
+14
-14
@@ -193,16 +193,16 @@ enum ggml_metal_kernel_type {
|
||||
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
|
||||
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_CONCAT,
|
||||
GGML_METAL_KERNEL_TYPE_SQR,
|
||||
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
|
||||
@@ -651,14 +651,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, cpy_f32_q5_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, cpy_f32_q5_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
|
||||
@@ -810,8 +810,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
@@ -824,8 +824,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
}
|
||||
case GGML_TYPE_F16:
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -837,7 +837,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
|
||||
return op->ne[3] == 1;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
@@ -1580,8 +1580,8 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
// some Metal matrix data types require aligned pointers
|
||||
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
|
||||
case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
|
||||
case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
|
||||
case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
@@ -2775,8 +2775,8 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
|
||||
|
||||
switch (dstt) {
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break;
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break;
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
|
||||
@@ -2789,8 +2789,8 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
switch (dstt) {
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
};
|
||||
} break;
|
||||
|
||||
+176
-547
@@ -1219,9 +1219,10 @@ kernel void kernel_mul_mv_q8_0_f32(
|
||||
kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
#define N_F32_F32 4
|
||||
#define N_MV_T_T 4
|
||||
|
||||
void kernel_mul_mv_f32_f32_impl(
|
||||
template<typename T0, typename T04, typename T1, typename T14>
|
||||
void kernel_mul_mv_impl(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -1239,13 +1240,12 @@ void kernel_mul_mv_f32_f32_impl(
|
||||
uint64_t nb12,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
uint r2,
|
||||
uint r3,
|
||||
uint3 tgpig,
|
||||
uint tiisg) {
|
||||
|
||||
uint r2,
|
||||
uint r3,
|
||||
uint3 tgpig,
|
||||
uint tiisg) {
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t rb = tgpig.y*N_F32_F32;
|
||||
const int64_t rb = tgpig.y*N_MV_T_T;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
@@ -1253,20 +1253,20 @@ void kernel_mul_mv_f32_f32_impl(
|
||||
|
||||
const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
|
||||
|
||||
device const float * x = (device const float *) (src0 + offset0);
|
||||
device const T0 * x = (device const T0 *) (src0 + offset0);
|
||||
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F32_F32; ++row) {
|
||||
for (int row = 0; row < N_MV_T_T; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
device const T1 * y = (device const T1 *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00; i += 32) {
|
||||
sumf += (float) x[i] * (float) y[i];
|
||||
sumf += (T0) x[i] * (T1) y[i];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
@@ -1275,32 +1275,32 @@ void kernel_mul_mv_f32_f32_impl(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
device const float4 * x4 = (device const float4 *)x;
|
||||
for (int row = 0; row < N_F32_F32; ++row) {
|
||||
device const T04 * x4 = (device const T04 *) x;
|
||||
for (int row = 0; row < N_MV_T_T; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
device const float4 * y4 = (device const float4 *) y;
|
||||
device const T1 * y = (device const T1 *) (src1 + r1*nb11 + im*nb12);
|
||||
device const T14 * y4 = (device const T14 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) (x4[i][k] * y4[i][k]);
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) (x[i] * y[i]);
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_f32_f32")]]
|
||||
kernel void kernel_mul_mv_f32_f32(
|
||||
template<typename T0, typename T04, typename T1, typename T14>
|
||||
kernel void kernel_mul_mv(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -1322,90 +1322,38 @@ kernel void kernel_mul_mv_f32_f32(
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
|
||||
kernel_mul_mv_impl<T0, T04, T1, T14>(
|
||||
src0,
|
||||
src1,
|
||||
dst,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
nb00,
|
||||
nb01,
|
||||
nb02,
|
||||
ne10,
|
||||
ne11,
|
||||
ne12,
|
||||
nb10,
|
||||
nb11,
|
||||
nb12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
tgpig,
|
||||
tiisg);
|
||||
}
|
||||
|
||||
#define N_F16_F16 4
|
||||
typedef decltype(kernel_mul_mv<half, half4, half, half4>) mul_mv_t;
|
||||
|
||||
kernel void kernel_mul_mv_f16_f16(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
template [[host_name("kernel_mul_mv_f32_f32")]] kernel mul_mv_t kernel_mul_mv<float, float4, float, float4>;
|
||||
template [[host_name("kernel_mul_mv_f16_f32")]] kernel mul_mv_t kernel_mul_mv<half, half4, float, float4>;
|
||||
template [[host_name("kernel_mul_mv_f16_f16")]] kernel mul_mv_t kernel_mul_mv<half, half4, half, half4>;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t rb = tgpig.y*N_F16_F16;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
|
||||
|
||||
device const half * x = (device const half *) (src0 + offset0);
|
||||
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F16_F16; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00; i += 32) {
|
||||
sumf += (half) x[i] * (half) y[i];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
device const half4 * x4 = (device const half4 *)x;
|
||||
for (int row = 0; row < N_F16_F16; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12);
|
||||
device const half4 * y4 = (device const half4 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i];
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_mul_mv_f16_f32_1row_impl(
|
||||
template<typename T, typename T4>
|
||||
kernel void kernel_mul_mv_1row(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -1437,7 +1385,7 @@ void kernel_mul_mv_f16_f32_1row_impl(
|
||||
|
||||
const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
|
||||
|
||||
device const half * x = (device const half *) (src0 + offset0);
|
||||
device const T * x = (device const T *) (src0 + offset0);
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
float sumf = 0;
|
||||
@@ -1450,153 +1398,29 @@ void kernel_mul_mv_f16_f32_1row_impl(
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
} else {
|
||||
device const half4 * x4 = (device const half4 *) x;
|
||||
device const T4 * x4 = (device const T4 *) x;
|
||||
device const float4 * y4 = (device const float4 *) y;
|
||||
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) (x4[i][k] * y4[i][k]);
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
|
||||
if (tiisg == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) (x[i] * y[i]);
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_f16_f32_1row")]]
|
||||
kernel void kernel_mul_mv_f16_f32_1row(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
|
||||
}
|
||||
typedef decltype(kernel_mul_mv_1row<half, half4>) mul_mv_1row_t;
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
void kernel_mul_mv_f16_f32_impl(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
uint64_t nb00,
|
||||
uint64_t nb01,
|
||||
uint64_t nb02,
|
||||
int64_t ne10,
|
||||
int64_t ne11,
|
||||
int64_t ne12,
|
||||
uint64_t nb10,
|
||||
uint64_t nb11,
|
||||
uint64_t nb12,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
uint r2,
|
||||
uint r3,
|
||||
uint3 tgpig,
|
||||
uint tiisg) {
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t rb = tgpig.y*N_F16_F32;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
|
||||
|
||||
device const half * x = (device const half *) (src0 + offset0);
|
||||
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F16_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00; i += 32) {
|
||||
sumf += (float) x[i] * (float) y[i];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
device const half4 * x4 = (device const half4 *)x;
|
||||
for (int row = 0; row < N_F16_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
device const float4 * y4 = (device const float4 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_f16_f32")]]
|
||||
kernel void kernel_mul_mv_f16_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
|
||||
}
|
||||
template [[host_name("kernel_mul_mv_f16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row<half, half4>;
|
||||
|
||||
// Assumes row size (ne00) is a multiple of 4
|
||||
kernel void kernel_mul_mv_f16_f32_l4(
|
||||
template<typename T, typename T4>
|
||||
kernel void kernel_mul_mv_l4(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -1628,14 +1452,14 @@ kernel void kernel_mul_mv_f16_f32_l4(
|
||||
|
||||
const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
|
||||
|
||||
device const half4 * x4 = (device const half4 *) (src0 + offset0);
|
||||
device const T4 * x4 = (device const T4 *) (src0 + offset0);
|
||||
|
||||
for (int r1 = 0; r1 < nrows; ++r1) {
|
||||
device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) (x4[i][k] * y4[i][k]);
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
@@ -1645,6 +1469,10 @@ kernel void kernel_mul_mv_f16_f32_l4(
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_mul_mv_l4<half, half4>) mul_mv_l4_t;
|
||||
|
||||
template [[host_name("kernel_mul_mv_f16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4<half, half4>;
|
||||
|
||||
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
@@ -2765,9 +2593,10 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
|
||||
//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
|
||||
|
||||
kernel void kernel_cpy_f16_f16(
|
||||
device const half * src0,
|
||||
device half * dst,
|
||||
template<typename T0, typename T1>
|
||||
kernel void kernel_cpy(
|
||||
device const void * src0,
|
||||
device void * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -2798,138 +2627,20 @@ kernel void kernel_cpy_f16_f16(
|
||||
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
device T1 * dst_data = (device T1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||
device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = src[0];
|
||||
device const T0 * src = (device T0 *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = (T1) src[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_f16_f32(
|
||||
device const half * src0,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i03 = tgpig[2];
|
||||
const int64_t i02 = tgpig[1];
|
||||
const int64_t i01 = tgpig[0];
|
||||
typedef decltype(kernel_cpy<float, float>) kernel_cpy_t;
|
||||
|
||||
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
const int64_t i3 = n / (ne2*ne1*ne0);
|
||||
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||
device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_f32_f16(
|
||||
device const float * src0,
|
||||
device half * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i03 = tgpig[2];
|
||||
const int64_t i02 = tgpig[1];
|
||||
const int64_t i01 = tgpig[0];
|
||||
|
||||
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
const int64_t i3 = n / (ne2*ne1*ne0);
|
||||
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cpy_f32_f32(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i03 = tgpig[2];
|
||||
const int64_t i02 = tgpig[1];
|
||||
const int64_t i01 = tgpig[0];
|
||||
|
||||
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
const int64_t i3 = n / (ne2*ne1*ne0);
|
||||
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
|
||||
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
|
||||
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
|
||||
dst_data[i00] = src[0];
|
||||
}
|
||||
}
|
||||
template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy<float, float>;
|
||||
template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy<float, half>;
|
||||
template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy<half, half>;
|
||||
template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy<half, float>;
|
||||
|
||||
kernel void kernel_cpy_f32_q8_0(
|
||||
device const float * src0,
|
||||
@@ -5730,9 +5441,9 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
kernel void kernel_get_rows(
|
||||
kernel void kernel_get_rows_q(
|
||||
device const void * src0,
|
||||
device const char * src1,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant uint64_t & nb01,
|
||||
@@ -5745,27 +5456,24 @@ kernel void kernel_get_rows(
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg [[threads_per_threadgroup]]) {
|
||||
//const int64_t i = tgpig;
|
||||
//const int64_t r = ((device int32_t *) src1)[i];
|
||||
|
||||
const int64_t i10 = tgpig.x;
|
||||
const int64_t i11 = tgpig.y;
|
||||
|
||||
const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
const int64_t r = ((const device int32_t *) ((const device char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
const int64_t i02 = i11;
|
||||
|
||||
for (int64_t ind = tiitg; ind < ne00/16; ind += tptg.x) {
|
||||
float4x4 temp;
|
||||
dequantize_func(
|
||||
((device const block_q *) ((device char *) src0 + r*nb01 + i02*nb02)) + ind/nl, ind%nl, temp);
|
||||
dequantize_func(((device const block_q *) ((const device char *) src0 + r*nb01 + i02*nb02)) + ind/nl, ind%nl, temp);
|
||||
*(((device float4x4 *) ((device char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_f32(
|
||||
template<typename T>
|
||||
kernel void kernel_get_rows_f(
|
||||
device const void * src0,
|
||||
device const char * src1,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant uint64_t & nb01,
|
||||
@@ -5781,47 +5489,19 @@ kernel void kernel_get_rows_f32(
|
||||
const int64_t i10 = tgpig.x;
|
||||
const int64_t i11 = tgpig.y;
|
||||
|
||||
const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
const int64_t r = ((const device int32_t *) ((const device char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
const int64_t i02 = i11;
|
||||
|
||||
for (int ind = tiitg; ind < ne00; ind += tptg.x) {
|
||||
((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
|
||||
((device float *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_f16(
|
||||
device const void * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg [[threads_per_threadgroup]]) {
|
||||
const int64_t i10 = tgpig.x;
|
||||
const int64_t i11 = tgpig.y;
|
||||
|
||||
const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
const int64_t i02 = i11;
|
||||
|
||||
for (int ind = tiitg; ind < ne00; ind += tptg.x) {
|
||||
((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
|
||||
((device half *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
|
||||
(( device float *) (( device char *) dst + i11*nb2 + i10*nb1))[ind] =
|
||||
((const device T *) ((const device char *) src0 + i02*nb02 + r*nb01))[ind];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_get_rows_i32(
|
||||
device const void * src0,
|
||||
device const char * src1,
|
||||
device const void * src1,
|
||||
device int32_t * dst,
|
||||
constant int64_t & ne00,
|
||||
constant uint64_t & nb01,
|
||||
@@ -5837,13 +5517,13 @@ kernel void kernel_get_rows_i32(
|
||||
const int64_t i10 = tgpig.x;
|
||||
const int64_t i11 = tgpig.y;
|
||||
|
||||
const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
const int64_t r = ((const device int32_t *) ((const device char *) src1 + i11*nb11 + i10*nb10))[0];
|
||||
|
||||
const int64_t i02 = i11;
|
||||
|
||||
for (int ind = tiitg; ind < ne00; ind += tptg.x) {
|
||||
((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
|
||||
((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
|
||||
(( device int32_t *) (( device char *) dst + i11*nb2 + i10*nb1))[ind] =
|
||||
((const device int32_t *) ((const device char *) src0 + i02*nb02 + r*nb01))[ind];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5860,28 +5540,28 @@ kernel void kernel_get_rows_i32(
|
||||
#define SG_MAT_ROW 8
|
||||
|
||||
// each block_q contains 16*nl weights
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
|
||||
void kernel_mul_mm_impl(device const uchar * src0,
|
||||
device const uchar * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup uchar * shared_memory [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
template<typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
|
||||
kernel void kernel_mul_mm(device const uchar * src0,
|
||||
device const uchar * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup uchar * shared_memory [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
threadgroup half * sa = (threadgroup half *)(shared_memory);
|
||||
threadgroup T * sa = (threadgroup T *)(shared_memory);
|
||||
threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
|
||||
|
||||
const uint r0 = tgpig.y;
|
||||
@@ -5896,7 +5576,7 @@ void kernel_mul_mm_impl(device const uchar * src0,
|
||||
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
|
||||
simdgroup_half8x8 ma[4];
|
||||
simdgroup_T8x8 ma[4];
|
||||
simdgroup_float8x8 mb[2];
|
||||
simdgroup_float8x8 c_res[8];
|
||||
for (int i = 0; i < 8; i++){
|
||||
@@ -5919,7 +5599,7 @@ void kernel_mul_mm_impl(device const uchar * src0,
|
||||
|
||||
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
|
||||
// load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
T4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
@@ -5939,7 +5619,7 @@ void kernel_mul_mm_impl(device const uchar * src0,
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
|
||||
threadgroup T * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
|
||||
threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
|
||||
|
||||
#pragma unroll(4)
|
||||
@@ -6115,48 +5795,6 @@ void kernel_mul_mm_id_impl(
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
|
||||
kernel void kernel_mul_mm(device const uchar * src0,
|
||||
device const uchar * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup uchar * shared_memory [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
kernel_mul_mm_impl<block_q, nl, dequantize_func>(
|
||||
src0,
|
||||
src1,
|
||||
dst,
|
||||
ne00,
|
||||
ne02,
|
||||
nb01,
|
||||
nb02,
|
||||
ne12,
|
||||
nb10,
|
||||
nb11,
|
||||
nb12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_memory,
|
||||
tgpig,
|
||||
tiitg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
|
||||
kernel void kernel_mul_mm_id(
|
||||
device const uchar * src0s,
|
||||
@@ -6237,69 +5875,60 @@ kernel void kernel_mul_mm_id(
|
||||
// get rows
|
||||
//
|
||||
|
||||
typedef void (get_rows_t)(
|
||||
device const void * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
uint3, uint, uint3);
|
||||
typedef decltype(kernel_get_rows_f<float>) get_rows_f_t;
|
||||
|
||||
//template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows<float4x4, 1, dequantize_f32>;
|
||||
//template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows<block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows<block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f<float>;
|
||||
template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f<half>;
|
||||
|
||||
typedef decltype(kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>) get_rows_q_t;
|
||||
|
||||
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_q_t kernel_get_rows_q<block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_q_t kernel_get_rows_q<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_q_t kernel_get_rows_q<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_q_t kernel_get_rows_q<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_q_t kernel_get_rows_q<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_q_t kernel_get_rows_q<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
//
|
||||
|
||||
typedef decltype(kernel_mul_mm<float4x4, 1, dequantize_f32>) mat_mm_t;
|
||||
typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mat_mm_t;
|
||||
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
@@ -6436,7 +6065,7 @@ void mmv_fn(
|
||||
impl_fn(src0,(const device float *)src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,shared_values,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
typedef decltype(mmv_fn<kernel_mul_mv_f32_f32_impl>) mul_mv_impl_fn_t;
|
||||
typedef decltype(mmv_fn<kernel_mul_mv_impl<half, half4, half, half4>>) mul_mv_impl_fn_t;
|
||||
|
||||
template<mul_mv_impl_fn_t impl_fn>
|
||||
kernel void kernel_mul_mv_id(
|
||||
@@ -6514,20 +6143,20 @@ kernel void kernel_mul_mv_id(
|
||||
sgitg);
|
||||
}
|
||||
|
||||
typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_f32_f32_impl>>) kernel_mul_mv_id_t;
|
||||
typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;
|
||||
|
||||
template [[host_name("kernel_mul_mv_id_f32_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_f32_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_f16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_f16_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q8_0_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q5_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q5_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q2_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q2_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q3_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q3_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q4_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q4_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q5_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q5_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q6_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_f32_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
|
||||
template [[host_name("kernel_mul_mv_id_f16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<half, half4, float, float4>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q8_0_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q5_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q5_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
|
||||
template [[host_name("kernel_mul_mv_id_q2_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q2_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q3_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q3_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q4_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q4_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q5_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q5_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q6_K_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq1_s_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq1_m_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq1_m_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_xxs_f32_impl>>;
|
||||
|
||||
+47
-47
@@ -658,7 +658,7 @@ static inline __m128i packNibbles( __m256i bytes ) {
|
||||
#endif //__loongarch_asx
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
||||
static const int qk = QK4_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -696,11 +696,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
|
||||
}
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q4_0_reference(x, y, k);
|
||||
quantize_row_q4_0_ref(x, y, k);
|
||||
}
|
||||
|
||||
|
||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
||||
const int qk = QK4_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -738,10 +738,10 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
|
||||
}
|
||||
|
||||
void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q4_1_reference(x, y, k);
|
||||
quantize_row_q4_1_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
||||
static const int qk = QK5_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -786,10 +786,10 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
|
||||
}
|
||||
|
||||
void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q5_0_reference(x, y, k);
|
||||
quantize_row_q5_0_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
||||
const int qk = QK5_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
@@ -834,11 +834,11 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
|
||||
}
|
||||
|
||||
void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q5_1_reference(x, y, k);
|
||||
quantize_row_q5_1_ref(x, y, k);
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
||||
void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
||||
assert(k % QK8_0 == 0);
|
||||
const int nb = k / QK8_0;
|
||||
|
||||
@@ -1144,12 +1144,12 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
||||
#else
|
||||
GGML_UNUSED(nb);
|
||||
// scalar
|
||||
quantize_row_q8_0_reference(x, y, k);
|
||||
quantize_row_q8_0_ref(x, y, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
||||
void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
||||
assert(QK8_1 == 32);
|
||||
assert(k % QK8_1 == 0);
|
||||
const int nb = k / QK8_1;
|
||||
@@ -1508,7 +1508,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
|
||||
#else
|
||||
GGML_UNUSED(nb);
|
||||
// scalar
|
||||
quantize_row_q8_1_reference(x, y, k);
|
||||
quantize_row_q8_1_ref(x, y, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1899,7 +1899,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
||||
|
||||
//========================- 2-bit (de)-quantization
|
||||
|
||||
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
||||
void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -2002,7 +2002,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
|
||||
}
|
||||
|
||||
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
quantize_row_q2_K_reference(x, vy, k);
|
||||
quantize_row_q2_K_ref(x, vy, k);
|
||||
}
|
||||
|
||||
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
||||
@@ -2226,7 +2226,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
||||
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
@@ -2241,7 +2241,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
//========================= 3-bit (de)-quantization
|
||||
|
||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
||||
void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -2368,7 +2368,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
||||
}
|
||||
|
||||
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
quantize_row_q3_K_reference(x, vy, k);
|
||||
quantize_row_q3_K_ref(x, vy, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
|
||||
@@ -2458,7 +2458,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
||||
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
@@ -2473,7 +2473,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
// ====================== 4-bit (de)-quantization
|
||||
|
||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
||||
void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
@@ -2572,7 +2572,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
|
||||
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q4_K * restrict y = vy;
|
||||
quantize_row_q4_K_reference(x, y, k);
|
||||
quantize_row_q4_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
@@ -2651,7 +2651,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
||||
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
@@ -2666,7 +2666,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
// ====================== 5-bit (de)-quantization
|
||||
|
||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
||||
void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -2783,7 +2783,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
|
||||
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q5_K * restrict y = vy;
|
||||
quantize_row_q5_K_reference(x, y, k);
|
||||
quantize_row_q5_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
@@ -2882,7 +2882,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
||||
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
@@ -2897,7 +2897,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
|
||||
|
||||
// ====================== 6-bit (de)-quantization
|
||||
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
||||
void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -3001,7 +3001,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
|
||||
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_q6_K * restrict y = vy;
|
||||
quantize_row_q6_K_reference(x, y, k);
|
||||
quantize_row_q6_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
||||
@@ -3091,7 +3091,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
||||
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
}
|
||||
else {
|
||||
char * qrow = (char *)dst;
|
||||
@@ -3108,7 +3108,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
||||
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_0_reference(x, y, n_per_row);
|
||||
quantize_row_q4_0_ref(x, y, n_per_row);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3134,7 +3134,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
||||
|
||||
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||
@@ -3151,7 +3151,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
||||
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_1_reference(x, y, n_per_row);
|
||||
quantize_row_q4_1_ref(x, y, n_per_row);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3179,7 +3179,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
||||
|
||||
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||
@@ -3196,7 +3196,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
||||
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_0_reference(x, y, n_per_row);
|
||||
quantize_row_q5_0_ref(x, y, n_per_row);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3233,7 +3233,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
||||
|
||||
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||
@@ -3250,7 +3250,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
||||
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_1_reference(x, y, n_per_row);
|
||||
quantize_row_q5_1_ref(x, y, n_per_row);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3286,7 +3286,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
||||
|
||||
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||
}
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||
@@ -3302,7 +3302,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
|
||||
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
(void)quant_weights; // not used
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
||||
quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
|
||||
quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
@@ -3590,7 +3590,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
||||
|
||||
//===================================== Q8_K ==============================================
|
||||
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
||||
void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
@@ -3641,7 +3641,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int6
|
||||
}
|
||||
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
|
||||
quantize_row_q8_K_reference(x, y, k);
|
||||
quantize_row_q8_K_ref(x, y, k);
|
||||
}
|
||||
|
||||
//===================================== Dot ptoducts =================================
|
||||
@@ -13530,10 +13530,10 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
|
||||
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq3_xxs * restrict y = vy;
|
||||
quantize_row_iq3_xxs_reference(x, y, k);
|
||||
quantize_row_iq3_xxs_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
||||
void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
||||
}
|
||||
@@ -13746,10 +13746,10 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
|
||||
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq3_s * restrict y = vy;
|
||||
quantize_row_iq3_s_reference(x, y, k);
|
||||
quantize_row_iq3_s_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
||||
void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq3_s(x, y, 1, k, NULL);
|
||||
}
|
||||
@@ -14487,7 +14487,7 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
||||
void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
quantize_row_iq4_nl(x, y, k);
|
||||
}
|
||||
@@ -14515,10 +14515,10 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
|
||||
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq4_xs * restrict y = vy;
|
||||
quantize_row_iq4_xs_reference(x, y, k);
|
||||
quantize_row_iq4_xs_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
||||
void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq4_xs(x, y, 1, k, NULL);
|
||||
}
|
||||
@@ -14705,7 +14705,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
|
||||
return nrow * nblock * sizeof(block_iq2_s);
|
||||
}
|
||||
|
||||
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
||||
void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_iq2_s(x, y, 1, k, NULL);
|
||||
}
|
||||
@@ -14713,7 +14713,7 @@ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restri
|
||||
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq2_s * restrict y = vy;
|
||||
quantize_row_iq2_s_reference(x, y, k);
|
||||
quantize_row_iq2_s_ref(x, y, k);
|
||||
}
|
||||
|
||||
static bool validate_float(float f, size_t i) {
|
||||
|
||||
+17
-17
@@ -12,25 +12,25 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
+14
-102
@@ -291,29 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
|
||||
dst[i] = x[i] * x[i];
|
||||
}
|
||||
|
||||
static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int nidx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
||||
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
||||
if (item_ct1.get_group(0) < ne02) { // src0
|
||||
int offset_src =
|
||||
nidx + item_ct1.get_group(1) * ne0 +
|
||||
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx + item_ct1.get_group(1) * ne0 +
|
||||
(item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
||||
const int nb02, const int nb03, const int ne10, const int ne11,
|
||||
const int ne12, const int ne13, const float sf0, const float sf1,
|
||||
@@ -1347,20 +1324,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
||||
});
|
||||
}
|
||||
|
||||
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
||||
const int ne0, int ne1, int ne2, int ne02,
|
||||
queue_ptr stream) {
|
||||
int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
|
||||
sycl::range<3> gridDim(ne2, ne1, num_blocks);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32(x, y, dst, ne0, ne02, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
||||
const int nb02, const int nb03, const int ne10, const int ne11,
|
||||
const int ne12, const int ne13, const float sf0, const float sf1,
|
||||
@@ -2429,28 +2392,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd,
|
||||
float *dst_dd,
|
||||
const queue_ptr &main_stream) {
|
||||
#pragma message("TODO: generalize concat kernel for dim != 2")
|
||||
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563")
|
||||
int dim = dst->op_params[0];
|
||||
GGML_ASSERT(dim == 2);
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
|
||||
}
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst,
|
||||
const float *src0_dd, const float *src1_dd,
|
||||
@@ -3359,12 +3300,6 @@ static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
static void ggml_sycl_concat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_concat);
|
||||
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
||||
}
|
||||
|
||||
static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_SYCL_DEBUG("call %s\n", __func__);
|
||||
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
|
||||
@@ -3768,37 +3703,13 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
|
||||
|
||||
const ggml_tensor_extra_gpu *src0_extra =
|
||||
(const ggml_tensor_extra_gpu *)src0->extra;
|
||||
const ggml_tensor_extra_gpu *src1_extra =
|
||||
(const ggml_tensor_extra_gpu *)src1->extra;
|
||||
const ggml_tensor_extra_gpu *dst_extra =
|
||||
(const ggml_tensor_extra_gpu *)dst->extra;
|
||||
|
||||
ggml_tensor_extra_gpu src0_row_extra;
|
||||
ggml_tensor_extra_gpu src1_row_extra;
|
||||
ggml_tensor_extra_gpu dst_row_extra;
|
||||
|
||||
ggml_tensor src0_row = *src0;
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
src1_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
dst_row.backend = GGML_BACKEND_TYPE_GPU;
|
||||
|
||||
src0_row.extra = &src0_row_extra;
|
||||
src1_row.extra = &src1_row_extra;
|
||||
dst_row.extra = &dst_row_extra;
|
||||
|
||||
char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
|
||||
? (char *)src0->data
|
||||
: (char *)src0_extra->data_device[ctx.device];
|
||||
char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
|
||||
? (char *)src1->data
|
||||
: (char *)src1_extra->data_device[ctx.device];
|
||||
char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
|
||||
? (char *)dst->data
|
||||
: (char *)dst_extra->data_device[ctx.device];
|
||||
char *src0_original = (char *)src0->data;
|
||||
char *src1_original = (char *)src1->data;
|
||||
char *dst_original = (char *)dst->data;
|
||||
|
||||
src0_row.ne[2] = 1;
|
||||
src0_row.ne[3] = 1;
|
||||
@@ -3827,12 +3738,9 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
const int64_t i1 = id;
|
||||
const int64_t i2 = i12;
|
||||
|
||||
src0_row_extra.data_device[ctx.device] =
|
||||
src0_original + i02*nb02;
|
||||
src1_row_extra.data_device[ctx.device] =
|
||||
src1_original + + i11*nb11 + i12*nb12;
|
||||
dst_row_extra.data_device[ctx.device] =
|
||||
dst_original + i1*nb1 + i2*nb2;
|
||||
src0_row.data = src0_original + i02*nb02;
|
||||
src1_row.data = src1_original + + i11*nb11 + i12*nb12;
|
||||
dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
||||
|
||||
ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
||||
}
|
||||
@@ -3841,8 +3749,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
|
||||
ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
|
||||
src1_row_extra.data_device[ctx.device] = src1_contiguous.get();
|
||||
dst_row_extra.data_device[ctx.device] = dst_contiguous.get();
|
||||
src1_row.data = src1_contiguous.get();
|
||||
dst_row.data = dst_contiguous.get();
|
||||
|
||||
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
||||
int64_t num_src1_rows = 0;
|
||||
@@ -3898,7 +3806,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
});
|
||||
}
|
||||
|
||||
src0_row_extra.data_device[ctx.device] = src0_original + i02*nb02;
|
||||
src0_row.data = src0_original + i02*nb02;
|
||||
|
||||
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
||||
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
||||
@@ -4128,7 +4036,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
||||
func = ggml_sycl_group_norm;
|
||||
break;
|
||||
case GGML_OP_CONCAT:
|
||||
func = ggml_sycl_concat;
|
||||
func = ggml_sycl_op_concat;
|
||||
break;
|
||||
case GGML_OP_UPSCALE:
|
||||
func = ggml_sycl_upscale;
|
||||
@@ -5221,6 +5129,10 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
||||
return false;
|
||||
}
|
||||
}
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
if (src0_type == GGML_TYPE_BF16) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#ifndef GGML_SYCL_BACKEND_HPP
|
||||
#define GGML_SYCL_BACKEND_HPP
|
||||
|
||||
#include "concat.hpp"
|
||||
#include "common.hpp"
|
||||
#include "convert.hpp"
|
||||
#include "dequantize.hpp"
|
||||
|
||||
@@ -0,0 +1,195 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include "concat.hpp"
|
||||
#include "common.hpp"
|
||||
|
||||
static void concat_f32_dim0(const float *x, const float *y, float *dst,
|
||||
const int ne0, const int ne00,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int nidx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
||||
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
||||
if (nidx < ne00) { // src0
|
||||
int offset_src = nidx + item_ct1.get_group(1) * ne00 +
|
||||
item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1);
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) +
|
||||
item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1);
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_dim1(const float *x, const float *y, float *dst,
|
||||
const int ne0, const int ne01,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int nidx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
||||
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
||||
if (item_ct1.get_group(1) < ne01) { // src0
|
||||
int offset_src =
|
||||
nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx + (item_ct1.get_group(1) - ne01) * ne0 +
|
||||
item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01);
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_dim2(const float *x, const float *y, float *dst,
|
||||
const int ne0, const int ne02,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int nidx = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
||||
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
||||
if (item_ct1.get_group(0) < ne02) { // src0
|
||||
int offset_src = nidx + item_ct1.get_group(1) * ne0 +
|
||||
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx + item_ct1.get_group(1) * ne0 +
|
||||
(item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
||||
int ne00, int ne01, int ne02, int ne0, int ne1,
|
||||
int ne2, int dim, queue_ptr stream) {
|
||||
int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
|
||||
sycl::range<3> gridDim(ne2, ne1, num_blocks);
|
||||
switch (dim) {
|
||||
case 0:
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
|
||||
});
|
||||
break;
|
||||
case 1:
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
|
||||
});
|
||||
break;
|
||||
default:
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
|
||||
});
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// non-contiguous kernel (slow)
|
||||
static void concat_f32_sycl_non_cont(
|
||||
queue_ptr stream, const char *src0, const char *src1, char *dst,
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
|
||||
uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
|
||||
int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10,
|
||||
uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1,
|
||||
int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
|
||||
uint64_t nb3, int32_t dim) {
|
||||
sycl::range<3> gridDim(ne3, ne2, ne1);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
int64_t i3 = item_ct1.get_group(0);
|
||||
int64_t i2 = item_ct1.get_group(1);
|
||||
int64_t i1 = item_ct1.get_group(2);
|
||||
|
||||
int64_t o[4] = {0, 0, 0, 0};
|
||||
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
||||
|
||||
const float *x;
|
||||
|
||||
for (int i0 = item_ct1.get_local_id(2); i0 < ne0;
|
||||
i0 += item_ct1.get_local_range(2)) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
|
||||
(i0)*nb00);
|
||||
} else {
|
||||
x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
|
||||
(i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
|
||||
}
|
||||
|
||||
float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst) {
|
||||
queue_ptr stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *)dst->op_params)[0];
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
const float *src0_d = (const float *)src0->data;
|
||||
const float *src1_d = (const float *)src1->data;
|
||||
|
||||
float *dst_d = (float *)dst->data;
|
||||
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_sycl(
|
||||
src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
|
||||
src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
} else {
|
||||
const size_t size0 = ggml_nbytes(src0);
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
|
||||
}
|
||||
} else
|
||||
concat_f32_sycl_non_cont(
|
||||
stream, (const char *)src0->data, (const char *)src1->data,
|
||||
(char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
|
||||
src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
|
||||
src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
|
||||
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_CONCAT_HPP
|
||||
#define GGML_SYCL_CONCAT_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||
const ggml_tensor *src1, ggml_tensor *dst);
|
||||
|
||||
#endif // GGML_SYCL_CONCAT_HPP
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6561,7 +6561,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
|
||||
ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
|
||||
}
|
||||
|
||||
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
||||
@@ -6645,7 +6645,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
for (int i3 = 0; i3 < src0->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src0->ne[2]; i2++) {
|
||||
const int idx = i3*src0->ne[2] + i2;
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
|
||||
ggml_vk_buffer_read(buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6658,7 +6658,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
if (offset + src0_size >= buffer_gpu->size) {
|
||||
src0_size = buffer_gpu->size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, offset, src0_clone->data, src0_size);
|
||||
ggml_vk_buffer_read(buffer_gpu, offset, src0_clone->data, src0_size);
|
||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
@@ -6687,7 +6687,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
for (int i3 = 0; i3 < src1->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src1->ne[2]; i2++) {
|
||||
const int idx = i3*src1->ne[2] + i2;
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
|
||||
ggml_vk_buffer_read(buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6700,7 +6700,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
if (offset + src1_size >= buffer_gpu->size) {
|
||||
src1_size = buffer_gpu->size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, offset, src1_clone->data, src1_size);
|
||||
ggml_vk_buffer_read(buffer_gpu, offset, src1_clone->data, src1_size);
|
||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
@@ -6745,7 +6745,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
for (int i3 = 0; i3 < src2->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src2->ne[2]; i2++) {
|
||||
const int idx = i3*src2->ne[2] + i2;
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
|
||||
ggml_vk_buffer_read(buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6758,7 +6758,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
if (offset + src2_size >= buffer_gpu->size) {
|
||||
src2_size = buffer_gpu->size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
|
||||
ggml_vk_buffer_read(buffer_gpu, offset, src2_clone->data, src2_size);
|
||||
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
@@ -6922,7 +6922,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
|
||||
}
|
||||
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
|
||||
ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
|
||||
}
|
||||
|
||||
float first_error_result = -1.0f;
|
||||
|
||||
+57
-58
@@ -592,7 +592,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = false,
|
||||
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||
.vec_dot_type = GGML_TYPE_F16,
|
||||
.nrows = 1,
|
||||
@@ -604,7 +604,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
|
||||
.from_float = quantize_row_q4_0,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
|
||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
@@ -620,7 +620,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
|
||||
.from_float = quantize_row_q4_1,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
|
||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
@@ -636,7 +636,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = false,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_COUNT,
|
||||
.nrows = 1,
|
||||
@@ -648,7 +648,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = false,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_COUNT,
|
||||
.nrows = 1,
|
||||
@@ -660,7 +660,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
|
||||
.from_float = quantize_row_q5_0,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
|
||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
@@ -672,7 +672,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
|
||||
.from_float = quantize_row_q5_1,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
|
||||
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
.nrows = 1,
|
||||
@@ -684,7 +684,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q8_0,
|
||||
.from_float = quantize_row_q8_0,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
|
||||
.from_float_to_mat = quantize_mat_q8_0,
|
||||
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
@@ -692,7 +693,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
#else
|
||||
.nrows = 1,
|
||||
#endif
|
||||
.from_float_to_mat = quantize_mat_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q8_1] = {
|
||||
.type_name = "q8_1",
|
||||
@@ -700,7 +700,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.type_size = sizeof(block_q8_1),
|
||||
.is_quantized = true,
|
||||
.from_float = quantize_row_q8_1,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
.nrows = 1,
|
||||
},
|
||||
@@ -711,7 +711,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
|
||||
.from_float = quantize_row_q2_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -723,7 +723,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
|
||||
.from_float = quantize_row_q3_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -735,7 +735,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
|
||||
.from_float = quantize_row_q4_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -747,7 +747,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
|
||||
.from_float = quantize_row_q5_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -759,7 +759,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
|
||||
.from_float = quantize_row_q6_K,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
|
||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -771,7 +771,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -783,7 +783,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -795,7 +795,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
|
||||
.from_float = quantize_row_iq3_xxs,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
|
||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -807,7 +807,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq3_s,
|
||||
.from_float = quantize_row_iq3_s,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
|
||||
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -819,7 +819,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
|
||||
.from_float = quantize_row_iq2_s,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
|
||||
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -831,7 +831,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -843,7 +843,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -855,7 +855,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
|
||||
.from_float = quantize_row_iq4_nl,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
|
||||
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
@@ -867,7 +867,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
||||
.from_float = quantize_row_iq4_xs,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
|
||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@@ -886,7 +886,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.is_quantized = false,
|
||||
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
||||
.vec_dot_type = GGML_TYPE_BF16,
|
||||
.nrows = 1,
|
||||
@@ -894,48 +894,48 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_Q4_0_4_4] = {
|
||||
.type_name = "q4_0_4x4",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 4,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.interleave_blcksize = 4,
|
||||
.gemv = ggml_gemv_q4_0_4x4_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_8] = {
|
||||
.type_name = "q4_0_4x8",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 8,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.interleave_blcksize = 8,
|
||||
.gemv = ggml_gemv_q4_0_4x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_8_8] = {
|
||||
.type_name = "q4_0_8x8",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 8,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 8,
|
||||
.interleave_blcksize = 8,
|
||||
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
||||
}
|
||||
@@ -3115,7 +3115,7 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
||||
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
||||
}
|
||||
|
||||
GGML_CALL int ggml_blck_size(enum ggml_type type) {
|
||||
GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
|
||||
return type_traits[type].blck_size;
|
||||
}
|
||||
|
||||
@@ -12192,15 +12192,14 @@ static void ggml_compute_forward_mul_mat(
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
|
||||
ggml_from_float_to_mat_t const from_float_to_mat
|
||||
= type_traits[vec_dot_type].from_float_to_mat;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
ggml_gemm_t const gemm = type_traits[type].gemm;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
|
||||
ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
|
||||
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
int64_t const blck_size_interleave = type_traits[type].blck_size_interleave;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
ggml_gemm_t const gemm = type_traits[type].gemm;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
@@ -12264,14 +12263,14 @@ UseGgmlGemm1:;
|
||||
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
||||
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
4, ne10, interleave_blcksize);
|
||||
4, ne10, blck_size_interleave);
|
||||
}
|
||||
i11_processed = ne11 - ne11 % 4;
|
||||
}
|
||||
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
||||
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
ne10);
|
||||
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -12355,7 +12354,7 @@ UseGgmlGemm2:;
|
||||
int64_t src0_start = (ith * ne01) / nth;
|
||||
int64_t src0_end = ((ith + 1) * ne01) / nth;
|
||||
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
|
||||
src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
|
||||
src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
|
||||
if (src0_start >= src0_end) return;
|
||||
|
||||
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
|
||||
@@ -12413,11 +12412,11 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||
@@ -12458,9 +12457,9 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
|
||||
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
ne10);
|
||||
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -19479,7 +19478,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
|
||||
fprintf(fp, "digraph G {\n");
|
||||
fprintf(fp, " newrank = true;\n");
|
||||
fprintf(fp, " rankdir = LR;\n");
|
||||
fprintf(fp, " rankdir = TB;\n");
|
||||
|
||||
for (int i = 0; i < gb->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gb->nodes[i];
|
||||
@@ -19541,7 +19540,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||
}
|
||||
|
||||
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
||||
if (ggml_nelements(node) < 5) {
|
||||
if (ggml_nelements(node) < 5 && node->data != NULL) {
|
||||
fprintf(fp, " | (");
|
||||
for (int j = 0; j < ggml_nelements(node); j++) {
|
||||
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
||||
@@ -21063,8 +21062,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
(int64_t) info->ne[3];
|
||||
|
||||
if (ne % ggml_blck_size(info->type) != 0) {
|
||||
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
|
||||
__func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
|
||||
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
|
||||
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
|
||||
set(TARGET vulkan-shaders-gen)
|
||||
add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -0,0 +1,524 @@
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <future>
|
||||
#include <queue>
|
||||
#include <condition_variable>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#include <direct.h> // For _mkdir on Windows
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#include <sys/wait.h>
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
|
||||
#define ASYNCIO_CONCURRENCY 64
|
||||
|
||||
std::mutex lock;
|
||||
std::vector<std::pair<std::string, std::string>> shader_fnames;
|
||||
|
||||
std::string GLSLC = "glslc";
|
||||
std::string input_dir = "vulkan-shaders";
|
||||
std::string output_dir = "/tmp";
|
||||
std::string target_hpp = "ggml-vulkan-shaders.hpp";
|
||||
std::string target_cpp = "ggml-vulkan-shaders.cpp";
|
||||
bool no_clean = false;
|
||||
|
||||
const std::vector<std::string> type_names = {
|
||||
"f32",
|
||||
"f16",
|
||||
"q4_0",
|
||||
"q4_1",
|
||||
"q5_0",
|
||||
"q5_1",
|
||||
"q8_0",
|
||||
"q2_k",
|
||||
"q3_k",
|
||||
"q4_k",
|
||||
"q5_k",
|
||||
"q6_k"
|
||||
};
|
||||
|
||||
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
|
||||
#ifdef _WIN32
|
||||
HANDLE stdout_read, stdout_write;
|
||||
HANDLE stderr_read, stderr_write;
|
||||
SECURITY_ATTRIBUTES sa = { sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
|
||||
|
||||
if (!CreatePipe(&stdout_read, &stdout_write, &sa, 0) ||
|
||||
!SetHandleInformation(stdout_read, HANDLE_FLAG_INHERIT, 0)) {
|
||||
throw std::runtime_error("Failed to create stdout pipe");
|
||||
}
|
||||
|
||||
if (!CreatePipe(&stderr_read, &stderr_write, &sa, 0) ||
|
||||
!SetHandleInformation(stderr_read, HANDLE_FLAG_INHERIT, 0)) {
|
||||
throw std::runtime_error("Failed to create stderr pipe");
|
||||
}
|
||||
|
||||
PROCESS_INFORMATION pi;
|
||||
STARTUPINFOA si = { sizeof(STARTUPINFOA) };
|
||||
si.dwFlags = STARTF_USESTDHANDLES;
|
||||
si.hStdOutput = stdout_write;
|
||||
si.hStdError = stderr_write;
|
||||
|
||||
std::vector<char> cmd(command.begin(), command.end());
|
||||
cmd.push_back('\0');
|
||||
|
||||
if (!CreateProcessA(NULL, cmd.data(), NULL, NULL, TRUE, 0, NULL, NULL, &si, &pi)) {
|
||||
throw std::runtime_error("Failed to create process");
|
||||
}
|
||||
|
||||
CloseHandle(stdout_write);
|
||||
CloseHandle(stderr_write);
|
||||
|
||||
std::array<char, 128> buffer;
|
||||
DWORD bytes_read;
|
||||
|
||||
while (ReadFile(stdout_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
||||
stdout_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
while (ReadFile(stderr_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
||||
stderr_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
CloseHandle(stdout_read);
|
||||
CloseHandle(stderr_read);
|
||||
WaitForSingleObject(pi.hProcess, INFINITE);
|
||||
CloseHandle(pi.hProcess);
|
||||
CloseHandle(pi.hThread);
|
||||
#else
|
||||
int stdout_pipe[2];
|
||||
int stderr_pipe[2];
|
||||
|
||||
if (pipe(stdout_pipe) != 0 || pipe(stderr_pipe) != 0) {
|
||||
throw std::runtime_error("Failed to create pipes");
|
||||
}
|
||||
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) {
|
||||
throw std::runtime_error("Failed to fork process");
|
||||
}
|
||||
|
||||
if (pid == 0) {
|
||||
close(stdout_pipe[0]);
|
||||
close(stderr_pipe[0]);
|
||||
dup2(stdout_pipe[1], STDOUT_FILENO);
|
||||
dup2(stderr_pipe[1], STDERR_FILENO);
|
||||
close(stdout_pipe[1]);
|
||||
close(stderr_pipe[1]);
|
||||
execl("/bin/sh", "sh", "-c", command.c_str(), (char*) nullptr);
|
||||
_exit(EXIT_FAILURE);
|
||||
} else {
|
||||
close(stdout_pipe[1]);
|
||||
close(stderr_pipe[1]);
|
||||
|
||||
std::array<char, 128> buffer;
|
||||
ssize_t bytes_read;
|
||||
|
||||
while ((bytes_read = read(stdout_pipe[0], buffer.data(), buffer.size())) > 0) {
|
||||
stdout_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
while ((bytes_read = read(stderr_pipe[0], buffer.data(), buffer.size())) > 0) {
|
||||
stderr_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
close(stdout_pipe[0]);
|
||||
close(stderr_pipe[0]);
|
||||
waitpid(pid, nullptr, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
bool directory_exists(const std::string& path) {
|
||||
struct stat info;
|
||||
if (stat(path.c_str(), &info) != 0) {
|
||||
return false; // Path doesn't exist or can't be accessed
|
||||
}
|
||||
return (info.st_mode & S_IFDIR) != 0; // Check if it is a directory
|
||||
}
|
||||
|
||||
bool create_directory(const std::string& path) {
|
||||
#ifdef _WIN32
|
||||
return _mkdir(path.c_str()) == 0 || errno == EEXIST; // EEXIST means the directory already exists
|
||||
#else
|
||||
return mkdir(path.c_str(), 0755) == 0 || errno == EEXIST; // 0755 is the directory permissions
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string to_uppercase(const std::string& input) {
|
||||
std::string result = input;
|
||||
for (char& c : result) {
|
||||
c = std::toupper(c);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
bool string_ends_with(const std::string& str, const std::string& suffix) {
|
||||
if (suffix.size() > str.size()) {
|
||||
return false;
|
||||
}
|
||||
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
static const char path_separator = '\\';
|
||||
#else
|
||||
static const char path_separator = '/';
|
||||
#endif
|
||||
|
||||
std::string join_paths(const std::string& path1, const std::string& path2) {
|
||||
return path1 + path_separator + path2;
|
||||
}
|
||||
|
||||
std::string basename(const std::string &path) {
|
||||
return path.substr(path.find_last_of("/\\") + 1);
|
||||
}
|
||||
|
||||
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
|
||||
std::string name = _name + (fp16 ? "" : "_fp32");
|
||||
std::string out_fname = join_paths(output_dir, name + ".spv");
|
||||
std::string in_path = join_paths(input_dir, in_fname);
|
||||
|
||||
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
|
||||
for (const auto& define : defines) {
|
||||
cmd.push_back("-D" + define.first + "=" + define.second);
|
||||
}
|
||||
|
||||
std::string command;
|
||||
for (const auto& part : cmd) {
|
||||
command += part + " ";
|
||||
}
|
||||
|
||||
std::string stdout_str, stderr_str;
|
||||
try {
|
||||
// std::cout << "Executing command: ";
|
||||
// for (const auto& part : cmd) {
|
||||
// std::cout << part << " ";
|
||||
// }
|
||||
// std::cout << std::endl;
|
||||
|
||||
execute_command(command, stdout_str, stderr_str);
|
||||
if (!stderr_str.empty()) {
|
||||
std::cerr << "cannot compile " << name << "\n\n" << command << "\n\n" << stderr_str << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> guard(lock);
|
||||
shader_fnames.push_back(std::make_pair(name, out_fname));
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
|
||||
std::map<std::string, std::string> result = a;
|
||||
result.insert(b.begin(), b.end());
|
||||
return result;
|
||||
}
|
||||
|
||||
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id) {
|
||||
std::string load_vec = fp16 ? "8" : "4";
|
||||
std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4";
|
||||
std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4";
|
||||
|
||||
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", fp16 ? "float16_t" : "float"}};
|
||||
std::string shader_name = "matmul";
|
||||
|
||||
if (matmul_id) {
|
||||
base_dict["MUL_MAT_ID"] = "1";
|
||||
shader_name = "matmul_id";
|
||||
}
|
||||
|
||||
if (fp16) {
|
||||
base_dict["FLOAT16"] = "1";
|
||||
}
|
||||
|
||||
// Shaders with f16 B_TYPE
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
|
||||
for (const auto& tname : type_names) {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||
std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
|
||||
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
|
||||
|
||||
for (const auto& fp16 : {false, true}) {
|
||||
matmul_shaders(tasks, fp16, false);
|
||||
matmul_shaders(tasks, fp16, true);
|
||||
}
|
||||
|
||||
for (const auto& tname : type_names) {
|
||||
// mul mat vec
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
|
||||
// Dequant shaders
|
||||
if (tname != "f16") {
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
|
||||
}));
|
||||
}
|
||||
|
||||
if (!string_ends_with(tname, "_k")) {
|
||||
shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
|
||||
|
||||
if (tname == "f16") {
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
}));
|
||||
} else {
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
|
||||
}));
|
||||
}
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
// Norms
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
}));
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
}
|
||||
|
||||
void write_output_files() {
|
||||
FILE* hdr = fopen(target_hpp.c_str(), "w");
|
||||
FILE* src = fopen(target_cpp.c_str(), "w");
|
||||
|
||||
fprintf(hdr, "#include <cstdint>\n\n");
|
||||
fprintf(src, "#include \"%s\"\n\n", basename(target_hpp).c_str());
|
||||
|
||||
for (const auto& pair : shader_fnames) {
|
||||
const std::string& name = pair.first;
|
||||
const std::string& path = pair.second;
|
||||
FILE* spv = fopen(path.c_str(), "rb");
|
||||
if (!spv) {
|
||||
std::cerr << "Error opening SPIR-V file: " << path << "\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
fseek(spv, 0, SEEK_END);
|
||||
size_t size = ftell(spv);
|
||||
fseek(spv, 0, SEEK_SET);
|
||||
|
||||
std::vector<unsigned char> data(size);
|
||||
size_t read_size = fread(data.data(), 1, size, spv);
|
||||
fclose(spv);
|
||||
if (read_size != size) {
|
||||
std::cerr << "Error reading SPIR-V file: " << path << "\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
fprintf(hdr, "extern unsigned char %s_data[%zu];\n", name.c_str(), size);
|
||||
fprintf(hdr, "const uint64_t %s_len = %zu;\n\n", name.c_str(), size);
|
||||
|
||||
fprintf(src, "unsigned char %s_data[%zu] = {\n", name.c_str(), size);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
fprintf(src, "0x%02x,", data[i]);
|
||||
if ((i + 1) % 12 == 0) fprintf(src, "\n");
|
||||
}
|
||||
fprintf(src, "\n};\n\n");
|
||||
|
||||
if (!no_clean) {
|
||||
std::remove(path.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
fclose(hdr);
|
||||
fclose(src);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
std::map<std::string, std::string> args;
|
||||
for (int i = 1; i < argc; i += 2) {
|
||||
if (i + 1 < argc) {
|
||||
args[argv[i]] = argv[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
if (args.find("--glslc") != args.end()) {
|
||||
GLSLC = args["--glslc"]; // Path to glslc
|
||||
}
|
||||
if (args.find("--input-dir") != args.end()) {
|
||||
input_dir = args["--input-dir"]; // Directory containing shader sources
|
||||
}
|
||||
if (args.find("--output-dir") != args.end()) {
|
||||
output_dir = args["--output-dir"]; // Directory for containing SPIR-V output
|
||||
}
|
||||
if (args.find("--target-hpp") != args.end()) {
|
||||
target_hpp = args["--target-hpp"]; // Path to generated header file
|
||||
}
|
||||
if (args.find("--target-cpp") != args.end()) {
|
||||
target_cpp = args["--target-cpp"]; // Path to generated cpp file
|
||||
}
|
||||
if (args.find("--no-clean") != args.end()) {
|
||||
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
|
||||
}
|
||||
|
||||
if (!directory_exists(input_dir)) {
|
||||
std::cerr << "\"" << input_dir << "\" must be a valid directory containing shader sources" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
if (!directory_exists(output_dir)) {
|
||||
if (!create_directory(output_dir)) {
|
||||
std::cerr << "Error creating output directory: " << output_dir << "\n";
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::future<void>> tasks;
|
||||
process_shaders(tasks);
|
||||
|
||||
for (auto& task : tasks) {
|
||||
task.get();
|
||||
}
|
||||
|
||||
write_output_files();
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
@@ -19,6 +19,7 @@ GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h
|
||||
|
||||
class Keys:
|
||||
class General:
|
||||
TYPE = "general.type"
|
||||
ARCHITECTURE = "general.architecture"
|
||||
QUANTIZATION_VERSION = "general.quantization_version"
|
||||
ALIGNMENT = "general.alignment"
|
||||
@@ -120,11 +121,20 @@ class Keys:
|
||||
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
|
||||
EOT_ID = "tokenizer.ggml.eot_token_id"
|
||||
|
||||
class Adapter:
|
||||
TYPE = "adapter.type"
|
||||
LORA_ALPHA = "adapter.lora.alpha"
|
||||
|
||||
#
|
||||
# recommended mapping of model tensor names for storage in gguf
|
||||
#
|
||||
|
||||
|
||||
class GGUFType:
|
||||
MODEL = "model"
|
||||
ADAPTER = "adapter"
|
||||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA = auto()
|
||||
FALCON = auto()
|
||||
|
||||
@@ -424,6 +424,9 @@ class GGUFWriter:
|
||||
fout.close()
|
||||
self.fout = None
|
||||
|
||||
def add_type(self, type_name: str) -> None:
|
||||
self.add_string(Keys.General.TYPE, type_name)
|
||||
|
||||
def add_architecture(self) -> None:
|
||||
self.add_string(Keys.General.ARCHITECTURE, self.arch)
|
||||
|
||||
|
||||
+18
-45
@@ -3,7 +3,6 @@ from abc import ABC, ABCMeta, abstractmethod
|
||||
|
||||
import logging
|
||||
from typing import Any, Callable
|
||||
from collections import deque
|
||||
|
||||
import numpy as np
|
||||
from numpy.typing import DTypeLike
|
||||
@@ -74,20 +73,18 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
_tensor_type: type
|
||||
_meta: Any
|
||||
_data: Any | None
|
||||
_lazy: deque[LazyBase] # shared within a graph, to avoid deep recursion when making eager
|
||||
_args: tuple
|
||||
_func: Callable[[tuple], Any] | None
|
||||
_kwargs: dict[str, Any]
|
||||
_func: Callable[[Any], Any] | None
|
||||
|
||||
def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None):
|
||||
def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None):
|
||||
super().__init__()
|
||||
self._meta = meta
|
||||
self._data = data
|
||||
self._lazy = lazy if lazy is not None else deque()
|
||||
self._args = args
|
||||
self._kwargs = kwargs if kwargs is not None else {}
|
||||
self._func = func
|
||||
assert self._func is not None or self._data is not None
|
||||
if self._data is None:
|
||||
self._lazy.append(self)
|
||||
|
||||
def __init_subclass__(cls) -> None:
|
||||
if "_tensor_type" not in cls.__dict__:
|
||||
@@ -117,6 +114,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
args = ((use_self,) if use_self is not None else ()) + args
|
||||
|
||||
meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
|
||||
# TODO: maybe handle tensors in kwargs too
|
||||
|
||||
if isinstance(meta_noop, bool) and not meta_noop:
|
||||
try:
|
||||
@@ -140,23 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
|
||||
|
||||
if isinstance(res, cls._tensor_type):
|
||||
class CollectSharedLazy:
|
||||
# emulating a static variable
|
||||
shared_lazy: None | deque[LazyBase] = None
|
||||
|
||||
@staticmethod
|
||||
def collect_replace(t: LazyBase):
|
||||
if CollectSharedLazy.shared_lazy is None:
|
||||
CollectSharedLazy.shared_lazy = t._lazy
|
||||
else:
|
||||
CollectSharedLazy.shared_lazy.extend(t._lazy)
|
||||
t._lazy = CollectSharedLazy.shared_lazy
|
||||
|
||||
LazyBase._recurse_apply(args, CollectSharedLazy.collect_replace)
|
||||
|
||||
shared_lazy = CollectSharedLazy.shared_lazy
|
||||
|
||||
return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
|
||||
return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
|
||||
else:
|
||||
del res # not needed
|
||||
# non-tensor return likely relies on the contents of the args
|
||||
@@ -168,26 +150,18 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
@classmethod
|
||||
def to_eager(cls, t: Any) -> Any:
|
||||
def simple_to_eager(_t: LazyBase) -> Any:
|
||||
def already_eager_to_eager(_t: LazyBase) -> Any:
|
||||
assert _t._data is not None
|
||||
if _t._data is not None:
|
||||
return _t._data
|
||||
|
||||
while _t._data is None:
|
||||
lt = _t._lazy.popleft()
|
||||
if lt._data is not None:
|
||||
# Lazy tensor did not belong in the lazy queue.
|
||||
# Weirdly only happens with Bloom models...
|
||||
# likely because tensors aren't unique in the queue.
|
||||
# The final output is still the same as in eager mode,
|
||||
# so it's safe to ignore this.
|
||||
continue
|
||||
assert lt._func is not None
|
||||
lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
|
||||
lt._data = lt._func(lt._args)
|
||||
# sanity check
|
||||
assert lt._data is not None
|
||||
assert lt._data.dtype == lt._meta.dtype
|
||||
assert lt._data.shape == lt._meta.shape
|
||||
# NOTE: there's a recursion limit in Python (usually 1000)
|
||||
|
||||
assert _t._func is not None
|
||||
_t._args = cls._recurse_apply(_t._args, simple_to_eager)
|
||||
_t._data = _t._func(*_t._args, **_t._kwargs)
|
||||
# sanity check
|
||||
assert _t._data is not None
|
||||
assert _t._data.dtype == _t._meta.dtype
|
||||
assert _t._data.shape == _t._meta.shape
|
||||
|
||||
return _t._data
|
||||
|
||||
@@ -206,7 +180,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
@classmethod
|
||||
def from_eager(cls, t: Any) -> Any:
|
||||
if type(t) is cls:
|
||||
# already eager
|
||||
# already lazy
|
||||
return t
|
||||
elif isinstance(t, cls._tensor_type):
|
||||
return cls(meta=cls.eager_to_meta(t), data=t)
|
||||
@@ -228,8 +202,7 @@ class LazyNumpyTensor(LazyBase):
|
||||
def astype(self, dtype, *args, **kwargs):
|
||||
meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
|
||||
full_args = (self, dtype,) + args
|
||||
# very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
|
||||
return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
|
||||
return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)))
|
||||
|
||||
def tofile(self, *args, **kwargs):
|
||||
eager = LazyNumpyTensor.to_eager(self)
|
||||
|
||||
@@ -43,7 +43,7 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.
|
||||
osize *= dim
|
||||
out = np.empty(shape=osize, dtype=otype)
|
||||
# compute over groups of 16 rows (arbitrary, but seems good for performance)
|
||||
n_groups = rows.shape[0] // 16
|
||||
n_groups = (rows.shape[0] // 16) or 1
|
||||
np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
|
||||
return out.reshape(oshape)
|
||||
|
||||
|
||||
@@ -602,14 +602,12 @@ class TensorNameMap:
|
||||
for tensor, keys in self.block_mappings_cfg.items():
|
||||
if tensor not in MODEL_TENSORS[arch]:
|
||||
continue
|
||||
# TODO: make this configurable
|
||||
n_experts = 160
|
||||
for xid in range(n_experts):
|
||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid, xid = xid)
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
|
||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid)
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||
result = self.mapping.get(key)
|
||||
|
||||
@@ -27,8 +27,9 @@ UUID_NAMESPACE_LLAMA_CPP = uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5')
|
||||
|
||||
# For more information about what field.parts and field.data represent,
|
||||
# please see the comments in the modify_gguf.py example.
|
||||
def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar) -> None:
|
||||
def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar: bool, no_layer: bool) -> None:
|
||||
sha1 = hashlib.sha1()
|
||||
sha256 = hashlib.sha256()
|
||||
uuidv5_sha1 = hashlib.sha1()
|
||||
uuidv5_sha1.update(UUID_NAMESPACE_LLAMA_CPP.bytes)
|
||||
|
||||
@@ -50,7 +51,7 @@ def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar) -> None:
|
||||
bar = tqdm(desc="Hashing", total=total_weights, unit="weights", unit_scale=True, disable=disable_progress_bar)
|
||||
|
||||
# Hashing Process
|
||||
for n, tensor in enumerate(reader.tensors, 1):
|
||||
for tensor in reader.tensors:
|
||||
|
||||
# We don't need these
|
||||
if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
|
||||
@@ -62,29 +63,39 @@ def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar) -> None:
|
||||
sum_weights_in_tensor *= dim
|
||||
bar.update(sum_weights_in_tensor)
|
||||
|
||||
sha1_layer = hashlib.sha1()
|
||||
sha1_layer.update(tensor.data.data)
|
||||
if not no_layer:
|
||||
|
||||
sha1_layer = hashlib.sha1()
|
||||
sha1_layer.update(tensor.data.data)
|
||||
print("sha1 {0} {1}:{2}".format(sha1_layer.hexdigest(), filename, tensor.name)) # noqa: NP100
|
||||
|
||||
sha256_layer = hashlib.sha256()
|
||||
sha256_layer.update(tensor.data.data)
|
||||
print("sha256 {0} {1}:{2}".format(sha256_layer.hexdigest(), filename, tensor.name)) # noqa: NP100
|
||||
|
||||
sha1.update(tensor.data.data)
|
||||
sha256.update(tensor.data.data)
|
||||
uuidv5_sha1.update(tensor.data.data)
|
||||
print("sha1 {0} {1}:{2}".format(sha1_layer.hexdigest(), filename, tensor.name)) # noqa: NP100
|
||||
|
||||
# Flush Hash Progress Bar
|
||||
bar.close()
|
||||
|
||||
# Display Hash Output
|
||||
print("sha1 {0} {1}".format(sha1.hexdigest(), filename)) # noqa: NP100
|
||||
print("UUIDv5 {0} {1}".format(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5), filename)) # noqa: NP100
|
||||
print("sha1 {0} {1}".format(sha1.hexdigest(), filename)) # noqa: NP100
|
||||
print("sha256 {0} {1}".format(sha256.hexdigest(), filename)) # noqa: NP100
|
||||
print("uuid {0} {1}".format(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5), filename)) # noqa: NP100
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
|
||||
parser.add_argument("model", type=str, help="GGUF format model filename")
|
||||
parser.add_argument("--no-layer", action="store_true", help="exclude per layer hash")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
parser.add_argument("--progressbar", action="store_true", help="enable progressbar")
|
||||
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
reader = GGUFReader(args.model, 'r')
|
||||
gguf_hash(reader, args.model, not args.progressbar)
|
||||
gguf_hash(reader, args.model, not args.progressbar, args.no_layer)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
+26
-13
@@ -133,7 +133,7 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
@@ -411,6 +411,9 @@ extern "C" {
|
||||
const char * content;
|
||||
} llama_chat_message;
|
||||
|
||||
// lora adapter
|
||||
struct llama_lora_adapter;
|
||||
|
||||
// Helpers for getting default parameters
|
||||
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||
@@ -510,18 +513,28 @@ extern "C" {
|
||||
const char * fname_out,
|
||||
const llama_model_quantize_params * params);
|
||||
|
||||
// Apply a LoRA adapter to a loaded model
|
||||
// path_base_model is the path to a higher quality model to use as a base for
|
||||
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
// will be applied on top of the previous one
|
||||
// Returns 0 on success
|
||||
LLAMA_API int32_t llama_model_apply_lora_from_file(
|
||||
const struct llama_model * model,
|
||||
const char * path_lora,
|
||||
float scale,
|
||||
const char * path_base_model,
|
||||
int32_t n_threads);
|
||||
// Load a LoRA adapter from file
|
||||
// The loaded adapter will be associated to the given model, and will be free when the model is deleted
|
||||
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
|
||||
struct llama_model * model,
|
||||
const char * path_lora);
|
||||
|
||||
// Add a loaded LoRA adapter to given context
|
||||
// This will not modify model's weight
|
||||
LLAMA_API int32_t llama_lora_adapter_set(
|
||||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter,
|
||||
float scale);
|
||||
|
||||
// Remove a LoRA adapter from given context
|
||||
// Return -1 if the adapter is not present in the context
|
||||
LLAMA_API int32_t llama_lora_adapter_remove(
|
||||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
|
||||
|
||||
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
||||
// the currently loaded vector.
|
||||
|
||||
@@ -9,3 +9,4 @@
|
||||
-r ./requirements/requirements-convert_hf_to_gguf.txt
|
||||
-r ./requirements/requirements-convert_hf_to_gguf_update.txt
|
||||
-r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
|
||||
-r ./requirements/requirements-convert_lora_to_gguf.txt
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
-r ./requirements-convert_hf_to_gguf.txt
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
@@ -1,2 +1,3 @@
|
||||
docstring_parser~=0.15
|
||||
pydantic~=2.6.3
|
||||
requests
|
||||
|
||||
+593
-559
File diff suppressed because it is too large
Load Diff
@@ -14,7 +14,7 @@
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||
|
||||
// ggml.c::quantize_row_q4_0_reference
|
||||
// ggml.c::quantize_row_q4_0_ref
|
||||
inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
|
||||
|
||||
// ggml.c::ggml_silu_f32
|
||||
@@ -24,7 +24,7 @@ inline static float silu_orig(float x) {
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
// ggml.c::quantize_row_q4_0_reference
|
||||
// ggml.c::quantize_row_q4_0_ref
|
||||
inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
|
||||
|
||||
// ggml.c::ggml_silu_f32
|
||||
|
||||
@@ -60,7 +60,7 @@ static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test
|
||||
qfns.from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
|
||||
qfns.from_float_reference(test_data, tmp_q.data(), test_size);
|
||||
qfns.from_float_ref(test_data, tmp_q.data(), test_size);
|
||||
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
||||
|
||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||
|
||||
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns.from_float_reference(test_data1, test_q1, size);
|
||||
qfns.from_float_ref(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
||||
@@ -195,7 +195,7 @@ int main(int argc, char **argv) {
|
||||
const bool add_special = false;
|
||||
|
||||
for (const auto & test_kv : k_tests) {
|
||||
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, true);
|
||||
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
|
||||
|
||||
printf("\n");
|
||||
printf("src: '%s'\n", test_kv.first.c_str());
|
||||
@@ -253,7 +253,7 @@ int main(int argc, char **argv) {
|
||||
{
|
||||
const auto t_start = ggml_time_us();
|
||||
|
||||
res = llama_tokenize(ctx, text, add_special, true);
|
||||
res = llama_tokenize(ctx, text, add_special, false);
|
||||
|
||||
const auto t_end = ggml_time_us();
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ from typing import Any, Iterator, cast
|
||||
from typing_extensions import Buffer
|
||||
|
||||
import cffi
|
||||
from transformers import AutoTokenizer
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizer
|
||||
|
||||
|
||||
logger = logging.getLogger("test-tokenizer-random")
|
||||
@@ -129,7 +129,7 @@ class Tokenizer:
|
||||
class TokenizerGroundtruth (Tokenizer):
|
||||
|
||||
def __init__(self, dir_tokenizer: str):
|
||||
self.model = AutoTokenizer.from_pretrained(dir_tokenizer)
|
||||
self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
|
||||
# guess BOS and EOS
|
||||
ids = self.encode("a")
|
||||
assert 1 <= len(ids) <= 3
|
||||
@@ -143,7 +143,7 @@ class TokenizerGroundtruth (Tokenizer):
|
||||
self.vocab = list(sorted(self.vocab))
|
||||
# tokens and lists
|
||||
self.special_tokens = list(self.model.all_special_tokens)
|
||||
self.added_tokens = list(self.model.added_tokens_encoder)
|
||||
self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
|
||||
self.bos_token = self.model.bos_token
|
||||
self.eos_token = self.model.eos_token
|
||||
|
||||
@@ -232,6 +232,7 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
|
||||
'a\na', # bert fail
|
||||
'"`', # falcon
|
||||
' \u2e4e', # falcon
|
||||
'\n\x0b ', # falcon
|
||||
'a\xa0\xa0\x00b', # jina-v2-es
|
||||
'one <mask>', # jina-v2-es <mask> lstrip=true
|
||||
'a </s> b', # rstrip phi-3
|
||||
|
||||
Reference in New Issue
Block a user