Compare commits

..

24 Commits

Author SHA1 Message Date
Christian Kastner 21fcc21ad5 cmake: Factor out CPU architecture detection (#13883)
* cmake: Define function for querying architecture

The tests and results match exactly those of ggml/src/CMakeLists.txt

* Switch arch detection over to new function
2025-05-29 12:50:25 +02:00
Vineel Abhinav dd8ba93416 ggml: aarch64: Implement SVE F32 kernels for Mamba Sequential Scan Algorithm (#13882)
* F32-Mamba-Seq_Scan-SVE

* Fix formatting

* ggml : missing space

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-05-29 12:18:43 +03:00
Georgi Gerganov 66c92061f5 tests : remove json.hpp from a test (#13880)
ggml-ci
2025-05-29 12:17:16 +03:00
Sigbjørn Skjæret 5ca82fc1d7 convert : workaround for AutoConfig dummy labels (#13881) 2025-05-29 10:00:57 +02:00
Sigbjørn Skjæret 6385b843a8 llama : add RobertaForSequenceClassification reranker support (#13875) 2025-05-29 08:15:01 +02:00
Vineel Abhinav 1b8fb8152d ggml: aarch64: Implement SVE F32 kernels for vector functions (#13843)
* F32-Mamba-SVE

* F32-Mamba-SVE

* Resolve test errors-1

* Resolve test errors-2

* F32-vec-SVE

* F32-vec-SVE

* F32-vec-SVE
2025-05-29 09:01:33 +03:00
Beinsezii 53ae30640e gguf-py : fix SafetensorRemote return on undefined size (< 0) (#13841) 2025-05-28 23:50:20 +02:00
Xuan-Son Nguyen 763d06edb7 llama : fix KV shift for qwen2vl (#13870)
* llama : fix KV shift for qwen2vl

* add ref to the PR
2025-05-28 22:35:31 +02:00
Xuan-Son Nguyen 10961339b2 mtmd : move helpers to dedicated library (⚠️ breaking change) (#13866)
* mtmd : move helpers to dedicated library

* fix server build

* rm leftover cmakelist code
2025-05-28 22:35:22 +02:00
bandoti d98f2a35fc ci: disable LLAMA_CURL for Linux cross-builds (#13871) 2025-05-28 15:46:47 -03:00
Đinh Trọng Huy e0e3aa231d llama : add support for BertForSequenceClassification reranker (#13858)
* convert: add support for BertForSequenceClassification

* add support for reranking using BertForSequenceClassification

* merge checks of eos and sep

* fix lint

---------

Co-authored-by: dinhhuy <huy.dinh@brains-tech.co.jp>
2025-05-28 19:01:58 +02:00
Đinh Trọng Huy aa6dff05be convert: small addition to support LlamaModel (#13838)
Co-authored-by: dinhhuy <huy.dinh@brains-tech.co.jp>
2025-05-28 16:34:18 +02:00
Sky c962ae3382 server: fix remove 'image_url'/'input_audio' json-object effectively for 'llama_params' in multimodal-model-mode (#13853)
[fix]: remove 'image_url'/'input_audio' effectively for 'llama_params' in multimodal-model-mode
2025-05-28 16:33:54 +02:00
Xuan-Son Nguyen a3938fb53d convert : fix qwen omni conversion (#13859)
* convert : fix qwen omni conversion

* fix typo
2025-05-28 16:12:35 +02:00
Alex Fanthome f7873fc698 tests : change umlaut test (#11600) 2025-05-28 15:49:28 +02:00
Johannes Gäßler a68247439b CUDA: fix FA tg at long context for CC >= 8.9 (#13852) 2025-05-28 13:33:37 +02:00
Xuan-Son Nguyen 26b79b6cb3 convert : fix tensor naming conflict for llama 4 vision (#13836)
* convert : fix tensor naming conflict for llama 4 vision

* add comment
2025-05-28 10:05:54 +02:00
leo-pony 1e8659e65a CANN: Add SOC TYPE printing in cmake configuration (#13837) 2025-05-28 11:54:20 +08:00
lhez a3c30846e4 opencl: add new ops - argsort, div, sub, addrows, sigmoid, group_norm (#13787)
* opencl: add `argsort`

* opencl: add `div`

* opencl: add `add_rows`

* opencl: add `sub`

* opencl: add `sigmoid`, both `f16` and `f32`

* opencl: add `group_norm`
2025-05-27 12:56:08 -07:00
lhez 1701d4c54f opencl: mark mul_mat f32f32 as supporting non-contiguous tensors (#13790) 2025-05-27 12:53:14 -07:00
Jeff Bolz bef8176387 vulkan: use timestamp queries for GGML_VULKAN_PERF (#13817)
Also change it to be controlled by an env var rather than cmake flag
2025-05-27 18:39:07 +02:00
Georgi Gerganov 34b7c0439e cmake : add llama-cparams.cpp to build (#13832) 2025-05-27 19:08:44 +03:00
Akarshan Biswas f3101a8cc6 SYCL: add gelu_erf kernel (#13749)
* SYCL: add gelu_erf kernel

* refactor code

Co-authored-by: Atharva Dubey <atharva.dubey@codeplay.com>

* Use scope_op_debug_print

---------

Co-authored-by: Atharva Dubey <atharva.dubey@codeplay.com>
2025-05-27 20:52:59 +05:30
Georgi Gerganov 1c49c70d07 sync : ggml 2025-05-27 18:05:33 +03:00
54 changed files with 2206 additions and 545 deletions
+1 -1
View File
@@ -49,6 +49,6 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/mtmd/miniaudio.h]
[tools/mtmd/vendor/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset
+15 -15
View File
@@ -26,12 +26,12 @@ jobs:
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libcurl4-openssl-dev:riscv64
g++-14-riscv64-linux-gnu
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
@@ -72,12 +72,12 @@ jobs:
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libvulkan-dev:riscv64 \
libcurl4-openssl-dev:riscv64
libvulkan-dev:riscv64
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -118,12 +118,12 @@ jobs:
build-essential \
glslc \
crossbuild-essential-arm64 \
libvulkan-dev:arm64 \
libcurl4-openssl-dev:arm64
libvulkan-dev:arm64
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -163,12 +163,12 @@ jobs:
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libcurl4-openssl-dev:ppc64el
g++-14-powerpc64le-linux-gnu
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
@@ -209,12 +209,12 @@ jobs:
glslc \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libvulkan-dev:ppc64el \
libcurl4-openssl-dev:ppc64el
libvulkan-dev:ppc64el
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
+9 -6
View File
@@ -903,13 +903,16 @@ struct common_init_result common_init_from_params(common_params & params) {
ok = false;
}
if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
ok = false;
}
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
+36 -12
View File
@@ -423,19 +423,19 @@ class ModelBase:
try:
# for security reason, we don't allow loading remote code by default
# if a model need remote code, we will fallback to config.json
return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
except Exception as e:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
config = json.load(f)
if "llm_config" in config:
# rename for InternVL
config["text_config"] = config["llm_config"]
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
return config
if "llm_config" in config:
# rename for InternVL
config["text_config"] = config["llm_config"]
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
return config
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1207,7 +1207,7 @@ class MmprojModel(ModelBase):
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
else:
if not self.has_vision_encoder and not self.has_audio_encoder:
raise ValueError("MmprojModel must have either vision or audio encoder")
def write_vocab(self):
@@ -1841,7 +1841,8 @@ class StableLMModel(TextModel):
"MistralForCausalLM",
"MixtralForCausalLM",
"VLlama3ForCausalLM",
"LlavaForConditionalGeneration")
"LlavaForConditionalGeneration",
"LlamaModel")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
undo_permute = True
@@ -1921,6 +1922,8 @@ class LlamaModel(TextModel):
if is_vision_tensor:
return [] # skip vision tensors
elif self.hf_arch == "LlamaModel":
name = "model." + name
elif name.startswith("model.text_model"):
name = name.replace("text_model.", "") # for SmolVLM
elif name.startswith("language_model."):
@@ -2169,6 +2172,9 @@ class Llama4VisionModel(MmprojModel):
# process vision tensors
if "positional_embedding_vlm" in name and ".weight" not in name:
name += ".weight"
if "multi_modal_projector.linear_1" in name:
# despite the name with number postfix, this is a single fully connected layer
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
return [(self.map_tensor_name(name), data_torch)]
return []
@@ -3676,7 +3682,7 @@ class InternLM3Model(TextModel):
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
class BertModel(TextModel):
model_arch = gguf.MODEL_ARCH.BERT
@@ -3684,11 +3690,21 @@ class BertModel(TextModel):
super().__init__(*args, **kwargs)
self.vocab_size = None
if cls_out_labels := self.hparams.get("id2label"):
if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
# Remove dummy labels added by AutoConfig
cls_out_labels = None
self.cls_out_labels = cls_out_labels
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_causal_attention(False)
self._try_set_pooling_type()
if self.cls_out_labels:
key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
def set_vocab(self):
tokens, toktypes, tokpre = self.get_vocab_base()
self.vocab_size = len(tokens)
@@ -3739,6 +3755,14 @@ class BertModel(TextModel):
if name.startswith("cls.seq_relationship"):
return []
if self.cls_out_labels:
# For BertForSequenceClassification (direct projection layer)
if name == "classifier.weight":
name = "classifier.out_proj.weight"
if name == "classifier.bias":
name = "classifier.out_proj.bias"
return [(self.map_tensor_name(name), data_torch)]
def _xlmroberta_tokenizer_init(self) -> None:
@@ -3833,7 +3857,7 @@ class BertModel(TextModel):
self.gguf_writer.add_add_eos_token(True)
@ModelBase.register("RobertaModel")
@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
class RobertaModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT
+1 -1
View File
@@ -288,7 +288,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated")
tests = [
"ied 4 ½ months",
"Führer",
"Äpfel",
"",
" ",
" ",
-1
View File
@@ -177,7 +177,6 @@ option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks"
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
+25
View File
@@ -24,3 +24,28 @@ function(ggml_get_flags CCID CCVER)
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()
# ggml_get_system_arch()
#
# Classifies the target CPU architecture and publishes the result to the
# caller's scope as GGML_SYSTEM_ARCH. The value is one of:
#   "ARM", "x86", "PowerPC", "loongarch64", "riscv64", "s390x", "UNKNOWN"
#
# Variables read from the calling scope:
#   CMAKE_OSX_ARCHITECTURES      - explicit macOS target architecture, if any
#   CMAKE_GENERATOR_PLATFORM_LWR - generator platform name; the caller is
#                                  expected to define it (possibly empty)
#                                  before invoking this function
#   CMAKE_SYSTEM_PROCESSOR       - processor name reported by CMake/toolchain
#
# For ARM and x86, CMAKE_SYSTEM_PROCESSOR is consulted only when neither
# CMAKE_OSX_ARCHITECTURES nor CMAKE_GENERATOR_PLATFORM_LWR is set, so an
# explicit target selection always takes precedence over the processor name.
function(ggml_get_system_arch)
    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
        set(GGML_SYSTEM_ARCH "ARM" PARENT_SCOPE)
    elseif (CMAKE_OSX_ARCHITECTURES      STREQUAL "x86_64" OR
            CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
             CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
        set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
    # The trailing space keeps both operands non-empty string literals even
    # when CMAKE_SYSTEM_PROCESSOR is empty.
    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
        set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
    # The remaining checks pass the variable *name* to if() rather than an
    # unquoted ${...} expansion: an unquoted expansion of an empty variable
    # disappears from the argument list and leaves if() with a malformed
    # condition, which aborts configuration instead of reaching the
    # "UNKNOWN" fallback below.
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
        set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
        set(GGML_SYSTEM_ARCH "riscv64" PARENT_SCOPE)
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
        set(GGML_SYSTEM_ARCH "s390x" PARENT_SCOPE)
    else()
        set(GGML_SYSTEM_ARCH "UNKNOWN" PARENT_SCOPE)
    endif()
endfunction()
+2
View File
@@ -109,6 +109,8 @@ if (MSVC)
else ()
set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()
ggml_get_system_arch()
message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
if (NOT MSVC)
if (GGML_STATIC)
+1
View File
@@ -30,6 +30,7 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
if (CANN_INSTALL_DIR)
# Only Support Linux.
+7 -17
View File
@@ -82,13 +82,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
endif()
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
if (GGML_SYSTEM_ARCH STREQUAL "ARM")
message(STATUS "ARM detected")
if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
else()
@@ -170,12 +165,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endforeach()
endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
message(STATUS "x86 detected")
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
@@ -318,7 +309,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
endif()
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
message(STATUS "PowerPC detected")
if (GGML_NATIVE)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
@@ -344,9 +335,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64)
if (GGML_LASX)
list(APPEND ARCH_FLAGS -mlasx)
@@ -354,8 +344,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_LSX)
list(APPEND ARCH_FLAGS -mlsx)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
message(STATUS "RISC-V detected")
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
message(STATUS "riscv64 detected")
if (GGML_RVV)
if (GGML_XTHEADVECTOR)
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
@@ -365,7 +355,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
message(STATUS "s390x detected")
file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
+215 -100
View File
@@ -7633,39 +7633,83 @@ static void ggml_compute_forward_ssm_scan_f32(
const int ir1 = MIN(ir0 + dr, nr);
const int ir = ir1 - ir0;
for (int i3 = 0; i3 < n_s; ++i3) {
for (int i2 = 0; i2 < n_t; ++i2) {
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
#ifdef __ARM_FEATURE_SVE
for (int i3 = 0; i3 < n_s; ++i3) {
for (int i2 = 0; i2 < n_t; ++i2) {
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
// use the output as the source for the next token-wise iterations
if (i2 > 0) { s0 = s; }
// use the output as the source for the next token-wise iterations
if (i2 > 0) { s0 = s; }
// d_inner
for (int i1 = 0; i1 < ir; ++i1) {
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
float x_dt = x[i1] * dt_soft_plus;
float sumf = 0.0f;
// d_state
for (int i0 = 0; i0 < nc; ++i0) {
int i = i0 + i1*nc;
// state = prev_state * dA + dB * x
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
// y = rowwise_dotprod(state, C)
sumf += state * C[i0];
s[i] = state;
// d_inner
for (int i1 = 0; i1 < ir; ++i1) {
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
float x_dt = x[i1] * dt_soft_plus;
svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
svfloat32_t r1_vector = GGML_F32_VEC_ZERO;
for (int64_t k = 0; k < nc; k += svcntw()) {
svfloat32_t vA = GGML_F32_VEC_LOAD(&A[i1*nc + k]);
svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k]);
svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k]);
svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[i1*nc + k]);
svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
t1 = exp_ps_sve(svptrue_b32(), t1);
svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);
vs0 = GGML_F32_VEC_FMA(vs0, t1, t2);
r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);
GGML_F32_VEC_STORE(&s[i1*nc + k], vs0);
}
y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector);
}
y[i1] = sumf;
}
}
}
#else
for (int i3 = 0; i3 < n_s; ++i3) {
for (int i2 = 0; i2 < n_t; ++i2) {
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
// use the output as the source for the next token-wise iterations
if (i2 > 0) { s0 = s; }
// d_inner
for (int i1 = 0; i1 < ir; ++i1) {
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
float x_dt = x[i1] * dt_soft_plus;
float sumf = 0.0f;
// d_state
for (int i0 = 0; i0 < nc; ++i0) {
int i = i0 + i1*nc;
// state = prev_state * dA + dB * x
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
// y = rowwise_dotprod(state, C)
sumf += state * C[i0];
s[i] = state;
}
y[i1] = sumf;
}
}
}
#endif
}
void ggml_compute_forward_ssm_scan(
@@ -8070,6 +8114,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
#define GGML_F32X_MUL GGML_F32x16_MUL
#define GGML_F32X_FMA GGML_F32x16_FMA
#define WKV_VECTOR_SIZE 16
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
#define GGML_F32X GGML_F32xt
#define GGML_F32X_SET1 GGML_F32xt_SET1
#define GGML_F32X_LOAD GGML_F32xt_LOAD
#define GGML_F32X_STORE GGML_F32xt_STORE
#define GGML_F32X_MUL GGML_F32xt_MUL
#define GGML_F32X_FMA GGML_F32xt_FMA
#define WKV_VECTOR_SIZE 8
#elif defined(__ARM_NEON) && defined(__aarch64__)
#define GGML_F32X GGML_F32x4
#define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8080,8 +8132,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
#define WKV_VECTOR_SIZE 4
#endif
int wkv_vector_size;
#ifdef WKV_VECTOR_SIZE
const int64_t vec_count = head_size / WKV_VECTOR_SIZE;
#if defined(__ARM_FEATURE_SVE)
wkv_vector_size = svcntw();
#else
wkv_vector_size = WKV_VECTOR_SIZE;
#endif
const int64_t vec_count = head_size / wkv_vector_size;
for (int64_t t = 0; t < T; t++) {
size_t t_offset = t * t_stride;
@@ -8111,7 +8169,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);
for (int64_t j = 0; j < vec_count; j++) {
size_t base_j = j * WKV_VECTOR_SIZE;
size_t base_j = j * wkv_vector_size;
size_t t_h_j_offset = t_h_offset + base_j;
size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
@@ -8136,7 +8194,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
}
// Handle remaining elements, this will not be used.
for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) {
for (int64_t j = vec_count * wkv_vector_size; j < head_size; j++) {
size_t t_h_j_offset = t_h_offset + j;
size_t h_2d_i_j_offset = h_2d_i_offset + j;
float v_val = v[t_h_j_offset];
@@ -8272,6 +8330,14 @@ static void ggml_compute_forward_gla_f32(
#define GGML_F32X_MUL GGML_F32x16_MUL
#define GGML_F32X_FMA GGML_F32x16_FMA
#define GLA_VECTOR_SIZE 16
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
#define GGML_F32X GGML_F32xt
#define GGML_F32X_SET1 GGML_F32xt_SET1
#define GGML_F32X_LOAD GGML_F32xt_LOAD
#define GGML_F32X_STORE GGML_F32xt_STORE
#define GGML_F32X_MUL GGML_F32xt_MUL
#define GGML_F32X_FMA GGML_F32xt_FMA
#define GLA_VECTOR_SIZE 8
#elif defined(__ARM_NEON) && defined(__aarch64__)
#define GGML_F32X GGML_F32x4
#define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8282,8 +8348,14 @@ static void ggml_compute_forward_gla_f32(
#define GLA_VECTOR_SIZE 4
#endif
int gla_vector_size;
#ifdef GLA_VECTOR_SIZE
const int64_t vec_count = head_size / GLA_VECTOR_SIZE;
#if defined(__ARM_FEATURE_SVE)
gla_vector_size = svcntw();
#else
gla_vector_size = GLA_VECTOR_SIZE;
#endif
const int64_t vec_count = head_size / gla_vector_size;
for (int64_t t = 0; t < T; t++) {
size_t t_offset = t * t_stride;
@@ -8310,7 +8382,7 @@ static void ggml_compute_forward_gla_f32(
GGML_F32X g_vec = GGML_F32X_SET1(g_val);
for (int64_t j = 0; j < vec_count; j++) {
size_t base_j = j * GLA_VECTOR_SIZE;
size_t base_j = j * gla_vector_size;
size_t t_h_j_offset = t_h_offset + base_j;
size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
@@ -8334,7 +8406,7 @@ static void ggml_compute_forward_gla_f32(
}
// Handle remaining elements, this will not be used.
for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) {
for (int64_t j = vec_count * gla_vector_size; j < head_size; j++) {
size_t t_h_j_offset = t_h_offset + j;
size_t h_2d_i_j_offset = h_2d_i_offset + j;
float v_val = v[t_h_j_offset];
@@ -8443,83 +8515,126 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
int64_t h_stride_2d = head_size * head_size;
#if defined(GGML_SIMD)
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;
int64_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
#if defined(__ARM_FEATURE_SVE)
// scalar Route to scalar implementation //TODO: Write SVE code
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;
int64_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
for (int64_t h = h_start; h < h_end; h++) {
int64_t h_offset = h * h_stride;
int64_t t_h_offset = t_offset + h_offset;
int64_t h_2d_offset = h * h_stride_2d;
for (int64_t h = h_start; h < h_end; h++) {
int64_t h_offset = h * h_stride;
int64_t t_h_offset = t_offset + h_offset;
int64_t h_2d_offset = h * h_stride_2d;
for (int64_t ii = 0; ii < head_size; ii++) {
int64_t t_h_i_offset = t_h_offset + ii;
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
for (int64_t i = 0; i < head_size; i++) {
int64_t t_h_i_offset = t_h_offset + i;
int64_t h_2d_i_offset = h_2d_offset + i * h_stride;
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
float v_val = v[t_h_i_offset];
float sa = 0;
{
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
}
float sa = 0, result = 0;
for (int64_t j = 0; j < head_size; j++) {
sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j];
}
GGML_F32_VEC_REDUCE(sa, sum);
}
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
for (int64_t j = 0; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
int64_t j = 0;
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
for (; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
// kv + s * decay + sa * b
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
float r_val = r[t_h_j_offset];
float w_val = w[t_h_j_offset];
float k_val = k[t_h_j_offset];
float b_val = b[t_h_j_offset];
float kv_val = v_val * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
result += state_cur[h_2d_i_j_offset] * r_val;
}
}
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
// There shouldn't be left-overs though.
for (; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
float r_val = r[t_h_j_offset];
float w_val = w[t_h_j_offset];
float k_val = k[t_h_j_offset];
float b_val = b[t_h_j_offset];
float kv_val = v[t_h_i_offset] * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
dst_data[t_h_i_offset] = result;
}
}
}
}
#else
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;
int64_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
for (int64_t h = h_start; h < h_end; h++) {
int64_t h_offset = h * h_stride;
int64_t t_h_offset = t_offset + h_offset;
int64_t h_2d_offset = h * h_stride_2d;
for (int64_t ii = 0; ii < head_size; ii++) {
int64_t t_h_i_offset = t_h_offset + ii;
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
float sa = 0;
{
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
}
}
GGML_F32_VEC_REDUCE(sa, sum);
}
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
int64_t j = 0;
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
for (; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
// kv + s * decay + sa * b
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
}
}
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
// There shouldn't be left-overs though.
for (; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
float r_val = r[t_h_j_offset];
float w_val = w[t_h_j_offset];
float k_val = k[t_h_j_offset];
float b_val = b[t_h_j_offset];
float kv_val = v[t_h_i_offset] * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
}
}
}
}
#endif
#else
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;
+117 -1
View File
@@ -17,7 +17,123 @@
// number of elements to fit in a single register
//
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
#define GGML_SIMD
// F32 SVE
//
// Single-precision helpers built on the Arm SVE intrinsics. Most operations come in
// two forms: an *_IMPL macro that takes an explicit predicate as its first argument,
// and a wrapper without the suffix that supplies DEFAULT_PG.

// NOTE(review): fixed at 8 elements per register, which corresponds to a 256-bit
// vector. The SVE vector length is implementation-defined - confirm that every user
// of this constant is correct for other vector lengths.
#define GGML_F32_EPR 8
// all-true predicate for 32-bit lanes
#define DEFAULT_PG svptrue_b32()
#define GGML_F32xt svfloat32_t
#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
#define GGML_F32xt_SET1(x) svdup_n_f32(x)
// the variadic tail is accepted and discarded; only the pointer `a` is used
#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
// (pointer, value) argument order
#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
// svmad computes a*b + c, so the accumulator is the LAST argument: FMA(x, y, acc).
// With merging (_m) predication the inactive lanes take the value of `a`.
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
// horizontal add of the active lanes of one vector
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
// Adds eight accumulator vectors pairwise (8 -> 4 -> 2 -> 1) and stores the horizontal
// sum of the result in `res`. The arguments sum1, sum3, sum5 and sum7 are overwritten.
// The `pg` parameter is not used by the body: every step uses DEFAULT_PG.
// The expansion is a bare brace block rather than do { } while (0).
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
{ \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
(res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
}
#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
// generic names used by the vector kernels, mapped onto the SVE helpers above
#define GGML_F32_VEC GGML_F32xt
#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
#define GGML_F32_VEC_SET1 GGML_F32xt_SET1
#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
#define GGML_F32_VEC_STORE GGML_F32xt_STORE
#define GGML_F32_VEC_FMA GGML_F32xt_FMA
#define GGML_F32_VEC_ADD GGML_F32xt_ADD
#define GGML_F32_VEC_MUL GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
// F16 NEON
//
// Half-precision helpers. Two variants are provided, selected by whether the target
// has native FP16 vector arithmetic.
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
// native FP16: 8 half-precision lanes per 128-bit register, 32 elements per step
#define GGML_F16_STEP 32
#define GGML_F16_EPR 8
#define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
#define GGML_F16x8_STORE vst1q_f16
// vfmaq_f16(a, b, c) computes a + b*c: the accumulator is the FIRST argument
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16
// Folds the accumulator array `x` in place: three rounds, each adding the upper half
// of the remaining entries onto the lower half, then widens x[0] to two float32x4_t
// halves and stores their horizontal sum in `res`.
// NOTE(review): three halving rounds leave a single vector only if GGML_F16_ARR is
// at most 8 - its definition is not visible here, confirm.
#define GGML_F16x8_REDUCE(res, x) \
do { \
int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0)
// generic names; the index argument `i` of LOAD is unused, STORE writes element r[i]
#define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
#else
// if FP16 vector arithmetic is not supported, we use FP32 instead
// and take advantage of the vcvt_ functions to convert to/from FP16
// (4 lanes per register, 16 elements per step)
#define GGML_F16_STEP 16
#define GGML_F16_EPR 4
#define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
// load 4 halves and widen to float32
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
// narrow to half precision and store 4 halves
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
// vfmaq_f32(a, b, c) computes a + b*c: the accumulator is the FIRST argument
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32
#define GGML_F32Cx4_MUL vmulq_f32
// relies on GGML_F32x4_REDUCE, which is not defined in this section
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
// generic names; the index argument `i` of LOAD is unused, STORE writes element r[i]
#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
#define GGML_SIMD
+85 -16
View File
@@ -17,29 +17,98 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
#if defined(GGML_SIMD)
float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
#if defined(__ARM_FEATURE_SVE)
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
const int np = (n & ~(ggml_f32_step - 1));
svfloat32_t sum1 = svdup_n_f32(0.0f);
svfloat32_t sum2 = svdup_n_f32(0.0f);
svfloat32_t sum3 = svdup_n_f32(0.0f);
svfloat32_t sum4 = svdup_n_f32(0.0f);
svfloat32_t sum5 = svdup_n_f32(0.0f);
svfloat32_t sum6 = svdup_n_f32(0.0f);
svfloat32_t sum7 = svdup_n_f32(0.0f);
svfloat32_t sum8 = svdup_n_f32(0.0f);
svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
for (int i = 0; i < np; i += ggml_f32_step) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
}
}
// leftovers
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
const int np2 = (n & ~(ggml_f32_epr - 1));
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
}
// Fewer than ggml_f32_epr elements remain; process them with a predicated load.
// svmla_f32_m(pg, sum1, ax1, ay1) computes sum1 + ax1*ay1 on the active lanes and
// leaves the inactive lanes of sum1 untouched. The merging form of svmad must not be
// used here: it copies its first operand (ax1, which the predicated load zeroes in
// the inactive lanes) into the inactive lanes of the result, discarding the partial
// sums that the loops above accumulated in those lanes of sum1.
if (np2 < n) {
    svbool_t pg = svwhilelt_b32(np2, n);
    ax1 = svld1_f32(pg, x + np2);
    ay1 = svld1_f32(pg, y + np2);
    sum1 = svmla_f32_m(pg, sum1, ax1, ay1);
}
// reduce sum1..sum8 into sumf
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
#else
const int np = (n & ~(GGML_F32_STEP - 1));
// reduce sum0..sum3 to sum0
GGML_F32_VEC_REDUCE(sumf, sum);
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
// leftovers
for (int i = np; i < n; ++i) {
sumf += x[i]*y[i];
}
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
}
}
// reduce sum0..sum3 to sum0
GGML_F32_VEC_REDUCE(sumf, sum);
// leftovers
for (int i = np; i < n; ++i) {
sumf += x[i]*y[i];
}
#endif
#else
// scalar
ggml_float sumf = 0.0;
+211 -56
View File
@@ -5,6 +5,7 @@
#include "ggml-impl.h"
#include "simd-mappings.h"
#include "ggml.h"
#include "ggml-cpu.h"
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
@@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
#if defined(__ARM_FEATURE_SVE)
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
const int np = (n & ~(ggml_f32_step - 1));
svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
for (int i = 0; i < np; i += ggml_f32_step) {
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
GGML_F32_VEC_STORE(y + i, ay1);
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
}
}
// leftovers
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
const int np2 = (n & ~(ggml_f32_epr - 1));
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
// leftovers
for (int i = np; i < n; ++i) {
y[i] += x[i]*v;
}
GGML_F32_VEC_STORE(y + i, ay1);
}
// fewer than ggml_f32_epr elements remain: apply a predicated svmad to the available elements only
if (np2 < n) {
svbool_t pg =svwhilelt_b32(np2, n);
ax1 = svld1_f32(pg, x + np2);
ay1 = svld1_f32(pg, y + np2);
ay1 = svmad_f32_m(pg, ax1, vx, ay1);
svst1_f32(pg, y + np2, ay1);
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] += x[i]*v;
}
#endif
#else
// scalar
for (int i = 0; i < n; ++i) {
@@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
}
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
vx[k] = GGML_F32_VEC_SET1(v[k][0]);
}
GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
#if defined(__ARM_FEATURE_SVE)
// TODO: write SVE code; until then, route to the scalar implementation
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = 0; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
}
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
// leftovers
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = np; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
vx[k] = GGML_F32_VEC_SET1(v[k][0]);
}
}
GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
}
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = np; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
}
}
#endif
#else
// scalar
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
#if defined(__ARM_FEATURE_SVE)
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 2 * ggml_f32_epr;
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
const int np = (n & ~(ggml_f32_step - 1));
svfloat32_t ay1;
svfloat32_t ay2;
for (int i = 0; i < np; i += ggml_f32_step) {
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_MUL(ay1, vx);
GGML_F32_VEC_STORE(y + i, ay1);
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_MUL(ay2, vx);
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
}
}
// leftovers
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
if (np < n) {
svbool_t pg = svwhilelt_b32(np, n);
ay1 = svld1_f32(pg, y + np);
ay1 = svmul_f32_m(pg, ay1, vx);
svst1_f32(pg, y + np, ay1);
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
// leftovers
for (int i = np; i < n; ++i) {
y[i] *= v;
}
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] *= v;
}
#endif
#else
// scalar
for (int i = 0; i < n; ++i) {
@@ -528,6 +647,42 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif
/* Below function was borrowed from the GitHub repository:
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
// Vectorised single-precision exp(x) approximation for SVE.
//
//   pg  - predicate passed to every predicated intrinsic below
//   src - input values x
//
// Returns a vector holding the approximation of e^x.
//
// Outline: with y = x*log2(e), n = floor(y) and b = 1 + (y - n), the FEXPA instruction
// (svexpa) supplies a coarse power of two from the upper bits of b, svscale applies
// 2^n, and a second-order polynomial in z (the low 17 bits of b that FEXPA did not
// consume) corrects the result.
// NOTE(review): `ln2` and `half_ln2_sq` are not the exact values of ln(2) and
// ln(2)^2/2; they look like fitted polynomial coefficients - confirm against the
// upstream source before changing them.
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
// Constants
const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
// mask that clears the low 17 bits of a 32-bit word
const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
const svfloat32_t one = svdup_n_f32(1.0f);
// values supplied for the inactive lanes of the merging round/convert below
const svfloat32_t inactive1 = svdup_n_f32(0.0f);
const svint32_t inactive2 = svdup_n_s32(0);
// Algorithm starts here
svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // floor(y), kept as float
svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n = floor(y) as integer
t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
t1 = svadd_f32_m(pg, t1, one); // b = a + 1
svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)
// and_(t2.d, t1.d, not_mask17.d)
// t5 = b with its low 17 bits cleared, i.e. the part of b that FEXPA consumed
svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
t5 = svsub_f32_m(pg, t1, t5); // z
t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
t0 = svmul_f32_m(pg, t0, t4); // Final result
return t0;
}
#endif
#if defined(__ARM_NEON) && defined(__aarch64__)
// adapted from arm limited optimized routine
+2 -2
View File
@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
__builtin_assume(tid < D);
extern __shared__ float2 meta[];
if (tid < 2*parallel_blocks) {
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
for (int i = tid; i < 2*parallel_blocks; i += D) {
((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
}
__syncthreads();
+6
View File
@@ -55,14 +55,17 @@ endfunction()
set(GGML_OPENCL_KERNELS
add
argsort
clamp
cpy
cvt
diag_mask_inf
div
gelu
gemv_noshuffle_general
gemv_noshuffle
get_rows
group_norm
im2col_f32
im2col_f16
mul_mat_Ab_Bi_8x4
@@ -83,11 +86,14 @@ set(GGML_OPENCL_KERNELS
rms_norm
rope
scale
sigmoid
silu
softmax_4_f32
softmax_4_f16
softmax_f32
softmax_f16
sub
sum_rows
transpose
)
+638 -2
View File
@@ -299,27 +299,37 @@ struct ggml_backend_opencl_context {
cl_program program_mul_mv_f16_f32;
cl_program program_mul_mv_f32_f32;
cl_program program_mul;
cl_program program_div;
cl_program program_sub;
cl_program program_norm;
cl_program program_relu;
cl_program program_rms_norm;
cl_program program_group_norm;
cl_program program_rope;
cl_program program_scale;
cl_program program_silu;
cl_program program_sigmoid;
cl_program program_softmax_f32;
cl_program program_softmax_f16;
cl_program program_softmax_4_f32;
cl_program program_softmax_4_f16;
cl_program program_argsort_f32_i32;
cl_program program_sum_rows_f32;
cl_kernel kernel_add, kernel_add_row;
cl_kernel kernel_mul, kernel_mul_row;
cl_kernel kernel_div, kernel_div_row;
cl_kernel kernel_sub, kernel_sub_row;
cl_kernel kernel_scale;
cl_kernel kernel_silu, kernel_silu_4;
cl_kernel kernel_gelu, kernel_gelu_4;
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
cl_kernel kernel_relu;
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
cl_kernel kernel_clamp;
cl_kernel kernel_norm;
cl_kernel kernel_rms_norm;
cl_kernel kernel_group_norm;
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
cl_kernel kernel_soft_max, kernel_soft_max_4;
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -339,6 +349,8 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
cl_kernel kernel_argsort_f32_i32;
cl_kernel kernel_sum_rows_f32;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// Transpose kernels
@@ -986,6 +998,105 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// argsort
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "argsort.cl.h"
};
#else
const std::string kernel_src = read_file("argsort.cl");
#endif
backend_ctx->program_argsort_f32_i32 =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
GGML_LOG_CONT(".");
}
// div
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "div.cl.h"
};
#else
const std::string kernel_src = read_file("div.cl");
#endif
backend_ctx->program_div =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
GGML_LOG_CONT(".");
}
// sub
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "sub.cl.h"
};
#else
const std::string kernel_src = read_file("sub.cl");
#endif
backend_ctx->program_sub =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
GGML_LOG_CONT(".");
}
// sum_rows
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "sum_rows.cl.h"
};
#else
const std::string kernel_src = read_file("sum_rows.cl");
#endif
backend_ctx->program_sum_rows_f32 =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
GGML_LOG_CONT(".");
}
// sigmoid
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "sigmoid.cl.h"
};
#else
const std::string kernel_src = read_file("sigmoid.cl");
#endif
backend_ctx->program_sigmoid =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
GGML_LOG_CONT(".");
}
// group_norm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "group_norm.cl.h"
};
#else
const std::string kernel_src = read_file("group_norm.cl");
#endif
backend_ctx->program_group_norm =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
GGML_LOG_CONT(".");
}
// Adreno kernels
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// transpose
@@ -1856,6 +1967,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_ADD:
case GGML_OP_SCALE:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SUB:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
@@ -1863,7 +1976,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_GELU_QUICK:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_UNARY_OP_SIGMOID:
return ggml_is_contiguous(op->src[0]);
default:
return false;
}
@@ -1873,11 +1988,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
return true;
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_MUL_MAT:
if (op->src[0]->type == GGML_TYPE_F16) {
return true;
} else if (op->src[0]->type == GGML_TYPE_F32) {
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
return op->src[1]->type == GGML_TYPE_F32;
} else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
op->src[0]->type == GGML_TYPE_Q6_K) {
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
@@ -1912,6 +2029,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
}
case GGML_OP_IM2COL:
return true;
case GGML_OP_ARGSORT:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_SUM_ROWS:
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
default:
return false;
}
@@ -3238,6 +3359,256 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
}
}
// Enqueues the OpenCL division kernel that combines src0 and src1 into dst.
//
// Two kernels are available:
//   - kernel_div_row: chosen when every element of src1 lies in its first dimension,
//     src1 is contiguous, and both ne00 and ne10 are multiples of 4. It is launched
//     with one work-item per group of 4 dst elements; src0 must be contiguous.
//   - kernel_div: the general kernel. It receives the byte strides of src0 and dst
//     plus the extents and byte strides of src1, and is launched with one work-group
//     per src0 row.
//
// src0, src1 and dst must all be non-null and carry a ggml_tensor_extra_cl in ->extra.
static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    // backend state and device buffers
    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    // byte offsets of each tensor inside its device buffer (views included)
    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    // src0 extents and byte strides
    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    // src1 extents and byte strides
    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];

    const cl_ulong nb10 = src1->nb[0];
    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];
    const cl_ulong nb13 = src1->nb[3];

    // dst row length and byte strides
    const int ne0 = dst->ne[0];

    const cl_ulong nb0 = dst->nb[0];
    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    // src1 is a single contiguous row and both row lengths are multiples of 4
    const bool bcast_row = ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0;

    if (bcast_row) {
        GGML_ASSERT(ggml_is_contiguous(src0));

        // src1 is a row
        GGML_ASSERT(ne11 == 1);
    }

    cl_kernel kernel = bcast_row ? backend_ctx->kernel_div_row : backend_ctx->kernel_div;

    // arguments 0..5 are the same for both kernels: buffer + offset for src0, src1, dst
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));

    size_t global_work_size[3];
    size_t local_work_size[3];

    if (bcast_row) {
        // row kernel: only the src0 row length in units of 4 elements is passed
        int ne = ne00 / 4;
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));

        // one work-item per group of 4 dst elements
        int n = ggml_nelements(dst)/4;
        global_work_size[0] = (size_t)n;
        global_work_size[1] = 1;
        global_work_size[2] = 1;

        local_work_size[0] = 64;
        local_work_size[1] = 1;
        local_work_size[2] = 1;
    } else {
        // general kernel: strides of src0, shape + strides of src1, row length + strides of dst
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));

        // one work-group of at most 64 work-items per src0 row
        unsigned int nth = MIN(64, ne0);
        global_work_size[0] = ne01*nth;
        global_work_size[1] = (size_t)ne02;
        global_work_size[2] = (size_t)ne03;

        local_work_size[0] = nth;
        local_work_size[1] = 1;
        local_work_size[2] = 1;
    }

#ifdef GGML_OPENCL_PROFILING
    // keep the event so the launch can be timed
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
// Element-wise subtraction on the OpenCL backend: dst = src0 - src1.
//
// Two kernels are available:
//   - kernel_sub_row: taken when src1 is one contiguous row and both ne00 and
//     ne10 are multiples of 4. One work-item handles four consecutive elements.
//   - kernel_sub: general path, addressed through the byte strides of all three
//     tensors; src1 is broadcast over src0 via its ne1x extents.
//
// All three tensors must carry a ggml_tensor_extra_cl in ->extra.
static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    // src0 extents and byte strides
    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    // src1 extents and byte strides
    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];

    const cl_ulong nb10 = src1->nb[0];
    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];
    const cl_ulong nb13 = src1->nb[3];

    // dst row length and byte strides
    const int ne0 = dst->ne[0];

    const cl_ulong nb0 = dst->nb[0];
    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    // Byte offsets into the device buffers, including any view offset.
    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    bool bcast_row = false;
    cl_kernel kernel;

    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
        GGML_ASSERT(ggml_is_contiguous(src0));

        // src1 is a row
        GGML_ASSERT(ne11 == 1);

        bcast_row = true;
        int ne = ne00 / 4; // row length in 4-element groups
        kernel = backend_ctx->kernel_sub_row;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
    } else {
        kernel = backend_ctx->kernel_sub;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
    }

    if (bcast_row) {
        int n = ggml_nelements(dst)/4;
        size_t global_work_size[] = {(size_t)n, 1, 1};
        size_t local_work_size[] = {64, 1, 1};

        // A fixed local size of 64 is only usable when it divides the global
        // size or the device accepts non-uniform work-groups; otherwise the
        // enqueue is rejected, so fall back to a driver-chosen local size.
        size_t * local_work_size_ptr = local_work_size;
        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
        }

#ifdef GGML_OPENCL_PROFILING
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

        g_profiling_info.emplace_back();
        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
    } else {
        // One work-group of nth work-items per src0 row; the global size is an
        // exact multiple of the local size by construction.
        unsigned int nth = MIN(64, ne0);
        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
        size_t local_work_size[] = {nth, 1, 1};

#ifdef GGML_OPENCL_PROFILING
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

        g_profiling_info.emplace_back();
        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
    }
}
static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -3429,6 +3800,58 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
#endif
}
// Enqueues the element-wise sigmoid kernel for src0 -> dst.
// src0 and dst must both be F32 or both be F16; src1 is not used.
static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;

    // Device-buffer byte offsets, view offset included.
    cl_ulong offsetd = extrad->offset + dst->view_offs;
    cl_ulong offset0 = extra0->offset + src0->view_offs;

    // Choose the kernel variant from the (input, output) type pair.
    const bool both_f32 = src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
    const bool both_f16 = src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16;

    cl_kernel kernel;
    if (both_f32) {
        kernel = backend_ctx->kernel_sigmoid_f32;
    } else if (both_f16) {
        kernel = backend_ctx->kernel_sigmoid_f16;
    } else {
        GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    // One work-item per element.
    const int64_t n = ggml_nelements(dst);

    size_t local_work_size[] = {64, 1, 1};
    size_t global_work_size[] = {(size_t)n, 1, 1};

    // Keep the fixed local size only when it divides the global size or the
    // device supports non-uniform work-groups; otherwise the driver picks one.
    const bool fixed_local_ok = (n % 64 == 0) || backend_ctx->non_uniform_workgroups;
    size_t * local_work_size_ptr = fixed_local_ok ? local_work_size : nullptr;

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -3626,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
#endif
}
// Enqueues the group-norm kernel for src0 -> dst.
//
// op_params[0] holds the number of groups (int32) and op_params[1] the epsilon
// (float). Each work-group normalizes one run of `group_size` consecutive
// floats, so the launch uses n_groups work-groups per batch entry (src0->ne[3]).
// The local size is the sub-group size chosen for the GPU family, because the
// kernel reduces with sub-group operations.
//
// NOTE(review): the flat indexing in the kernel presumes src0/dst are
// contiguous, and the per-batch group boundaries only line up when ne[2] is a
// multiple of n_groups — confirm against the supports_op checks.
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    int32_t n_groups   = ((const int32_t *) dst->op_params)[0];
    // Elements per group: full ne0 x ne1 planes times ceil(ne2 / n_groups) channels.
    int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
    float   eps        = ((const float *) dst->op_params)[1];

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    // Total element count across all batch entries; the kernel clamps the last
    // group against this bound. Previously ne03 was left out, so only the first
    // batch entry was ever normalized.
    const int ne = ne00*ne01*ne02*ne03;

    cl_kernel kernel = backend_ctx->kernel_group_norm;

    size_t sgs = 64;
    if (backend_ctx->gpu_family == ADRENO) {
        sgs = 64;
    } else if (backend_ctx->gpu_family == INTEL) {
        sgs = 32;
    } else {
        GGML_ASSERT(false && "Unsupported GPU");
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));

    // One work-group (a single sub-group) per group per batch entry.
    size_t global_work_size[] = {(size_t)n_groups*(size_t)ne03*sgs, 1, 1};
    size_t local_work_size[] = {(size_t)sgs, 1, 1};

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -4975,6 +5457,124 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
#endif
}
// Enqueues the argsort kernel: for every row of the contiguous F32 tensor src0,
// dst (I32) receives the element indices in sorted order. The sort direction is
// read from dst->op_params[0]. src1 is not used.
static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;
    cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;

    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;

    cl_ulong offsetd = extrad->offset + dst->view_offs;
    cl_ulong offset0 = extra0->offset + src0->view_offs;

    const int nrows = ggml_nrows(src0);
    const int ne00  = src0->ne[0];

    // Round the row length up to the next power of two (minimum 1); the kernel
    // sorts over this padded width.
    int ne00_padded = 1;
    for (; ne00_padded < ne00; ne00_padded *= 2) {
    }

    const enum ggml_sort_order sort_order = (enum ggml_sort_order) dst->op_params[0];
    int order = sort_order;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
    // Argument 7 is local scratch: one int per padded column.
    CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));

    // One work-group per row, one work-item per padded column.
    size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
    size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
// Enqueues the row-sum kernel: every row of the contiguous tensor src0 is
// reduced to a single value written to the matching (i1, i2, i3) position of
// dst. One work-item processes one row. src1 is not used.
static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    // Byte offsets into the device buffers, including any view offset.
    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));

    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)64, 1, 1};

    // The global size in dimension 0 is the raw row count, which need not be a
    // multiple of 64. Without non-uniform work-group support such a launch is
    // rejected, so let the driver choose the local size in that case.
    size_t * local_work_size_ptr = local_work_size;
    if (ne01 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
    }

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
//------------------------------------------------------------------------------
// Op offloading
//------------------------------------------------------------------------------
@@ -5023,6 +5623,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_mul;
break;
case GGML_OP_DIV:
if (!any_on_device) {
return false;
}
func = ggml_cl_div;
break;
case GGML_OP_SUB:
if (!any_on_device) {
return false;
}
func = ggml_cl_sub;
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_GELU:
@@ -5049,6 +5661,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_relu;
break;
case GGML_UNARY_OP_SIGMOID:
if (!any_on_device) {
return false;
}
func = ggml_cl_sigmoid;
break;
default:
return false;
} break;
@@ -5070,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_rms_norm;
break;
case GGML_OP_GROUP_NORM:
if (!any_on_device) {
return false;
}
func = ggml_cl_group_norm;
break;
case GGML_OP_MUL_MAT:
if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
return false;
@@ -5115,6 +5739,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_im2col;
break;
case GGML_OP_ARGSORT:
if (!any_on_device) {
return false;
}
func = ggml_cl_argsort;
break;
case GGML_OP_SUM_ROWS:
if (!any_on_device) {
return false;
}
func = ggml_cl_sum_rows;
break;
default:
return false;
}
+86
View File
@@ -0,0 +1,86 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
// Exchanges the values of two lvalues x and y of type T through a temporary.
// Expands to a braced block; both arguments are evaluated more than once.
#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
// Sort direction compared against the kernel's `order` argument.
// NOTE(review): presumably has to stay in sync with the host-side
// ggml_sort_order values that are passed in as an int — confirm.
enum ggml_sort_order {
GGML_SORT_ORDER_ASC,
GGML_SORT_ORDER_DESC,
};
// Per-row argsort using a bitonic sorting network kept in local memory.
//
//   src0, offset0 : f32 input; rows of ne00 elements, byte offset applied first
//   dst,  offsetd : i32 output; row r receives the ne00 source indices in
//                   sorted order
//   ne00          : real row length
//   ne00_pad      : width of the sorting network (>= ne00); the doubling loop
//                   below assumes it is a power of two
//   order         : GGML_SORT_ORDER_ASC or GGML_SORT_ORDER_DESC
//   dst_row       : local scratch holding ne00_pad ints
//
// Each work-item owns the slot given by its local id in dimension 0; the
// work-group id in dimension 1 selects the row. Slots with index >= ne00 are
// padding: the comparisons push them to the end and they are never used to
// read from the input row.
kernel void kernel_argsort_f32_i32(
    global float  * src0,
    ulong           offset0,
    global int    * dst,
    ulong           offsetd,
    const int       ne00,
    const int       ne00_pad,
    const int       order,
    local int     * dst_row
) {
    // bitonic sort
    int col = get_local_id(0);
    int row = get_group_id(1);

    if (col >= ne00_pad) {
        return;
    }

    // Apply the byte offsets. The casts match the declared element types of the
    // pointers (float input, int output) so no incompatible pointer conversion
    // is needed.
    src0 = (global float *)((global char *)src0 + offset0);
    dst  = (global int   *)((global char *)dst  + offsetd);

    global float * x_row = src0 + row * ne00;

    // initialize indices
    dst_row[col] = col;
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 2; k <= ne00_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            // Only the lower-indexed partner of each pair performs the exchange.
            if (ixj > col) {
                if ((col & k) == 0) {
                    if (dst_row[col] >= ne00 ||
                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
                    ) {
                        SWAP(dst_row[col], dst_row[ixj], int);
                    }
                } else {
                    if (dst_row[ixj] >= ne00 ||
                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
                    ) {
                        SWAP(dst_row[col], dst_row[ixj], int);
                    }
                }
            }
            // Every work-item reaches this barrier once per step, whether or
            // not it exchanged anything.
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }

    // copy the result to dst without the padding
    if (col < ne00) {
        dst[row * ne00 + col] = dst_row[col];
    }
}
+72
View File
@@ -0,0 +1,72 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// div
//------------------------------------------------------------------------------
// General element-wise division dst = src0 / src1 on f32 data addressed by
// byte strides. One work-group handles one dst row; src1 indices wrap modulo
// its extents so a smaller src1 is broadcast across src0.
kernel void kernel_div(
        global char * src0,
        ulong  offset0,
        global char * src1,
        ulong  offset1,
        global char * dst,
        ulong  offsetd,
        ulong  nb00,
        ulong  nb01,
        ulong  nb02,
        ulong  nb03,
        int    ne10,
        int    ne11,
        int    ne12,
        int    ne13,
        ulong  nb10,
        ulong  nb11,
        ulong  nb12,
        ulong  nb13,
        int    ne0,
        ulong  nb0,
        ulong  nb1,
        ulong  nb2,
        ulong  nb3
) {
    // Row coordinates come from the work-group ids.
    const int i01 = get_group_id(0);
    const int i02 = get_group_id(1);
    const int i03 = get_group_id(2);

    // Matching src1 coordinates, wrapped for broadcasting.
    const int i11 = i01 % ne11;
    const int i12 = i02 % ne12;
    const int i13 = i03 % ne13;

    global char * src0_row = (src0 + offset0) + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_row = (src1 + offset1) + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_row  = (dst  + offsetd) + i03*nb3  + i02*nb2  + i01*nb1;

    // The work-items of the group step across the row together.
    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;

        const float numerator   = *((global float *)(src0_row + i0*nb00));
        const float denominator = *((global float *)(src1_row + i10*nb10));

        *((global float *)(dst_row + i0*nb0)) = numerator / denominator;
    }
}
// Row-broadcast division on float4 data: dst[i] = src0[i] / src1[i mod ne].
// Assumes src1 is a single row of `ne` float4 values that is repeated across
// every row of src0.
kernel void kernel_div_row(
        global float4 * src0,
        ulong  offset0,
        global float4 * src1,
        ulong  offset1,
        global float4 * dst,
        ulong  offsetd,
        int    ne
) {
    // Apply the byte offsets to each buffer.
    global float4 * lhs = (global float4*)((global char*)src0 + offset0);
    global float4 * rhs = (global float4*)((global char*)src1 + offset1);
    global float4 * out = (global float4*)((global char*)dst + offsetd);

    const uint gid = get_global_id(0);

    // Position inside the broadcast row, i.e. gid % ne. Written as
    // divide/multiply/subtract because that performs better than using %.
    const uint row_index = gid / ne;
    const uint idx1      = gid - row_index*ne;

    out[gid] = lhs[gid] / rhs[idx1];
}
@@ -0,0 +1,72 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
// Group normalization over runs of `group_size` consecutive floats.
// The reductions use sub-group operations only, so each work-group must be
// exactly one sub-group; the required sub-group size is pinned per GPU family.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_group_norm(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        int ne,
        int group_size,
        float eps
) {
    global float * in  = (global float *)((global char *)src0 + offset0);
    global float * out = (global float *)((global char *)dst + offsetd);

    // Work-group g covers elements [g*group_size, (g+1)*group_size), clamped to
    // the total element count ne.
    const int group_begin = get_group_id(0) * group_size;
    const int group_limit = group_begin + group_size;
    const int end         = group_limit >= ne ? ne : group_limit;

    // First element for this work-item; subsequent ones follow at local-size steps.
    int start = group_begin;
    start += get_local_id(0);

    // Pass 1: sum -> mean (the divisor is group_size even for a clamped group).
    float partial_sum = 0.0f;
    for (int j = start; j < end; j += get_local_size(0)) {
        partial_sum += in[j];
    }
    const float mean = sub_group_reduce_add(partial_sum) / group_size;

    // Pass 2: store centered values and accumulate their squares.
    float partial_sq = 0.0f;
    for (int j = start; j < end; j += get_local_size(0)) {
        const float xi = in[j] - mean;
        out[j] = xi;
        partial_sq += xi * xi;
    }
    const float variance = sub_group_reduce_add(partial_sq) / group_size;

    // Pass 3: scale the centered values by 1/sqrt(variance + eps).
    const float scale = 1.0f/sqrt(variance + eps);
    for (int j = start; j < end; j += get_local_size(0)) {
        out[j] *= scale;
    }
}
+29
View File
@@ -0,0 +1,29 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// sigmoid
//------------------------------------------------------------------------------
// Element-wise logistic sigmoid on f32: dst[i] = 1 / (1 + exp(-src0[i])).
// One work-item per element, indexed by the global id in dimension 0.
kernel void kernel_sigmoid_f32(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd
) {
    global float * in  = (global float*)((global char*)src0 + offset0);
    global float * out = (global float*)((global char*)dst + offsetd);

    const size_t i = get_global_id(0);
    const float  x = in[i];

    out[i] = 1.0f / (1.0f + exp(-x));
}
// Element-wise logistic sigmoid on f16: dst[i] = 1 / (1 + exp(-src0[i])).
// exp() is evaluated on the half value; the remaining arithmetic uses float
// literals and the result is converted back to half on store.
kernel void kernel_sigmoid_f16(
        global half * src0,
        ulong offset0,
        global half * dst,
        ulong offsetd
) {
    global half * in  = (global half*)((global char*)src0 + offset0);
    global half * out = (global half*)((global char*)dst + offsetd);

    const size_t i = get_global_id(0);
    const half   x = in[i];

    out[i] = 1.0f / (1.0f + exp(-x));
}
+72
View File
@@ -0,0 +1,72 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// sub
//------------------------------------------------------------------------------
// General element-wise subtraction dst = src0 - src1 on f32 data addressed by
// byte strides. One work-group handles one dst row; src1 indices wrap modulo
// its extents so a smaller src1 is broadcast across src0.
kernel void kernel_sub(
        global char * src0,
        ulong  offset0,
        global char * src1,
        ulong  offset1,
        global char * dst,
        ulong  offsetd,
        ulong  nb00,
        ulong  nb01,
        ulong  nb02,
        ulong  nb03,
        int    ne10,
        int    ne11,
        int    ne12,
        int    ne13,
        ulong  nb10,
        ulong  nb11,
        ulong  nb12,
        ulong  nb13,
        int    ne0,
        ulong  nb0,
        ulong  nb1,
        ulong  nb2,
        ulong  nb3
) {
    // Row coordinates come from the work-group ids.
    const int i01 = get_group_id(0);
    const int i02 = get_group_id(1);
    const int i03 = get_group_id(2);

    // Matching src1 coordinates, wrapped for broadcasting.
    const int i11 = i01 % ne11;
    const int i12 = i02 % ne12;
    const int i13 = i03 % ne13;

    global char * src0_row = (src0 + offset0) + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_row = (src1 + offset1) + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_row  = (dst  + offsetd) + i03*nb3  + i02*nb2  + i01*nb1;

    // The work-items of the group step across the row together.
    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;

        const float minuend    = *((global float *)(src0_row + i0*nb00));
        const float subtrahend = *((global float *)(src1_row + i10*nb10));

        *((global float *)(dst_row + i0*nb0)) = minuend - subtrahend;
    }
}
// Row-broadcast subtraction on float4 data: dst[i] = src0[i] - src1[i mod ne].
// Assumes src1 is a single row of `ne` float4 values that is repeated across
// every row of src0.
kernel void kernel_sub_row(
        global float4 * src0,
        ulong  offset0,
        global float4 * src1,
        ulong  offset1,
        global float4 * dst,
        ulong  offsetd,
        int    ne
) {
    // Apply the byte offsets to each buffer.
    global float4 * lhs = (global float4*)((global char*)src0 + offset0);
    global float4 * rhs = (global float4*)((global char*)src1 + offset1);
    global float4 * out = (global float4*)((global char*)dst + offsetd);

    const uint gid = get_global_id(0);

    // Position inside the broadcast row, i.e. gid % ne. Written as
    // divide/multiply/subtract because that performs better than using %.
    const uint row_index = gid / ne;
    const uint idx1      = gid - row_index*ne;

    out[gid] = lhs[gid] - rhs[idx1];
}
+39
View File
@@ -0,0 +1,39 @@
// Sums each f32 row of src0 into one value of dst.
// A work-item with global id (i1, i2, i3) reduces the ne00 elements of source
// row (i1, i2, i3) sequentially and writes the total to the first element of
// the matching dst row. Work-items outside the tensor extents do nothing.
kernel void kernel_sum_rows_f32(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    global float * in  = (global float *)((global char *)src0 + offset0);
    global float * out = (global float *)((global char *)dst + offsetd);

    const int i1 = get_global_id(0);
    const int i2 = get_global_id(1);
    const int i3 = get_global_id(2);

    // Guard against a global size that overshoots the tensor extents.
    const bool in_range = i3 < ne03 && i2 < ne02 && i1 < ne01;
    if (!in_range) {
        return;
    }

    global float * src_row = (global float *) ((global char *) in  + i1*nb01 + i2*nb02 + i3*nb03);
    global float * dst_row = (global float *) ((global char *) out + i1*nb1  + i2*nb2  + i3*nb3);

    float row_sum = 0.0f;
    for (int i0 = 0; i0 < ne00; ++i0) {
        row_sum += src_row[i0];
    }

    dst_row[0] = row_sum;
}
+60
View File
@@ -84,6 +84,15 @@ static void gelu_quick(const T *x, T *dst, int k,
dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
}
// Device-side GELU (erf formulation): dst[i] = 0.5 * x[i] * (1 + erf(x[i] / sqrt(2))).
// Each work-item starts at its global id in dimension 2 and advances by the
// global range of that dimension until k elements have been covered.
template<typename T>
static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
    const T c_half      = static_cast<T>(0.5f);
    const T c_one       = static_cast<T>(1.0f);
    const T c_inv_sqrt2 = static_cast<T>(0.70710678118654752440084436210484f);

    const size_t count  = (const size_t)k;
    const size_t stride = item_ct1.get_global_range(2);

    for (size_t idx = item_ct1.get_global_id(2); idx < count; idx += stride) {
        const T value = x[idx];
        dst[idx] = c_half * value * (c_one + sycl::erf(value * c_inv_sqrt2));
    }
}
template<typename T>
static void tanh(const T *x, T *dst, int k,
const sycl::nd_item<3> &item_ct1) {
@@ -400,6 +409,20 @@ static void gelu_quick_sycl(const T *x, T *dst, const int k,
});
}
template<typename T>
static void gelu_erf_sycl(const T *x, T *dst, const int k,
queue_ptr stream) {
const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
gelu_erf(x, dst, k, item_ct1);
});
}
template<typename T>
static void tanh_sycl(const T *x, T *dst, const int k,
queue_ptr stream) {
@@ -816,6 +839,38 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
}
}
// Runs GELU (erf formulation) for dst from dst->src[0] on the context's stream.
// Source and destination must have the same type: F32 always, F16 additionally
// when built with GGML_SYCL_F16. Any other type aborts.
inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
#else
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
#endif
    GGML_ASSERT(dst->src[0]->type == dst->type);

    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));

    // Shared launch step: the element type follows from the typed pointer pair.
    const auto launch = [&](const auto & data_pts) {
        gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
    };

    switch (dst->type) {
#if defined (GGML_SYCL_F16)
        case GGML_TYPE_F16:
            launch(cast_data<sycl::half>(dst));
            break;
#endif
        case GGML_TYPE_F32:
            launch(cast_data<float>(dst));
            break;
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
    }
}
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
@@ -1425,6 +1480,11 @@ void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_gelu_quick(ctx, dst);
}
// Public entry point for the GELU (erf) op: sets up the scoped debug print for
// this call (one source tensor) and forwards to the op implementation.
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    // Constructed first so it stays alive for the duration of the op call.
    scope_op_debug_print op_debug_scope(__func__, dst, /*num_src=*/1);

    ggml_sycl_op_gelu_erf(ctx, dst);
}
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_tanh(ctx, dst);
+2
View File
@@ -38,6 +38,8 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+4
View File
@@ -3543,6 +3543,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_UNARY_OP_GELU_QUICK:
ggml_sycl_gelu_quick(ctx, dst);
break;
case GGML_UNARY_OP_GELU_ERF:
ggml_sycl_gelu_erf(ctx, dst);
break;
case GGML_UNARY_OP_TANH:
ggml_sycl_tanh(ctx, dst);
break;
@@ -4096,6 +4099,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SGN:
-4
View File
@@ -109,10 +109,6 @@ if (Vulkan_FOUND)
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
endif()
if (GGML_VULKAN_PERF)
add_compile_definitions(GGML_VULKAN_PERF)
endif()
if (GGML_VULKAN_VALIDATE)
add_compile_definitions(GGML_VULKAN_VALIDATE)
endif()
+69 -16
View File
@@ -1,6 +1,6 @@
#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
#include <chrono>
#include "ggml-cpu.h"
#endif
@@ -184,9 +184,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
#ifdef GGML_VULKAN_MEMORY_DEBUG
class vk_memory_logger;
#endif
#ifdef GGML_VULKAN_PERF
class vk_perf_logger;
#endif
static void ggml_vk_destroy_buffer(vk_buffer& buf);
static constexpr uint32_t mul_mat_vec_max_cols = 8;
@@ -442,9 +440,11 @@ struct vk_device_struct {
#ifdef GGML_VULKAN_MEMORY_DEBUG
std::unique_ptr<vk_memory_logger> memory_logger;
#endif
#ifdef GGML_VULKAN_PERF
// for GGML_VK_PERF_LOGGER
std::unique_ptr<vk_perf_logger> perf_logger;
#endif
vk::QueryPool query_pool;
uint32_t num_queries;
~vk_device_struct() {
VK_LOG_DEBUG("destroy device " << name);
@@ -828,8 +828,6 @@ private:
#define VK_LOG_MEMORY(msg) ((void) 0)
#endif // GGML_VULKAN_MEMORY_DEBUG
#if defined(GGML_VULKAN_PERF)
class vk_perf_logger {
public:
void print_timings() {
@@ -839,7 +837,7 @@ public:
for (const auto& time : t.second) {
total += time;
}
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
}
timings.clear();
@@ -868,7 +866,6 @@ public:
private:
std::map<std::string, std::vector<uint64_t>> timings;
};
#endif // GGML_VULKAN_PERF
struct ggml_backend_vk_context {
std::string name;
@@ -958,6 +955,8 @@ struct vk_instance_t {
static bool vk_instance_initialized = false;
static vk_instance_t vk_instance;
static bool vk_perf_logger_enabled = false;
#ifdef GGML_VULKAN_CHECK_RESULTS
static size_t vk_skip_checks;
static size_t vk_output_tensor;
@@ -2757,9 +2756,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
#ifdef GGML_VULKAN_MEMORY_DEBUG
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
#endif
#ifdef GGML_VULKAN_PERF
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
#endif
if (vk_perf_logger_enabled) {
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
}
size_t dev_num = vk_instance.device_indices[idx];
@@ -3547,6 +3546,8 @@ static void ggml_vk_instance_init() {
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -8885,7 +8886,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
ctx->tensor_ctxs[node_idx] = compute_ctx;
#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
#if defined(GGML_VULKAN_CHECK_RESULTS)
// Force context reset on each node so that each tensor ends up in its own context
// and can be run and compared to its CPU equivalent separately
last_node = true;
@@ -9505,6 +9506,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool first_node_in_batch = true; // true if next node will be first node in a batch
int submit_node_idx = 0; // index to first node in a batch
vk_context compute_ctx;
if (vk_perf_logger_enabled) {
// allocate/resize the query pool
if (ctx->device->num_queries < cgraph->n_nodes + 1) {
if (ctx->device->query_pool) {
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
}
VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
query_create_info.queryCount = cgraph->n_nodes + 100;
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
ctx->device->num_queries = query_create_info.queryCount;
}
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
GGML_ASSERT(ctx->compute_ctx.expired());
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
}
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
// (and scaled down based on model size, so smaller models submit earlier).
@@ -9532,6 +9556,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
if (vk_perf_logger_enabled) {
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
}
if (enqueued) {
++submitted_nodes;
@@ -9553,9 +9588,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
}
}
#ifdef GGML_VULKAN_PERF
ctx->device->perf_logger->print_timings();
#endif
if (vk_perf_logger_enabled) {
// End the command buffer and submit/wait
GGML_ASSERT(!ctx->compute_ctx.expired());
compute_ctx = ctx->compute_ctx.lock();
ggml_vk_ctx_end(compute_ctx);
ggml_vk_submit(compute_ctx, ctx->device->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
ctx->device->device.resetFences({ ctx->device->fence });
// Get the results and pass them to the logger
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
for (int i = 0; i < cgraph->n_nodes; i++) {
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
}
}
ctx->device->perf_logger->print_timings();
}
ggml_vk_graph_cleanup(ctx);
+3
View File
@@ -177,6 +177,9 @@ class Keys:
EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
BLOCK_COUNT = "{arch}.convnext.block_count"
class Classifier:
OUTPUT_LABELS = "{arch}.classifier.output_labels"
class Tokenizer:
MODEL = "tokenizer.ggml.model"
PRE = "tokenizer.ggml.pre"
-1
View File
@@ -902,7 +902,6 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
"multi_modal_projector.linear_1", # llama 4
),
MODEL_TENSOR.V_MMPROJ_MLP: (
+1 -1
View File
@@ -231,7 +231,7 @@ class SafetensorRemote:
response.raise_for_status()
# Get raw byte data
return response.content[:size]
return response.content[slice(size if size > -1 else None)]
@classmethod
def check_file_exist(cls, url: str) -> bool:
+1 -1
View File
@@ -1 +1 @@
7c06c10c532a6cda913c17fc56341e8880ae341d
06b715f4c170232af261425240914fa49c44f982
+1
View File
@@ -14,6 +14,7 @@ add_library(llama
llama-batch.cpp
llama-chat.cpp
llama-context.cpp
llama-cparams.cpp
llama-grammar.cpp
llama-graph.cpp
llama-hparams.cpp
+2
View File
@@ -174,6 +174,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
{ LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
+2
View File
@@ -213,6 +213,8 @@ enum llm_kv {
LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
LLM_KV_CONVNEXT_BLOCK_COUNT,
LLM_KV_CLASSIFIER_OUTPUT_LABELS,
// deprecated:
LLM_KV_TOKENIZER_PREFIX_ID,
LLM_KV_TOKENIZER_SUFFIX_ID,
+18 -13
View File
@@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
}
int64_t llm_graph_context::n_pos_per_embd() const {
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
}
void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling(
ggml_tensor * inp_cls = build_inp_cls();
inp = ggml_get_rows(ctx0, inp, inp_cls);
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
GGML_ASSERT(cls != nullptr);
GGML_ASSERT(cls_b != nullptr);
if (cls != nullptr && cls_b != nullptr) {
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
cur = ggml_tanh(ctx0, cur);
cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
cur = ggml_tanh(ctx0, cur);
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if (cls_out) {
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if (cls_out) {
GGML_ASSERT(cls_out_b != nullptr);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
}
} else if (cls_out) {
// Single layer classification head (direct projection)
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
GGML_ASSERT(cls_out_b != nullptr);
cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
} else {
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
}
} break;
default:
+3
View File
@@ -131,6 +131,9 @@ struct llama_hparams {
bool attn_soft_cap = false;
bool use_kq_norm = true;
// for Classifiers
uint32_t n_cls_out = 1;
// llama4
uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4;
+10 -2
View File
@@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type;
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
// a normal RoPE should work, we just need to use the correct ordering
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
? LLAMA_ROPE_TYPE_NEOX
: hparams.rope_type;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
: cparams.yarn_attn_factor;
ggml_tensor * tmp;
+3 -2
View File
@@ -683,6 +683,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
switch (hparams.n_layer) {
case 3:
@@ -2121,8 +2122,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
}
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-3
View File
@@ -7,7 +7,6 @@
//
#include <exception>
#include <iostream>
#include <json.hpp>
#include <string>
#include "chat-parser.h"
@@ -15,8 +14,6 @@
#include "log.h"
#include "regex-partial.h"
using json = nlohmann::ordered_json;
template <class T>
static void assert_equals(const T & expected, const T & actual) {
if (expected != actual) {
+26 -20
View File
@@ -1,48 +1,54 @@
# mtmd
# compile mtmd-audio separately to avoid long compile times with miniaudio.h
# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
if (BUILD_SHARED_LIBS)
set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(mtmd_audio PRIVATE cxx_std_17)
target_include_directories(mtmd_audio PRIVATE .)
add_library(mtmd OBJECT
mtmd.cpp
mtmd-helper.cpp
mtmd-audio.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
)
target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
target_compile_features(mtmd PRIVATE cxx_std_17)
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
# compile the helper separately, to avoid long compile times with miniaudio.h and stb_image.h
add_library(mtmd_helper OBJECT
mtmd-helper.cpp
mtmd-helper.h
)
target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd_helper PUBLIC .)
target_include_directories(mtmd_helper PRIVATE ./vendor)
target_include_directories(mtmd_helper PRIVATE ../..)
target_compile_features(mtmd_helper PRIVATE cxx_std_17)
if (BUILD_SHARED_LIBS)
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_shared LIBRARY)
set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_helper_shared LIBRARY)
endif()
if (NOT MSVC)
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
# for stb_image.h and miniaudio.h
target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
endif()
if(TARGET BUILD_INFO)
add_dependencies(mtmd BUILD_INFO)
add_dependencies(mtmd_helper BUILD_INFO)
endif()
add_executable(llama-llava-cli deprecation-warning.cpp)
@@ -54,5 +60,5 @@ set(TARGET llama-mtmd-cli)
add_executable(${TARGET} mtmd-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
-27
View File
@@ -11,9 +11,6 @@
#include "ggml-backend.h"
#include "gguf.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#include <cassert>
#include <cmath>
#include <cstdlib>
@@ -2786,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
}
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc;
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
return false;
}
clip_build_img_from_pixels(data, nx, ny, img);
stbi_image_free(data);
return true;
}
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
int nx, ny, nc;
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
LOG_ERR("%s: failed to decode image bytes\n", __func__);
return false;
}
clip_build_img_from_pixels(data, nx, ny, img);
stbi_image_free(data);
return true;
}
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
dst.nx = src.nx;
-86
View File
@@ -1,28 +1,5 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include "mtmd-audio.h"
//#define MTMD_AUDIO_DEBUG
#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
# define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "miniaudio.h"
#define _USE_MATH_DEFINES // for M_PI
#include <cmath>
#include <cstdint>
@@ -359,69 +336,6 @@ bool preprocess_audio(
} // namespace whisper_preprocessor
namespace audio_helpers {
bool is_audio_file(const char * buf, size_t len) {
if (len < 12) {
return false;
}
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
bool is_mp3 = len >= 3 && (
memcmp(buf, "ID3", 3) == 0 ||
// Check for MPEG sync word (simplified check)
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
);
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
return is_wav || is_mp3 || is_flac;
}
// returns true if the buffer is a valid audio file
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
ma_result result;
const int channels = 1;
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
ma_decoder decoder;
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
if (result != MA_SUCCESS) {
return false;
}
ma_uint64 frame_count;
ma_uint64 frames_read;
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
pcmf32_mono.resize(frame_count);
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
#ifdef MTMD_AUDIO_DEBUG
// save audio to wav file
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
ma_encoder encoder;
ma_encoder_init_file("output.wav", &config, &encoder);
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
ma_encoder_uninit(&encoder);
#endif
ma_decoder_uninit(&decoder);
return true;
}
} // namespace wav_utils
// precalculated mel filter banks
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
//
+2 -17
View File
@@ -32,7 +32,7 @@ struct whisper_filters {
std::vector<float> data;
};
extern bool preprocess_audio(
bool preprocess_audio(
const float * samples,
size_t n_samples,
const whisper_filters & filters,
@@ -40,23 +40,8 @@ extern bool preprocess_audio(
} // namespace whisper_preprocessor
// TODO @ngxson : move this helper to mtmd-helpers.cpp
namespace audio_helpers {
extern bool is_audio_file(const char * buf, size_t len);
extern bool decode_audio_from_buf(
const unsigned char * buf_in,
size_t len,
int target_sampler_rate,
std::vector<float> & pcmf32_mono);
} // namespace audio_helpers
namespace whisper_precalc_filters {
extern whisper_preprocessor::whisper_filters get_128_bins();
whisper_preprocessor::whisper_filters get_128_bins();
} // namespace whisper_precalc_filters
+2 -1
View File
@@ -7,6 +7,7 @@
#include "console.h"
#include "chat.h"
#include "mtmd.h"
#include "mtmd-helper.h"
#include <vector>
#include <limits.h>
@@ -143,7 +144,7 @@ struct mtmd_cli_context {
}
bool load_media(const std::string & fname) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
if (!bmp.ptr) {
return false;
}
+142
View File
@@ -1,10 +1,37 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include "mtmd.h"
#include "mtmd-helper.h"
#include "llama.h"
#include <algorithm>
#include <cinttypes>
#include <vector>
//#define MTMD_AUDIO_DEBUG
#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
# define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "vendor/miniaudio.h"
#define STB_IMAGE_IMPLEMENTATION
#include "vendor/stb_image.h"
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
@@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
return 0;
}
namespace audio_helpers {
// Sniff the leading bytes of `buf` for a WAV, MP3 or FLAC signature.
//   buf : start of the file contents
//   len : number of readable bytes in `buf`
// Returns true when one of the signatures matches. Buffers shorter than 12 bytes
// are always rejected, since the WAV check reads bytes 8..11.
static bool is_audio_file(const char * buf, size_t len) {
    if (len < 12) {
        return false;
    }

    // WAV: a RIFF container whose form type is "WAVE"
    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
    if (memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0) {
        return true;
    }

    // MP3: either an ID3 tag at the start of the file ...
    if (memcmp(buf, "ID3", 3) == 0) {
        return true;
    }

    // ... or an MPEG sync word (simplified check: 0xFF followed by three set high bits)
    const unsigned char first  = (unsigned char)buf[0];
    const unsigned char second = (unsigned char)buf[1];
    if (first == 0xFF && (second & 0xE0) == 0xE0) {
        return true;
    }

    // FLAC: stream marker "fLaC"
    return memcmp(buf, "fLaC", 4) == 0;
}
// Decode an in-memory audio file into mono 32-bit float PCM using miniaudio.
//   buf_in              : encoded file contents
//   len                 : size of `buf_in` in bytes
//   target_sampler_rate : sample rate requested from the decoder
//   pcmf32_mono         : output; on success holds exactly the frames that were decoded
// Returns true if the buffer is a valid audio file that could be decoded, false otherwise.
// On failure the contents of `pcmf32_mono` are unspecified.
static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
    ma_result result;
    const int channels = 1; // mono output, so one float per frame
    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
    ma_decoder decoder;

    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
    if (result != MA_SUCCESS) {
        return false;
    }

    ma_uint64 frame_count = 0;
    ma_uint64 frames_read = 0;
    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
    if (result != MA_SUCCESS) {
        ma_decoder_uninit(&decoder);
        return false;
    }

    pcmf32_mono.resize(frame_count);
    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
    if (result != MA_SUCCESS) {
        ma_decoder_uninit(&decoder);
        return false;
    }

    // The decoder may deliver fewer frames than the reported length; drop the
    // unwritten tail so callers do not treat zero-filled padding as audio.
    pcmf32_mono.resize(frames_read);

#ifdef MTMD_AUDIO_DEBUG
    // save audio to wav file
    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
    ma_encoder encoder;
    ma_encoder_init_file("output.wav", &config, &encoder);
    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
    ma_encoder_uninit(&encoder);
#endif

    ma_decoder_uninit(&decoder);
    return true;
}
} // namespace audio_helpers
// Build a bitmap from a buffer holding an encoded media file.
// Buffers recognised by audio_helpers::is_audio_file() are decoded as audio at the
// bitrate reported by mtmd_get_audio_bitrate(ctx); everything else is handed to
// stb_image as an image. Returns nullptr on any failure.
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
    const bool looks_like_audio = audio_helpers::is_audio_file((const char *)buf, len);

    if (!looks_like_audio) {
        // not audio, so we assume it's an image (3 channels requested from stb_image)
        int nx, ny, nc;
        auto * pixels = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
        if (!pixels) {
            LOG_ERR("%s: failed to decode image bytes\n", __func__);
            return nullptr;
        }
        // the decoded pixels are released right after the bitmap is created
        mtmd_bitmap * bmp = mtmd_bitmap_init(nx, ny, pixels);
        stbi_image_free(pixels);
        return bmp;
    }

    // audio path: a negative bitrate means the model has no audio support
    const int bitrate = mtmd_get_audio_bitrate(ctx);
    if (bitrate < 0) {
        LOG_ERR("This model does not support audio input\n");
        return nullptr;
    }

    std::vector<float> samples;
    const bool decoded = audio_helpers::decode_audio_from_buf(buf, len, bitrate, samples);
    if (!decoded) {
        LOG_ERR("Unable to read WAV audio file from buffer\n");
        return nullptr;
    }
    return mtmd_bitmap_init_from_audio(samples.size(), samples.data());
}
// Read the whole file `fname` into memory and build a bitmap from it via
// mtmd_helper_bitmap_init_from_buf().
// Returns nullptr if the file cannot be opened, its size cannot be determined,
// it cannot be read completely, or its contents cannot be decoded.
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
    FILE * f = fopen(fname, "rb");
    if (!f) {
        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
        return nullptr;
    }

    // Determine the file size by seeking to the end. ftell() returns -1 on failure
    // (for example on a non-seekable stream); that must not reach the vector below,
    // where it would be converted to a huge unsigned size.
    long file_size = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        file_size = ftell(f);
    }
    if (file_size < 0 || fseek(f, 0, SEEK_SET) != 0) {
        LOG_ERR("Unable to determine size of file %s: %s\n", fname, strerror(errno));
        fclose(f);
        return nullptr;
    }

    std::vector<unsigned char> buf((size_t)file_size);
    size_t n_read = fread(buf.data(), 1, buf.size(), f);
    fclose(f);
    if (n_read != buf.size()) {
        LOG_ERR("Failed to read entire file %s\n", fname);
        return nullptr;
    }

    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
}
+91
View File
@@ -0,0 +1,91 @@
#ifndef MTMD_HELPER_H
#define MTMD_HELPER_H
#include "ggml.h"
#include "llama.h"
#include "mtmd.h"
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
//
// libmtmd helper functions
//
// Please note that these helpers are not guaranteed to be stable.
// BREAKING CHANGES are expected.
//
// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
// audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunks * chunks,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past);
// works like mtmd_helper_eval_chunks(), but only for a single chunk
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past);
// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
float * encoded_embd,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past);
#ifdef __cplusplus
} // extern "C"
#endif
//
// C++ wrappers
//
#endif
+5 -46
View File
@@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) {
return ctx->ctx_a != nullptr;
}
// these 2 helpers below use internal clip_image_u8_ptr,
// so unfortunately they cannot be moved to mtmd-helper.h
// however, in theory, user can decode image file to bitmap using
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
if (audio_helpers::is_audio_file((const char *)buf, len)) {
std::vector<float> pcmf32;
if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
LOG_ERR("Unable to read WAV audio file from buffer\n");
return nullptr;
}
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
int mtmd_get_audio_bitrate(mtmd_context * ctx) {
if (!ctx->ctx_a) {
return -1;
}
clip_image_u8_ptr img_u8(clip_image_u8_init());
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
if (!ok) {
LOG_ERR("Unable to load image from buffer\n");
return nullptr;
}
uint32_t nx, ny;
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
return mtmd_bitmap_init(nx, ny, data);
}
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
std::vector<unsigned char> buf;
FILE * f = fopen(fname, "rb");
if (!f) {
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
return nullptr;
}
fseek(f, 0, SEEK_END);
long file_size = ftell(f);
fseek(f, 0, SEEK_SET);
buf.resize(file_size);
size_t n_read = fread(buf.data(), 1, file_size, f);
fclose(f);
if (n_read != (size_t)file_size) {
LOG_ERR("Failed to read entire file %s", fname);
return nullptr;
}
return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
// for now, we assume that all audio models have the same bitrate
return 16000; // 16kHz
}
//
+4 -69
View File
@@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
// whether the current model supports audio input
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
// get audio bitrate in Hz, for example 16000 for Whisper
// return -1 if audio is not supported
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
// mtmd_bitmap
//
// if bitmap is image:
@@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
/////////////////////////////////////////
//
// Helper functions (can be implemented based on other functions)
//
// Please note that these helpers are not guaranteed to be stable.
// BREAKING CHANGES are expected.
//
// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
// audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunks * chunks,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past);
// works like mtmd_helper_eval_chunks(), but only for a single chunk
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past);
// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
float * encoded_embd,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past);
/////////////////////////////////////////
// test function, to be used in test-mtmd-c-api.c
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
+1 -1
View File
@@ -36,7 +36,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED)
+2 -1
View File
@@ -9,6 +9,7 @@
#include "sampling.h"
#include "speculative.h"
#include "mtmd.h"
#include "mtmd-helper.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
@@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) {
throw std::runtime_error("This server does not support multimodal");
}
for (auto & file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image or audio file");
}
+11 -4
View File
@@ -6,6 +6,7 @@
#include "arg.h" // common_remote_get_content
#include "base64.hpp"
#include "mtmd.h"
#include "mtmd-helper.h"
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -264,13 +265,19 @@ static size_t validate_utf8(const std::string& text) {
static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
llama_tokens result;
// Get EOS token - use SEP token as fallback if EOS is not available
llama_token eos_token = llama_vocab_eos(vocab);
if (eos_token == LLAMA_TOKEN_NULL) {
eos_token = llama_vocab_sep(vocab);
}
result.reserve(doc.size() + query.size() + 4);
result.push_back(llama_vocab_bos(vocab));
result.insert(result.end(), query.begin(), query.end());
result.push_back(llama_vocab_eos(vocab));
result.push_back(eos_token);
result.push_back(llama_vocab_sep(vocab));
result.insert(result.end(), doc.begin(), doc.end());
result.push_back(llama_vocab_eos(vocab));
result.push_back(eos_token);
return result;
}
@@ -573,7 +580,7 @@ struct oaicompat_parser_options {
// used by /chat/completions endpoint
static json oaicompat_chat_params_parse(
const json & body, /* openai api json semantics */
json & body, /* openai api json semantics */
const oaicompat_parser_options & opt,
std::vector<raw_buffer> & out_files)
{
@@ -624,7 +631,7 @@ static json oaicompat_chat_params_parse(
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
}
json messages = body.at("messages");
json & messages = body.at("messages");
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array");
}