mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 01:57:43 +02:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4a5b8aff40 | |||
| d2d626938a | |||
| 2fc392ce35 | |||
| ece0f5c177 | |||
| 7bef684118 | |||
| 395e286bc9 | |||
| 13730c183b | |||
| 967eb4b2bf | |||
| f117be185e | |||
| 85234a4b3a | |||
| 0c74f32632 | |||
| c27efd2bd1 | |||
| df70bedda7 | |||
| f914544b16 | |||
| 4b13a684c5 | |||
| 9898b57cbe | |||
| 1032256ec9 | |||
| 15274c0c50 | |||
| b8595b16e6 | |||
| 392e09a608 | |||
| 802cef44bf | |||
| 1c07c0c68c |
@@ -34,6 +34,7 @@
|
||||
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
|
||||
enableCurl ? true,
|
||||
useVulkan ? false,
|
||||
useRpc ? false,
|
||||
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
||||
|
||||
# It's necessary to consistently use backendStdenv when building with CUDA support,
|
||||
@@ -175,6 +176,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
(cmakeBool "GGML_METAL" useMetalKit)
|
||||
(cmakeBool "GGML_VULKAN" useVulkan)
|
||||
(cmakeBool "GGML_STATIC" enableStatic)
|
||||
(cmakeBool "GGML_RPC" useRpc)
|
||||
]
|
||||
++ optionals useCuda [
|
||||
(
|
||||
|
||||
@@ -60,3 +60,11 @@ end_of_line = unset
|
||||
charset = unset
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
[benches/**]
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
end_of_line = unset
|
||||
charset = unset
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"chars": 2296.1916666666666,
|
||||
"chars:std": 986.051306946325,
|
||||
"score": 0.925,
|
||||
"score:std": 0.26339134382131846
|
||||
}
|
||||
+2896
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -2253,6 +2253,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.is_pp_shared = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
||||
add_opt(common_arg(
|
||||
{"-tgs"},
|
||||
string_format("is the text generation separated across the different sequences (default: %s)", params.is_tg_separate ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.is_tg_separate = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
||||
add_opt(common_arg(
|
||||
{"-npp"}, "n0,n1,...",
|
||||
"number of prompt tokens",
|
||||
|
||||
+2
-1
@@ -460,7 +460,8 @@ struct common_params {
|
||||
float slot_prompt_similarity = 0.1f;
|
||||
|
||||
// batched-bench params
|
||||
bool is_pp_shared = false;
|
||||
bool is_pp_shared = false;
|
||||
bool is_tg_separate = false;
|
||||
|
||||
std::vector<int32_t> n_pp;
|
||||
std::vector<int32_t> n_tg;
|
||||
|
||||
+103
-16
@@ -218,8 +218,7 @@ class ModelBase:
|
||||
logger.info(f"gguf: indexing model part '{part_name}'")
|
||||
ctx: ContextManager[Any]
|
||||
if is_safetensors:
|
||||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
||||
ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name))
|
||||
else:
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
|
||||
|
||||
@@ -228,18 +227,18 @@ class ModelBase:
|
||||
|
||||
for name in model_part.keys():
|
||||
if is_safetensors:
|
||||
data: gguf.utility.LocalTensor = model_part[name]
|
||||
if self.lazy:
|
||||
data = model_part.get_slice(name)
|
||||
data_gen = lambda data=data: LazyTorchTensor.from_safetensors_slice(data) # noqa: E731
|
||||
data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data) # noqa: E731
|
||||
else:
|
||||
data = model_part.get_tensor(name)
|
||||
data_gen = lambda data=data: data # noqa: E731
|
||||
dtype = LazyTorchTensor._dtype_str_map[data.dtype]
|
||||
data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape) # noqa: E731
|
||||
else:
|
||||
data = model_part[name]
|
||||
data_torch: Tensor = model_part[name]
|
||||
if self.lazy:
|
||||
data_gen = lambda data=data: LazyTorchTensor.from_eager(data) # noqa: E731
|
||||
data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data) # noqa: E731
|
||||
else:
|
||||
data_gen = lambda data=data: data # noqa: E731
|
||||
data_gen = lambda data=data_torch: data # noqa: E731
|
||||
tensors[name] = data_gen
|
||||
|
||||
# verify tensor name presence and identify potentially missing files
|
||||
@@ -278,15 +277,14 @@ class ModelBase:
|
||||
# The scale is inverted
|
||||
return data / scale.float()
|
||||
|
||||
def dequant_simple(weight: Tensor, scale: Tensor) -> Tensor:
|
||||
def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor:
|
||||
scale = scale.float()
|
||||
|
||||
if (weight_block_size := quant_config.get("weight_block_size")):
|
||||
# TODO: make sure it's a list of integers
|
||||
for i, size in enumerate(weight_block_size):
|
||||
if block_size is not None:
|
||||
for i, size in enumerate(block_size):
|
||||
scale = scale.repeat_interleave(size, i)
|
||||
# unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
|
||||
scale = scale[tuple(slice(0, size) for size in weight.shape)]
|
||||
# unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
|
||||
scale = scale[tuple(slice(0, size) for size in weight.shape)]
|
||||
|
||||
return weight.float() * scale
|
||||
|
||||
@@ -333,6 +331,40 @@ class ModelBase:
|
||||
|
||||
return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T
|
||||
|
||||
def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int):
|
||||
assert w.dtype == torch.int32
|
||||
shape = tuple(shape_tensor.tolist())
|
||||
assert len(shape) == 2
|
||||
mask = (1 << num_bits) - 1
|
||||
|
||||
shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32)
|
||||
if self.lazy:
|
||||
shifts = LazyTorchTensor.from_eager(shifts)
|
||||
|
||||
if zero_point is None:
|
||||
offset = 1 << (num_bits - 1)
|
||||
else:
|
||||
assert len(zero_point.shape) == 2
|
||||
offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask
|
||||
offset = offset.reshape(-1, zero_point.shape[1])
|
||||
# trim padding, and prepare for broadcast
|
||||
# NOTE: the zero-point is packed along dim 0
|
||||
offset = offset[:shape[0], :].unsqueeze(-1)
|
||||
|
||||
# extract values
|
||||
# NOTE: the weights are packed along dim 1
|
||||
unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask
|
||||
unpacked = unpacked.reshape(shape[0], -1)
|
||||
|
||||
# trim padding
|
||||
unpacked = unpacked[:, :shape[1]]
|
||||
|
||||
# prepare for broadcast of the scale
|
||||
unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size)
|
||||
unpacked = unpacked - offset
|
||||
|
||||
return (unpacked * scale.unsqueeze(-1).float()).reshape(shape)
|
||||
|
||||
if quant_method == "bitnet":
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
@@ -342,12 +374,13 @@ class ModelBase:
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s())
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method == "fp8":
|
||||
block_size = quant_config.get("weight_block_size")
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale_inv"):
|
||||
weight_name = name.removesuffix("_scale_inv")
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s())
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method == "gptq":
|
||||
for name in self.model_tensors.keys():
|
||||
@@ -371,6 +404,49 @@ class ModelBase:
|
||||
".scales",
|
||||
)
|
||||
]
|
||||
elif quant_method == "compressed-tensors":
|
||||
quant_format = quant_config["format"]
|
||||
groups = quant_config["config_groups"]
|
||||
if len(groups) > 1:
|
||||
raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
|
||||
weight_config = tuple(groups.values())[0]["weights"]
|
||||
|
||||
if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized":
|
||||
block_size = weight_config.get("block_structure", None)
|
||||
strategy = weight_config.get("strategy")
|
||||
assert strategy == "channel" or strategy == "block"
|
||||
assert weight_config.get("group_size") is None # didn't find a model using this yet
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_format == "pack-quantized":
|
||||
assert weight_config.get("strategy") == "group"
|
||||
assert weight_config.get("type", "int") == "int"
|
||||
num_bits = weight_config.get("num_bits")
|
||||
group_size = weight_config.get("group_size")
|
||||
assert isinstance(num_bits, int)
|
||||
assert isinstance(group_size, int)
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_packed"):
|
||||
base_name = name.removesuffix("_packed")
|
||||
w = self.model_tensors[name]
|
||||
scale = self.model_tensors[base_name + "_scale"]
|
||||
shape = self.model_tensors[base_name + "_shape"]
|
||||
zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None)
|
||||
new_tensors[base_name] = (
|
||||
lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed(
|
||||
w(), scale(), shape(), zero_point(), num_bits, group_size,
|
||||
)
|
||||
)
|
||||
tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
|
||||
if (base_name + "_zero_point") in self.model_tensors:
|
||||
tensors_to_remove.append(base_name + "_zero_point")
|
||||
else:
|
||||
raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
|
||||
else:
|
||||
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
|
||||
|
||||
@@ -7278,6 +7354,7 @@ class PLMModel(TextModel):
|
||||
@ModelBase.register("T5ForConditionalGeneration")
|
||||
@ModelBase.register("MT5ForConditionalGeneration")
|
||||
@ModelBase.register("UMT5ForConditionalGeneration")
|
||||
@ModelBase.register("UMT5Model")
|
||||
class T5Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.T5
|
||||
|
||||
@@ -10002,6 +10079,16 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
|
||||
def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
|
||||
dtype = cls._dtype_str_map[tensor.dtype]
|
||||
return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
|
||||
dtype = cls._dtype_str_map[t.dtype]
|
||||
shape = t.shape
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
|
||||
dtype = cls._dtype_str_map[remote_tensor.dtype]
|
||||
|
||||
@@ -211,6 +211,11 @@ add_library(ggml-base
|
||||
ggml-quants.h
|
||||
gguf.cpp)
|
||||
|
||||
set_target_properties(ggml-base PROPERTIES
|
||||
VERSION ${GGML_VERSION}
|
||||
SOVERSION ${GGML_VERSION_MAJOR}
|
||||
)
|
||||
|
||||
target_include_directories(ggml-base PRIVATE .)
|
||||
if (GGML_BACKEND_DL)
|
||||
target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
|
||||
@@ -220,6 +225,11 @@ add_library(ggml
|
||||
ggml-backend-reg.cpp)
|
||||
add_library(ggml::ggml ALIAS ggml)
|
||||
|
||||
set_target_properties(ggml PROPERTIES
|
||||
VERSION ${GGML_VERSION}
|
||||
SOVERSION ${GGML_VERSION_MAJOR}
|
||||
)
|
||||
|
||||
if (GGML_BACKEND_DIR)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
|
||||
@@ -259,6 +269,12 @@ function(ggml_add_backend_library backend)
|
||||
target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
|
||||
endif()
|
||||
|
||||
# Set versioning properties for all backend libraries
|
||||
set_target_properties(${backend} PROPERTIES
|
||||
VERSION ${GGML_VERSION}
|
||||
SOVERSION ${GGML_VERSION_MAJOR}
|
||||
)
|
||||
|
||||
if(NOT GGML_AVAILABLE_BACKENDS)
|
||||
set(GGML_AVAILABLE_BACKENDS "${backend}"
|
||||
CACHE INTERNAL "List of backends for cmake package")
|
||||
|
||||
@@ -126,25 +126,36 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
)
|
||||
if (NOT ARM_MCPU_RESULT)
|
||||
string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
|
||||
string(REGEX MATCH "-march=[^ ']+" ARM_MARCH_FLAG "${ARM_MCPU}")
|
||||
|
||||
# on some old GCC we need to read -march=
|
||||
if (ARM_MARCH_FLAG AND NOT "${ARM_MARCH_FLAG}" STREQUAL "-march=native")
|
||||
set(ARM_NATIVE_FLAG "${ARM_MARCH_FLAG}")
|
||||
elseif(ARM_MCPU_FLAG AND NOT "${ARM_MCPU_FLAG}" STREQUAL "-mcpu=native")
|
||||
set(ARM_NATIVE_FLAG "${ARM_MCPU_FLAG}")
|
||||
endif()
|
||||
endif()
|
||||
if ("${ARM_MCPU_FLAG}" STREQUAL "")
|
||||
set(ARM_MCPU_FLAG -mcpu=native)
|
||||
message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
|
||||
|
||||
if ("${ARM_NATIVE_FLAG}" STREQUAL "")
|
||||
set(ARM_NATIVE_FLAG -mcpu=native)
|
||||
message(WARNING "ARM -march/-mcpu not found, -mcpu=native will be used")
|
||||
else()
|
||||
message(STATUS "ARM detected flags: ${ARM_NATIVE_FLAG}")
|
||||
endif()
|
||||
|
||||
include(CheckCXXSourceRuns)
|
||||
|
||||
function(check_arm_feature tag code)
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+${tag}")
|
||||
check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
|
||||
if (GGML_MACHINE_SUPPORTS_${tag})
|
||||
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
|
||||
set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}" PARENT_SCOPE)
|
||||
else()
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}")
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+no${tag}")
|
||||
check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
|
||||
if (GGML_MACHINE_SUPPORTS_no${tag})
|
||||
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
|
||||
set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}" PARENT_SCOPE)
|
||||
endif()
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
@@ -155,7 +166,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
|
||||
check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
|
||||
|
||||
list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
|
||||
list(APPEND ARCH_FLAGS "${ARM_NATIVE_FLAG}${ARM_NATIVE_FLAG_FIX}")
|
||||
else()
|
||||
if (GGML_CPU_ARM_ARCH)
|
||||
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
|
||||
|
||||
@@ -2044,6 +2044,26 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
}
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
static inline svuint32_t ggml_decode_q4scales_and_mins_for_mmla(const uint32_t * vx_scales) {
|
||||
const svbool_t pg_all = svptrue_pat_b32(SV_VL4);
|
||||
const svbool_t pg_false = svpfalse_b(); // 0x0000
|
||||
const svbool_t pg_lo_8 = svwhilelt_b8_s32(0, 8); // 0x00ff
|
||||
const svbool_t pg_odd = svzip1_b32(pg_false, pg_lo_8);
|
||||
|
||||
svuint32_t vutmp_hi, vutmp_lo;
|
||||
svuint32_t vx01 = svld1_u32(pg_lo_8, vx_scales);
|
||||
vutmp_hi = svzip1_u32(vx01, vx01);
|
||||
vutmp_hi = svlsr_n_u32_m(pg_odd, vutmp_hi, 2);
|
||||
vutmp_hi = svreinterpret_u32_u64(svand_n_u64_x(pg_all, svreinterpret_u64_u32(vutmp_hi), UINT64_C(0x303030303f3f3f3f)));
|
||||
const svuint32_t vx2 = svdup_u32(vx_scales[2]);
|
||||
vutmp_lo = svlsr_u32_x(pg_all, vx2, svreinterpret_u32_s32(svindex_s32(-2, 2)));
|
||||
vutmp_lo = svand_n_u32_z(pg_odd, vutmp_lo, UINT32_C(0x0f0f0f0f));
|
||||
svuint32_t vutmp = svorr_u32_z(pg_all, vutmp_hi, vutmp_lo);
|
||||
return vutmp;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
#ifdef __ARM_FEATURE_MATMUL_INT8
|
||||
@@ -2066,8 +2086,220 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
static const uint32_t kmask3 = 0x03030303;
|
||||
|
||||
uint32_t utmp[4];
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
svbool_t pg32_2 = svptrue_pat_b32(SV_VL2);
|
||||
|
||||
const block_q4_K * GGML_RESTRICT vx0 = vx;
|
||||
const block_q8_K * GGML_RESTRICT vy0 = vy;
|
||||
const block_q4_K * GGML_RESTRICT vx1 = (const block_q4_K *) ((const uint8_t*)vx + bx);
|
||||
const block_q8_K * GGML_RESTRICT vy1 = (const block_q8_K *) ((const uint8_t*)vy + by);
|
||||
|
||||
union {
|
||||
uint32_t u32[8];
|
||||
uint64_t u64[4];
|
||||
} new_utmp;
|
||||
|
||||
svfloat32_t sumf1 = svdup_n_f32(0);
|
||||
|
||||
switch (vector_length) {
|
||||
case 128:
|
||||
{
|
||||
svbool_t pg_false = svpfalse_b();
|
||||
svbool_t pg_lo_8 = svwhilelt_b8_s32(0, 8);
|
||||
svbool_t vmins_mask1= svzip1_b32(pg_lo_8, pg_false);
|
||||
svbool_t vmins_mask2 = svzip1_b32(pg_false, pg_lo_8);
|
||||
svbool_t pg128_all = svptrue_pat_b8(SV_VL16);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
svfloat32_t vy_d = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
|
||||
svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
|
||||
svfloat32_t svsuper_block_scales = svmul_f32_x(pg128_all, vy_d, vx_d);
|
||||
svfloat32_t vx_dmins = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].dmin)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].dmin)));
|
||||
svfloat32_t vy_dmins = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
|
||||
svfloat32_t svdmins = svmul_n_f32_x(pg128_all, svmul_f32_x(pg128_all, vy_dmins, vx_dmins), -1);
|
||||
const uint8_t * GGML_RESTRICT q4_0 = vx0[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8_0 = vy0[i].qs;
|
||||
const uint8_t * GGML_RESTRICT q4_1 = vx1[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8_1 = vy1[i].qs;
|
||||
svint16_t lo = svld1_s16(pg128_all, vy0[i].bsums + 0);
|
||||
svint16_t hi = svld1_s16(pg128_all, vy0[i].bsums + 8);
|
||||
svint16_t sum_tmp1 = svuzp1_s16(lo, hi);
|
||||
svint16_t sum_tmp2 = svuzp2_s16(lo, hi);
|
||||
svint16_t svq8sums_0 = svadd_s16_x(pg128_all, sum_tmp1, sum_tmp2);
|
||||
lo = svld1_s16(pg128_all, vy1[i].bsums + 0);
|
||||
hi = svld1_s16(pg128_all, vy1[i].bsums + 8);
|
||||
sum_tmp1 = svuzp1(lo, hi);
|
||||
sum_tmp2 = svuzp2(lo, hi);
|
||||
svint16_t svq8sums_1 = svadd_s16_x(pg128_all, sum_tmp1, sum_tmp2);
|
||||
svuint32_t decoded_scales0 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx0[i].scales);
|
||||
svuint32_t decoded_scales1 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx1[i].scales);
|
||||
svuint32x2_t decoded_scales = svcreate2_u32(decoded_scales0, decoded_scales1);
|
||||
svst2_u32(pg128_all, new_utmp.u32, decoded_scales);
|
||||
svint16_t svmins8_0 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u32(svuzp1_u32(svld1_u32(vmins_mask1, new_utmp.u32+4), svdup_n_u32(0)))));
|
||||
svint16_t svmins8_1 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u32(svuzp2_u32(svld1_u32(vmins_mask2, new_utmp.u32+4), svdup_n_u32(0)))));
|
||||
svint32_t svsumfs_tmp1 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_0, svmins8_0));
|
||||
svint32_t svsumfs_tmp2 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_0, svmins8_1));
|
||||
svint32_t svsumfs_tmp3 = svtrn1_s32(svsumfs_tmp1, svsumfs_tmp2);
|
||||
svint32_t svsumfs_tmp4 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_1, svmins8_0));
|
||||
svint32_t svsumfs_tmp5 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_1, svmins8_1));
|
||||
svint32_t svsumfs_tmp6 = svtrn1_s32(svsumfs_tmp4, svsumfs_tmp5);
|
||||
svint32_t svsumfs_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(svsumfs_tmp3), svreinterpret_s64_s32(svsumfs_tmp6)));
|
||||
svint32_t svsumfs_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(svsumfs_tmp3), svreinterpret_s64_s32(svsumfs_tmp6)));
|
||||
svint32_t svsumfs_tmp = svadd_s32_x(pg128_all, svsumfs_tmp7, svsumfs_tmp8);
|
||||
svint32_t svscales, sumi1, sumi2;
|
||||
svint32_t acc_sumif1 = svdup_n_s32(0);
|
||||
svint32_t acc_sumif2 = svdup_n_s32(0);
|
||||
svint8_t q4bytes_0_l, q4bytes_0_h, q4bytes_1_l, q4bytes_1_h, l0, l1, l2, l3,
|
||||
q8bytes_0_h, q8bytes_0_l, q8bytes_1_h, q8bytes_1_l, r0, r1, r2, r3;
|
||||
#pragma GCC unroll 1
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
q4bytes_0_l = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0), 0xf));
|
||||
q4bytes_1_l = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1), 0xf));
|
||||
q4bytes_0_h = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0+16), 0xf));
|
||||
q4bytes_1_h = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1+16), 0xf));
|
||||
l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
|
||||
l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
|
||||
l2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
|
||||
l3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
|
||||
q8bytes_0_h = svld1_s8(pg128_all, q8_0);
|
||||
q8bytes_1_h = svld1_s8(pg128_all, q8_1);
|
||||
q8bytes_0_l = svld1_s8(pg128_all, q8_0+16);
|
||||
q8bytes_1_l = svld1_s8(pg128_all, q8_1+16);
|
||||
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
|
||||
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
|
||||
r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
|
||||
r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
|
||||
sumi1 = svmmla_s32(svmmla_s32(svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), r2, l2), r3, l3);
|
||||
svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg128_all, svlsl_n_u32_x(pg128_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-1)), 24));
|
||||
acc_sumif1 = svmla_s32_x(pg128_all, acc_sumif1, svscales, sumi1);
|
||||
|
||||
q4bytes_0_l = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0), 4));
|
||||
q4bytes_1_l = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1), 4));
|
||||
q4bytes_0_h = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0+16), 4));
|
||||
q4bytes_1_h = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1+16), 4));
|
||||
l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
|
||||
l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
|
||||
l2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
|
||||
l3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
|
||||
q8bytes_0_h = svld1_s8(pg128_all, q8_0+32);
|
||||
q8bytes_1_h = svld1_s8(pg128_all, q8_1+32);
|
||||
q8bytes_0_l = svld1_s8(pg128_all, q8_0+48);
|
||||
q8bytes_1_l = svld1_s8(pg128_all, q8_1+48);
|
||||
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
|
||||
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
|
||||
r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
|
||||
r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
|
||||
sumi2 = svmmla_s32(svmmla_s32(svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), r2, l2), r3, l3);
|
||||
svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg128_all, svlsl_n_u32_x(pg128_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-2)), 24));
|
||||
acc_sumif2 = svmla_s32_x(pg128_all, acc_sumif2, svscales, sumi2);
|
||||
q4_0 += 32; q4_1 += 32; q8_0 += 64; q8_1 += 64;
|
||||
}
|
||||
sumf1 = svmla_f32_x(pg128_all,
|
||||
svmla_f32_x(pg128_all,
|
||||
sumf1,
|
||||
svcvt_f32_x(pg128_all,
|
||||
svadd_s32_x(pg128_all, acc_sumif1, acc_sumif2)),
|
||||
svsuper_block_scales),
|
||||
svdmins,
|
||||
svcvt_f32_s32_x(pg128_all, svsumfs_tmp));
|
||||
} //end of for nb
|
||||
} // end of case 128
|
||||
break;
|
||||
case 256:
|
||||
case 512:
|
||||
{
|
||||
const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
|
||||
const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16);
|
||||
const svbool_t pg256_all = svptrue_pat_b8(SV_ALL);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4_0 = vx0[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8_0 = vy0[i].qs;
|
||||
const uint8_t * GGML_RESTRICT q4_1 = vx1[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8_1 = vy1[i].qs;
|
||||
svint32_t svscales, sumi1, sumi2;
|
||||
svint32_t acc_sumif1 = svdup_n_s32(0);
|
||||
svint32_t acc_sumif2 = svdup_n_s32(0);
|
||||
svint8_t l0, l1, l2, l3, r0, r1, r2, r3;
|
||||
svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
|
||||
svfloat64_t vy_d_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
|
||||
svfloat32_t vy_d = svreinterpret_f32_f64(svuzp1_f64(vy_d_tmp, vy_d_tmp));
|
||||
svfloat32_t svsuper_block_scales = svmul_f32_z(pg32_4, vy_d, vx_d);
|
||||
svfloat32_t vx_dmins = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].dmin)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].dmin)));
|
||||
svfloat64_t vy_dmins_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
|
||||
svfloat32_t vy_dmins = svreinterpret_f32_f64(svuzp1_f64(vy_dmins_tmp, vy_dmins_tmp));
|
||||
svfloat32_t svdmins = svmul_n_f32_x(pg32_4, svmul_f32_x(pg32_4, vx_dmins, vy_dmins), -1);
|
||||
svint16_t rc1 = svuzp1_s16(svld1_s16(pg256_all, vy0[i].bsums), svld1_s16(pg256_all, vy1[i].bsums));
|
||||
svint16_t rc2 = svuzp2_s16(svld1_s16(pg256_all, vy0[i].bsums), svld1_s16(pg256_all, vy1[i].bsums));
|
||||
svint16_t svq8sums = svadd_s16_x(pg256_all, rc1, rc2);
|
||||
svuint32_t decoded_scales0 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx0[i].scales);
|
||||
svuint32_t decoded_scales1 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx1[i].scales);
|
||||
svuint32x2_t decoded_scales = svcreate2_u32(decoded_scales0, decoded_scales1);
|
||||
svst2_u32(pg8_16, new_utmp.u32, decoded_scales);
|
||||
svint16_t new_svq8sums_0 = svreinterpret_s16_u64(svtrn1_u64(svreinterpret_u64_s16(svq8sums), svreinterpret_u64_s16(svq8sums)));
|
||||
svint16_t new_svq8sums_1 = svreinterpret_s16_u64(svtrn2_u64(svreinterpret_u64_s16(svq8sums), svreinterpret_u64_s16(svq8sums)));
|
||||
svuint64_t new_mins_0 = svdup_u64(new_utmp.u64[2]);
|
||||
svuint64_t new_mins_1 = svdup_u64(new_utmp.u64[3]);
|
||||
svint16_t new_svmins8_0 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u64(new_mins_0)));
|
||||
svint16_t new_svmins8_1 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u64(new_mins_1)));
|
||||
svint64_t dot_prod_0 = svdot_s64(svdup_s64(0), new_svmins8_0, new_svq8sums_0);
|
||||
svint64_t dot_prod_1 = svdot_s64(dot_prod_0, new_svmins8_1, new_svq8sums_1);
|
||||
svfloat32_t converted_dot_prod_1 = svcvt_f32_s64_x(pg256_all, dot_prod_1);
|
||||
svfloat32_t svsumfs_tmp = svuzp1_f32(converted_dot_prod_1, converted_dot_prod_1);
|
||||
|
||||
#pragma GCC unroll 1
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
svuint8_t q4bytes_0 = svand_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_0), 0xf);
|
||||
svuint8_t q4bytes_1 = svand_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_1), 0xf);
|
||||
svuint8_t q4bytes_2 = svlsr_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_0), 4);
|
||||
svuint8_t q4bytes_3 = svlsr_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_1), 4);
|
||||
l0 = svreinterpret_s8_u64(svzip1_u64(svreinterpret_u64_u8(q4bytes_0), svreinterpret_u64_u8(q4bytes_1)));
|
||||
l1 = svreinterpret_s8_u64(svzip2_u64(svreinterpret_u64_u8(q4bytes_0), svreinterpret_u64_u8(q4bytes_1)));
|
||||
l2 = svreinterpret_s8_u64(svzip1_u64(svreinterpret_u64_u8(q4bytes_2), svreinterpret_u64_u8(q4bytes_3)));
|
||||
l3 = svreinterpret_s8_u64(svzip2_u64(svreinterpret_u64_u8(q4bytes_2), svreinterpret_u64_u8(q4bytes_3)));
|
||||
svint8_t q8bytes_0 = svld1_s8(pg256_all, q8_0);
|
||||
svint8_t q8bytes_1 = svld1_s8(pg256_all, q8_1);
|
||||
svint8_t q8bytes_2 = svld1_s8(pg256_all, q8_0+32);
|
||||
svint8_t q8bytes_3 = svld1_s8(pg256_all, q8_1+32);
|
||||
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
|
||||
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
|
||||
r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_2), svreinterpret_s64_s8(q8bytes_3)));
|
||||
r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_2), svreinterpret_s64_s8(q8bytes_3)));
|
||||
sumi1 = svmmla(svmmla(svdup_n_s32(0), r0, l0), r1, l1);
|
||||
svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg256_all, svlsl_n_u32_x(pg256_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-1)), 24));
|
||||
acc_sumif1 = svmla_s32_x(pg256_all, acc_sumif1, svscales, sumi1);
|
||||
sumi2 = svmmla(svmmla(svdup_n_s32(0), r2, l2), r3, l3);
|
||||
svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg256_all, svlsl_n_u32_x(pg256_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-2)), 24));
|
||||
acc_sumif2 = svmla_s32_x(pg256_all, acc_sumif2, svscales, sumi2);
|
||||
q4_0 += 32; q4_1 += 32; q8_0 += 64; q8_1 += 64;
|
||||
}
|
||||
svint32_t acc_sumif = svadd_s32_x(pg256_all, acc_sumif1, acc_sumif2);
|
||||
svint32_t swap_acc_sumif = svext_s32(acc_sumif, acc_sumif, 4);
|
||||
acc_sumif = svadd_s32_x(pg32_4, acc_sumif, swap_acc_sumif);
|
||||
sumf1 = svmla_f32_x(pg32_4,
|
||||
svmla_f32_x(pg32_4,
|
||||
sumf1,
|
||||
svcvt_f32_x(pg32_4, acc_sumif),
|
||||
svsuper_block_scales),
|
||||
svdmins,
|
||||
svsumfs_tmp);
|
||||
} // end of for nb
|
||||
} // end of case 256-512
|
||||
break;
|
||||
default:
|
||||
assert(false && "Unsupported vector length");
|
||||
break;
|
||||
}
|
||||
|
||||
svst1_f32(pg32_2, s, sumf1);
|
||||
svst1_f32(pg32_2, s + bs, svreinterpret_f32_u8(svext_u8(svreinterpret_u8_f32(sumf1), svdup_n_u8(0), 8)));
|
||||
|
||||
return;
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q4_K * GGML_RESTRICT x0 = x;
|
||||
const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
|
||||
@@ -2235,7 +2467,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
|
||||
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
||||
const svuint8_t m4b = svdup_n_u8(0xf);
|
||||
const svint32_t mzero = svdup_n_s32(0);
|
||||
svint32_t sumi1 = svdup_n_s32(0);
|
||||
@@ -2480,7 +2711,201 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const svbool_t pg32_2 = svptrue_pat_b32(SV_VL2);
|
||||
|
||||
svfloat32_t sum = svdup_n_f32(0);
|
||||
|
||||
const block_q6_K * GGML_RESTRICT vx0 = vx;
|
||||
const block_q8_K * GGML_RESTRICT vy0 = vy;
|
||||
const block_q6_K * GGML_RESTRICT vx1 = (const block_q6_K *) ((const uint8_t*)vx + bx);
|
||||
const block_q8_K * GGML_RESTRICT vy1 = (const block_q8_K *) ((const uint8_t*)vy + by);
|
||||
|
||||
switch (vector_length) {
|
||||
case 128:
|
||||
{
|
||||
const svbool_t pg128_all = svptrue_pat_b8(SV_ALL);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT ql0 = vx0[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh0 = vx0[i].qh;
|
||||
const uint8_t * GGML_RESTRICT ql1 = vx1[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh1 = vx1[i].qh;
|
||||
const int8_t * GGML_RESTRICT q80 = vy0[i].qs;
|
||||
const int8_t * GGML_RESTRICT q81 = vy1[i].qs;
|
||||
|
||||
const int8_t * GGML_RESTRICT scale0 = vx0[i].scales;
|
||||
const int8_t * GGML_RESTRICT scale1 = vx1[i].scales;
|
||||
|
||||
svfloat32_t vy_d = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
|
||||
svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
|
||||
svfloat32_t svsuper_block_scales = svmul_f32_x(pg128_all, vy_d, vx_d);
|
||||
// process q8sum summation 128 bit route
|
||||
const svint16_t q8sums_01 = svld1_s16(pg128_all, vy0[i].bsums);
|
||||
const svint16_t q8sums_02 = svld1_s16(pg128_all, vy0[i].bsums + 8);
|
||||
const svint16_t q8sums_11 = svld1_s16(pg128_all, vy1[i].bsums);
|
||||
const svint16_t q8sums_12 = svld1_s16(pg128_all, vy1[i].bsums + 8);
|
||||
const svint64x2_t q6scales_0_tmp = svld2_s64(pg128_all, (const int64_t *)scale0);
|
||||
const svint16_t q6scales_01 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_0_tmp, 0)));
|
||||
const svint16_t q6scales_02 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_0_tmp, 1)));
|
||||
const svint64x2_t q6scales_1_tmp = svld2_s64(pg128_all, (const int64_t *)scale1);
|
||||
const svint16_t q6scales_11 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_1_tmp, 0)));
|
||||
const svint16_t q6scales_12 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_1_tmp, 1)));
|
||||
const svint64_t prod = svdup_n_s64(0);
|
||||
|
||||
svint32_t isum_tmp1 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_01, q6scales_01), q8sums_02, q6scales_02));
|
||||
svint32_t isum_tmp2 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_01, q6scales_11), q8sums_02, q6scales_12));
|
||||
svint32_t isum_tmp3 = svtrn1_s32(isum_tmp1, isum_tmp2);
|
||||
svint32_t isum_tmp4 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_11, q6scales_01), q8sums_12, q6scales_02));
|
||||
svint32_t isum_tmp5 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_11, q6scales_11), q8sums_12, q6scales_12));
|
||||
svint32_t isum_tmp6 = svtrn1_s32(isum_tmp4, isum_tmp5);
|
||||
svint32_t isum_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(isum_tmp3), svreinterpret_s64_s32(isum_tmp6)));
|
||||
svint32_t isum_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(isum_tmp3), svreinterpret_s64_s32(isum_tmp6)));
|
||||
svint32_t svisum_mins = svadd_s32_x(pg128_all, isum_tmp7, isum_tmp8);
|
||||
|
||||
// process mmla
|
||||
svint8_t l0, l1, r0, r1;
|
||||
svint32_t isum_tmp = svdup_n_s32(0);
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
svuint8_t qhbits_0 = svld1_u8(pg128_all, qh0+16*(k%2));
|
||||
svuint8_t qhbits_1 = svld1_u8(pg128_all, qh1+16*(k%2));
|
||||
svuint8_t q6bits_0 = svld1_u8(pg128_all, ql0+16*(k%4));
|
||||
svuint8_t q6bits_1 = svld1_u8(pg128_all, ql1+16*(k%4));
|
||||
const int ql_pos = (k/4)*4;
|
||||
svuint8_t q6bytes_0_lo = (ql_pos < 4) ? svand_n_u8_x(pg128_all, q6bits_0, 0xf) : svlsr_n_u8_x(pg128_all, q6bits_0, 4);
|
||||
svuint8_t q6bytes_1_lo = (ql_pos < 4) ? svand_n_u8_x(pg128_all, q6bits_1, 0xf) : svlsr_n_u8_x(pg128_all, q6bits_1, 4);
|
||||
const int qh_pos = (k/2)*2;
|
||||
svuint8_t q6bytes_0_hi = svand_n_u8_x(pg128_all, qhbits_0, 0x3 << qh_pos);
|
||||
svuint8_t q6bytes_1_hi = svand_n_u8_x(pg128_all, qhbits_1, 0x3 << qh_pos);
|
||||
svint8_t q6bytes_0, q6bytes_1;
|
||||
if (qh_pos <= 4) {
|
||||
q6bytes_0 = svreinterpret_s8_u8(svmla_n_u8_x(pg128_all, q6bytes_0_lo, q6bytes_0_hi, 1 << (4 - qh_pos)));
|
||||
q6bytes_1 = svreinterpret_s8_u8(svmla_n_u8_x(pg128_all, q6bytes_1_lo, q6bytes_1_hi, 1 << (4 - qh_pos)));
|
||||
} else {
|
||||
q6bytes_0 = svreinterpret_s8_u8(svorr_u8_x(pg128_all, q6bytes_0_lo, svlsr_n_u8_x(pg128_all, q6bytes_0_hi, (qh_pos - 4))));
|
||||
q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg128_all, q6bytes_1_lo, svlsr_n_u8_x(pg128_all, q6bytes_1_hi, (qh_pos - 4))));
|
||||
}
|
||||
svint8_t q8bytes_0 = svld1_s8(pg128_all, q80+16*(k%8));
|
||||
svint8_t q8bytes_1 = svld1_s8(pg128_all, q81+16*(k%8));
|
||||
l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
|
||||
l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
|
||||
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
|
||||
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
|
||||
svint32_t svscale = svzip1_s32(svdup_n_s32(scale0[k]), svdup_n_s32(scale1[k]));
|
||||
isum_tmp = svmla_s32_x(pg128_all, isum_tmp, svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), svscale);
|
||||
}
|
||||
qh0 += 32; qh1 += 32;
|
||||
ql0 += 64; ql1 += 64;
|
||||
q80 += 128; q81 += 128;
|
||||
scale0 += 8; scale1 += 8;
|
||||
}
|
||||
sum = svmla_f32_x(pg128_all, sum,
|
||||
svcvt_f32_x(pg128_all, svmla_s32_x(pg128_all, isum_tmp,
|
||||
svisum_mins, svdup_n_s32(-32))),
|
||||
svsuper_block_scales);
|
||||
}
|
||||
} // end of case 128
|
||||
break;
|
||||
case 256:
|
||||
case 512:
|
||||
{
|
||||
const svbool_t pg256_all = svptrue_pat_b8(SV_ALL);
|
||||
const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT ql0 = vx0[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh0 = vx0[i].qh;
|
||||
const uint8_t * GGML_RESTRICT ql1 = vx1[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh1 = vx1[i].qh;
|
||||
const int8_t * GGML_RESTRICT q80 = vy0[i].qs;
|
||||
const int8_t * GGML_RESTRICT q81 = vy1[i].qs;
|
||||
|
||||
const int8_t * GGML_RESTRICT scale0 = vx0[i].scales;
|
||||
const int8_t * GGML_RESTRICT scale1 = vx1[i].scales;
|
||||
svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
|
||||
svfloat64_t vy_d_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
|
||||
svfloat32_t vy_d = svreinterpret_f32_f64(svuzp1_f64(vy_d_tmp, vy_d_tmp));
|
||||
svfloat32_t svsuper_block_scales = svmul_f32_x(pg32_4, vy_d, vx_d);
|
||||
// process q8sum summation 256 bit route
|
||||
const svint16_t q8sums_0 = svld1_s16(pg256_all, vy0[i].bsums);
|
||||
const svint16_t q8sums_1 = svld1_s16(pg256_all, vy1[i].bsums);
|
||||
const svint16_t q6scales_0 = svunpklo_s16(svld1_s8(pg256_all, scale0));
|
||||
const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(pg256_all, scale1));
|
||||
const svint64_t prod = svdup_n_s64(0);
|
||||
svint32_t isum_tmp1 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_0, q6scales_0));
|
||||
svint32_t isum_tmp2 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_0, q6scales_1));
|
||||
svint32_t isum_tmp3 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_1, q6scales_0));
|
||||
svint32_t isum_tmp4 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_1, q6scales_1));
|
||||
svint32_t isum_tmp5 = svtrn1_s32(isum_tmp1, isum_tmp2);
|
||||
svint32_t isum_tmp6 = svtrn1_s32(isum_tmp3, isum_tmp4);
|
||||
svint32_t isum_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(isum_tmp5), svreinterpret_s64_s32(isum_tmp6)));
|
||||
svint32_t isum_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(isum_tmp5), svreinterpret_s64_s32(isum_tmp6)));
|
||||
svint32_t isum_tmp9 = svadd_s32_x(pg256_all, isum_tmp7, isum_tmp8);
|
||||
svint32_t isum_tmp10 = svreinterpret_s32_u8(svext_u8(svreinterpret_u8_s32(isum_tmp9), svreinterpret_u8_s32(isum_tmp9), 16));
|
||||
svint32_t svisum_mins = svadd_s32_z(pg32_4, isum_tmp9, isum_tmp10);
|
||||
|
||||
// process mmla
|
||||
svint8_t l0, l1, r0, r1;
|
||||
svint32_t isum_tmp = svdup_n_s32(0);
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
for (int k = 0; k < 8; k+=2) { // process 2 block
|
||||
svuint8_t qhbits_0 = svld1_u8(pg256_all, qh0);
|
||||
svuint8_t qhbits_1 = svld1_u8(pg256_all, qh1);
|
||||
svuint8_t q6bits_0 = svld1_u8(pg256_all, ql0+32*((k%4)/2));
|
||||
svuint8_t q6bits_1 = svld1_u8(pg256_all, ql1+32*((k%4)/2));
|
||||
const int ql_pos = (k/4)*4;
|
||||
svuint8_t q6bytes_0_lo = (ql_pos < 4) ? svand_n_u8_x(pg256_all, q6bits_0, 0xf) : svlsr_n_u8_x(pg256_all, q6bits_0, 4);
|
||||
svuint8_t q6bytes_1_lo = (ql_pos < 4) ? svand_n_u8_x(pg256_all, q6bits_1, 0xf) : svlsr_n_u8_x(pg256_all, q6bits_1, 4);
|
||||
const int qh_pos = (k/2)*2;
|
||||
svuint8_t q6bytes_0_hi = svand_n_u8_x(pg256_all, qhbits_0, 0x3 << qh_pos);
|
||||
svuint8_t q6bytes_1_hi = svand_n_u8_x(pg256_all, qhbits_1, 0x3 << qh_pos);
|
||||
svint8_t q6bytes_0, q6bytes_1;
|
||||
if (qh_pos <= 4) {
|
||||
q6bytes_0 = svreinterpret_s8_u8(svmla_n_u8_x(pg256_all, q6bytes_0_lo, q6bytes_0_hi, 1 << (4 - qh_pos)));
|
||||
q6bytes_1 = svreinterpret_s8_u8(svmla_n_u8_x(pg256_all, q6bytes_1_lo, q6bytes_1_hi, 1 << (4 - qh_pos)));
|
||||
} else {
|
||||
q6bytes_0 = svreinterpret_s8_u8(svorr_u8_x(pg256_all, q6bytes_0_lo, svlsr_n_u8_x(pg256_all, q6bytes_0_hi, (qh_pos - 4))));
|
||||
q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg256_all, q6bytes_1_lo, svlsr_n_u8_x(pg256_all, q6bytes_1_hi, (qh_pos - 4))));
|
||||
}
|
||||
svint8_t q8bytes_0 = svld1_s8(pg256_all, q80+32*(k/2));
|
||||
svint8_t q8bytes_1 = svld1_s8(pg256_all, q81+32*(k/2));
|
||||
l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
|
||||
l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
|
||||
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
|
||||
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
|
||||
svint32_t svscale0 = svzip1_s32(svdup_n_s32(scale0[k]), svdup_n_s32(scale1[k]));
|
||||
svint32_t svscale1 = svzip1_s32(svdup_n_s32(scale0[k+1]), svdup_n_s32(scale1[k+1]));
|
||||
isum_tmp = svmla_s32_x(pg256_all, isum_tmp, svmmla_s32(svdup_n_s32(0), r0, l0), svscale0);
|
||||
isum_tmp = svmla_s32_x(pg256_all, isum_tmp, svmmla_s32(svdup_n_s32(0), r1, l1), svscale1);
|
||||
}
|
||||
qh0 += 32; qh1 += 32;
|
||||
ql0 += 64; ql1 += 64;
|
||||
q80 += 128; q81 += 128;
|
||||
scale0 += 8; scale1 += 8;
|
||||
} // end of for
|
||||
svint32_t swap_isum_tmp = svext_s32(isum_tmp, isum_tmp, 4);
|
||||
isum_tmp = svadd_s32_x(pg32_4, isum_tmp, swap_isum_tmp);
|
||||
sum = svmla_f32_x(pg32_4, sum,
|
||||
svcvt_f32_x(pg32_4, svmla_s32_x(pg32_4, isum_tmp,
|
||||
svisum_mins, svdup_n_s32(-32))),
|
||||
svsuper_block_scales);
|
||||
}
|
||||
} // end of case 256
|
||||
break;
|
||||
default:
|
||||
assert(false && "Unsupported vector length");
|
||||
break;
|
||||
} // end of switch
|
||||
|
||||
svst1_f32(pg32_2, s, sum);
|
||||
svst1_f32(pg32_2, s + bs, svreinterpret_f32_u8(svext_u8(svreinterpret_u8_f32(sum), svdup_n_u8(0), 8)));
|
||||
|
||||
return;
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q6_K * GGML_RESTRICT x0 = x;
|
||||
const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
|
||||
@@ -2594,27 +3019,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
// adjust bias, apply superblock scale
|
||||
{
|
||||
int32_t bias[4];
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
|
||||
const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
|
||||
const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
|
||||
const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
|
||||
const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
|
||||
const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
|
||||
const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
|
||||
const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
|
||||
const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
|
||||
const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
|
||||
const svint64_t zero = svdup_n_s64(0);
|
||||
bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
|
||||
svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
|
||||
bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
|
||||
svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
|
||||
bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
|
||||
svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
|
||||
bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
|
||||
svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
|
||||
#else
|
||||
// NEON doesn't support int16 dot product, fallback to separated mul and add
|
||||
const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
|
||||
const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
|
||||
@@ -2646,7 +3050,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
|
||||
bias[3] = vaddvq_s32(prod);
|
||||
|
||||
#endif
|
||||
const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
|
||||
|
||||
const float32x4_t superblock_scale = {
|
||||
@@ -2672,7 +3075,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
||||
float sum = 0;
|
||||
svuint8_t m4b = svdup_n_u8(0xf);
|
||||
svint32_t vzero = svdup_n_s32(0);
|
||||
|
||||
@@ -1807,22 +1807,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_cont(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_RESHAPE:
|
||||
{
|
||||
ggml_compute_forward_reshape(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_VIEW:
|
||||
{
|
||||
ggml_compute_forward_view(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_PERMUTE:
|
||||
{
|
||||
ggml_compute_forward_permute(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_TRANSPOSE:
|
||||
{
|
||||
ggml_compute_forward_transpose(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
ggml_compute_forward_get_rows(params, tensor);
|
||||
@@ -2042,6 +2026,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
// nop
|
||||
} break;
|
||||
case GGML_OP_RESHAPE:
|
||||
{
|
||||
// nop
|
||||
} break;
|
||||
case GGML_OP_PERMUTE:
|
||||
{
|
||||
// nop
|
||||
} break;
|
||||
case GGML_OP_VIEW:
|
||||
{
|
||||
// nop
|
||||
} break;
|
||||
case GGML_OP_TRANSPOSE:
|
||||
{
|
||||
// nop
|
||||
} break;
|
||||
case GGML_OP_COUNT:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -2884,6 +2884,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
|
||||
if (ggml_op_is_empty(node->op)) {
|
||||
// skip NOPs
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
if (state->ith == 0 && cplan->abort_callback &&
|
||||
|
||||
@@ -4455,46 +4455,6 @@ void ggml_compute_forward_cont(
|
||||
ggml_compute_forward_dup(params, dst);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_reshape
|
||||
|
||||
void ggml_compute_forward_reshape(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
// NOP
|
||||
GGML_UNUSED(params);
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_view
|
||||
|
||||
void ggml_compute_forward_view(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
// NOP
|
||||
GGML_UNUSED(params);
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_permute
|
||||
|
||||
void ggml_compute_forward_permute(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
// NOP
|
||||
GGML_UNUSED(params);
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_transpose
|
||||
|
||||
void ggml_compute_forward_transpose(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
// NOP
|
||||
GGML_UNUSED(params);
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_get_rows
|
||||
|
||||
static void ggml_compute_forward_get_rows_q(
|
||||
|
||||
@@ -51,10 +51,6 @@ void ggml_compute_forward_scale(const struct ggml_compute_params * params, struc
|
||||
void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
@@ -81,6 +81,70 @@ static __global__ void upscale_f32_bilinear(const float * x, float * dst,
|
||||
dst[index] = result;
|
||||
}
|
||||
|
||||
namespace bicubic_interpolation {
|
||||
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
|
||||
__device__ const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
|
||||
|
||||
static __device__ float weight1(float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
|
||||
static __device__ float weight2(float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
|
||||
|
||||
static __device__ float bicubic(float p0, float p1, float p2, float p3, float x) {
|
||||
const float w0 = weight2(x + 1);
|
||||
const float w1 = weight1(x + 0);
|
||||
const float w2 = weight1(1 - x);
|
||||
const float w3 = weight2(2 - x);
|
||||
return p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
|
||||
};
|
||||
} // namespace bicubic_interpolation
|
||||
|
||||
static __global__ void upscale_f32_bicubic(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne00_src, const int ne01_src,
|
||||
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
const float pixel_offset) {
|
||||
using bicubic_interpolation::bicubic;
|
||||
|
||||
const int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
|
||||
if (index >= dst_total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i10_dst = index % ne10_dst;
|
||||
const int i11_dst = (index / ne10_dst) % ne11_dst;
|
||||
const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
|
||||
const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
|
||||
|
||||
const int i02_src = (int)(i12_dst / sf2);
|
||||
const int i03_src = (int)(i13_dst / sf3);
|
||||
|
||||
const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
|
||||
const int y0_src = (int)floorf(y_src_f);
|
||||
const float dy = y_src_f - (float)y0_src;
|
||||
|
||||
const float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
|
||||
const int x0_src = (int)floorf(x_src_f);
|
||||
const float dx = x_src_f - (float)x0_src;
|
||||
|
||||
const char * x_base = (const char *)x + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03;
|
||||
|
||||
auto load = [=](int x_off, int y_off) -> float {
|
||||
int i00_src = max(0, min(x0_src + x_off, ne00_src - 1));
|
||||
int i01_src = max(0, min(y0_src + y_off, ne01_src - 1));
|
||||
return *(const float *)(x_base + (int64_t)i00_src * nb00 + (int64_t)i01_src * nb01);
|
||||
};
|
||||
|
||||
const float result = bicubic(
|
||||
bicubic(load(-1,-1), load(0,-1), load(1,-1), load(2,-1), dx),
|
||||
bicubic(load(-1, 0), load(0, 0), load(1, 0), load(2, 0), dx),
|
||||
bicubic(load(-1, 1), load(0, 1), load(1, 1), load(2, 1), dx),
|
||||
bicubic(load(-1, 2), load(0, 2), load(1, 2), load(2, 2), dx), dy);
|
||||
|
||||
dst[index] = result;
|
||||
}
|
||||
|
||||
static void upscale_f32_cuda(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
@@ -104,6 +168,18 @@ static void upscale_f32_bilinear_cuda(const float * x, float * dst,
|
||||
upscale_f32_bilinear<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
|
||||
}
|
||||
|
||||
static void upscale_f32_bicubic_cuda(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne00_src, const int ne01_src,
|
||||
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
const float pixel_offset, cudaStream_t stream) {
|
||||
const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
upscale_f32_bicubic<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
@@ -121,17 +197,22 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
float sf2 = (float)dst->ne[2]/src0->ne[2];
|
||||
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
||||
|
||||
float pixel_offset = 0.5f;
|
||||
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
||||
sf0 = dst->ne[0] > 1 && src0->ne[0] > 1 ? (float)(dst->ne[0] - 1) / (src0->ne[0] - 1) : sf0;
|
||||
sf1 = dst->ne[1] > 1 && src0->ne[1] > 1 ? (float)(dst->ne[1] - 1) / (src0->ne[1] - 1) : sf1;
|
||||
pixel_offset = 0.0f;
|
||||
}
|
||||
|
||||
if (mode == GGML_SCALE_MODE_NEAREST) {
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
|
||||
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
||||
float pixel_offset = 0.5f;
|
||||
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
||||
sf0 = dst->ne[0] > 1 && src0->ne[0] > 1 ? (float)(dst->ne[0] - 1) / (src0->ne[0] - 1) : sf0;
|
||||
sf1 = dst->ne[1] > 1 && src0->ne[1] > 1 ? (float)(dst->ne[1] - 1) / (src0->ne[1] - 1) : sf1;
|
||||
pixel_offset = 0.0f;
|
||||
}
|
||||
upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
sf0, sf1, sf2, sf3, pixel_offset, stream);
|
||||
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
|
||||
upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
sf0, sf1, sf2, sf3, pixel_offset, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -564,8 +564,10 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
||||
// TODO: try to update the tensor API kernels to at least match the simdgroup performance
|
||||
if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
|
||||
![[dev->mtl_device name] containsString:@"M5"] &&
|
||||
![[dev->mtl_device name] containsString:@"M6"]) {
|
||||
GGML_LOG_WARN("%s: tensor API disabled for pre-M5 device\n", __func__);
|
||||
![[dev->mtl_device name] containsString:@"M6"] &&
|
||||
![[dev->mtl_device name] containsString:@"A19"] &&
|
||||
![[dev->mtl_device name] containsString:@"A20"]) {
|
||||
GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
|
||||
dev->props.has_tensor = false;
|
||||
}
|
||||
|
||||
|
||||
@@ -1036,6 +1036,11 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
nth = std::min(nth, nk0);
|
||||
|
||||
if (nth*nrptg > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
|
||||
nrptg = 1;
|
||||
}
|
||||
|
||||
ggml_metal_kargs_set_rows args = {
|
||||
/*.nk0 =*/ nk0,
|
||||
/*.ne01 =*/ ne01,
|
||||
|
||||
@@ -53,6 +53,37 @@
|
||||
|
||||
bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
|
||||
|
||||
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
|
||||
// Precompute mp (m' in the paper) and L such that division
|
||||
// can be computed using a multiply (high 32b of 64b result)
|
||||
// and a shift:
|
||||
//
|
||||
// n/d = (mulhi(n, mp) + n) >> L;
|
||||
struct fastdiv_vals {
|
||||
uint32_t mp;
|
||||
uint32_t L;
|
||||
uint32_t d;
|
||||
uint32_t pad;
|
||||
};
|
||||
static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect");
|
||||
|
||||
static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
|
||||
GGML_ASSERT(d_64 != 0);
|
||||
GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
uint32_t d = (uint32_t)d_64;
|
||||
|
||||
// compute L = ceil(log2(d));
|
||||
uint32_t L = 0;
|
||||
while (L < 32 && (uint32_t{ 1 } << L) < d) {
|
||||
L++;
|
||||
}
|
||||
|
||||
uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
|
||||
// pack divisor as well to reduce error surface
|
||||
return { mp, L, d, 0 };
|
||||
}
|
||||
|
||||
enum GPU_FAMILY {
|
||||
ADRENO,
|
||||
INTEL,
|
||||
@@ -2944,8 +2975,11 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
|
||||
case GGML_OP_PAD:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_UPSCALE: {
|
||||
ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
|
||||
(mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR);
|
||||
}
|
||||
case GGML_OP_CONV_2D:
|
||||
return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||
|
||||
(op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
|
||||
@@ -4461,6 +4495,9 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
GGML_ABORT("not implemented");
|
||||
}
|
||||
|
||||
fastdiv_vals ne11_ = init_fastdiv_values(ne11);
|
||||
fastdiv_vals ne12_ = init_fastdiv_values(ne12);
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
@@ -4471,8 +4508,8 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
|
||||
|
||||
@@ -1,5 +1,16 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
// v = { mp, L, d }
|
||||
inline uint fastdiv(uint n, uint4 v) {
|
||||
uint msbs;
|
||||
msbs = mul_hi(n, v.s0);
|
||||
return (msbs + n) >> v.s1;
|
||||
}
|
||||
inline uint fastmod(uint n, uint4 v) {
|
||||
uint q = fastdiv(n, v);
|
||||
return n - q * v.s2;
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_f32_i64(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
@@ -11,8 +22,8 @@ kernel void kernel_set_rows_f32_i64(
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne11,
|
||||
int ne12,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
@@ -33,8 +44,10 @@ kernel void kernel_set_rows_f32_i64(
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = i03%ne12;
|
||||
int i11 = i02%ne11;
|
||||
//int i12 = i03%ne12;
|
||||
//int i11 = i02%ne11;
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
@@ -58,8 +71,8 @@ kernel void kernel_set_rows_f16_i64(
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne11,
|
||||
int ne12,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
@@ -80,8 +93,10 @@ kernel void kernel_set_rows_f16_i64(
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = i03%ne12;
|
||||
int i11 = i02%ne11;
|
||||
//int i12 = i03%ne12;
|
||||
//int i11 = i02%ne11;
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
@@ -105,8 +120,8 @@ kernel void kernel_set_rows_f32_i32(
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne11,
|
||||
int ne12,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
@@ -127,8 +142,10 @@ kernel void kernel_set_rows_f32_i32(
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = i03%ne12;
|
||||
int i11 = i02%ne11;
|
||||
//int i12 = i03%ne12;
|
||||
//int i11 = i02%ne11;
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
@@ -152,8 +169,8 @@ kernel void kernel_set_rows_f16_i32(
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne11,
|
||||
int ne12,
|
||||
uint4 ne11,
|
||||
uint4 ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
@@ -174,8 +191,10 @@ kernel void kernel_set_rows_f16_i32(
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = i03%ne12;
|
||||
int i11 = i02%ne11;
|
||||
//int i12 = i03%ne12;
|
||||
//int i11 = i02%ne11;
|
||||
int i12 = fastmod(i03, ne12);
|
||||
int i11 = fastmod(i02, ne11);
|
||||
|
||||
int i10 = i01;
|
||||
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
@@ -620,7 +620,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_add_id_f32;
|
||||
|
||||
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
|
||||
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32;
|
||||
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bicubic_f32;
|
||||
vk_pipeline pipeline_scale_f32;
|
||||
vk_pipeline pipeline_sqr_f32;
|
||||
vk_pipeline pipeline_sqrt_f32;
|
||||
@@ -2220,9 +2220,12 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
|
||||
}
|
||||
buf->memory_property_flags = req_flags;
|
||||
|
||||
bool done = false;
|
||||
|
||||
for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
|
||||
try {
|
||||
buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
|
||||
done = true;
|
||||
break;
|
||||
} catch (const vk::SystemError& e) {
|
||||
// loop and retry
|
||||
@@ -2233,6 +2236,10 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!buf->device_memory) {
|
||||
@@ -3695,6 +3702,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_upscale_bicubic_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BICUBIC}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -6823,7 +6831,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
|
||||
vk_buffer d_B = d_D;
|
||||
size_t b_buf_offset = 0;
|
||||
uint64_t b_sz = 0;
|
||||
uint64_t b_sz = 1;
|
||||
|
||||
if (enable_bias) {
|
||||
const ggml_tensor * add = cgraph->nodes[node_idx + 1];
|
||||
@@ -6957,7 +6965,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
|
||||
vk_buffer d_B = d_D;
|
||||
size_t b_buf_offset = 0;
|
||||
uint64_t b_sz = 0;
|
||||
uint64_t b_sz = 1;
|
||||
|
||||
if (enable_bias) {
|
||||
const ggml_tensor * add = cgraph->nodes[node_idx + 1];
|
||||
@@ -7093,7 +7101,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
|
||||
vk_buffer d_B = d_D;
|
||||
size_t b_buf_offset = 0;
|
||||
uint64_t b_sz = 0;
|
||||
uint64_t b_sz = 1;
|
||||
|
||||
if (enable_bias) {
|
||||
const ggml_tensor * add = cgraph->nodes[node_idx + 1];
|
||||
@@ -7668,7 +7676,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
||||
|
||||
vk_buffer d_B = d_D;
|
||||
size_t b_buf_offset = 0;
|
||||
uint64_t b_sz = 0;
|
||||
uint64_t b_sz = 1;
|
||||
|
||||
if (enable_bias || enable_scale) {
|
||||
const ggml_tensor * bias = cgraph->nodes[node_idx + 1]->src[1];
|
||||
@@ -8193,6 +8201,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_upscale_nearest_f32;
|
||||
case GGML_SCALE_MODE_BILINEAR:
|
||||
return ctx->device->pipeline_upscale_bilinear_f32;
|
||||
case GGML_SCALE_MODE_BICUBIC:
|
||||
return ctx->device->pipeline_upscale_bicubic_f32;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag
|
||||
#define NEAREST 0
|
||||
#define BILINEAR 1
|
||||
#define BICUBIC 2
|
||||
|
||||
layout (constant_id = 0) const uint scale_mode = 0;
|
||||
|
||||
@@ -61,6 +62,39 @@ float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
|
||||
return fetch_bilinear(c0, c1, d, i12, i13);
|
||||
}
|
||||
|
||||
// Bicubic interpolation with alpha = -0.75
|
||||
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
|
||||
const vec4 bcoeffs1 = vec4( 1.25, -2.25, 0.0, 1.0);
|
||||
const vec4 bcoeffs2 = vec4(-0.75, 3.75, -6.0, 3.0);
|
||||
vec4 powers(float x) { return vec4(x*x*x, x*x, x, 1); }
|
||||
|
||||
float bicubic(float p0, float p1, float p2, float p3, float x) {
|
||||
return p0 * dot(bcoeffs2, powers(x + 1)) +
|
||||
p1 * dot(bcoeffs1, powers(x )) +
|
||||
p2 * dot(bcoeffs1, powers(1 - x)) +
|
||||
p3 * dot(bcoeffs2, powers(2 - x));
|
||||
}
|
||||
|
||||
#define FETCH(a,b) data_a[base + clamp(i.x+(a), 0, res.x) * p.nb00 + clamp(i.y+(b), 0, res.y) * p.nb01]
|
||||
|
||||
float interpolate_bicubic(uint i10, uint i11, uint i12, uint i13) {
|
||||
const ivec2 res = ivec2(p.ne00 - 1, p.ne01 - 1);
|
||||
|
||||
const vec2 coord = (vec2(i10, i11) + p.pixel_offset) / vec2(p.sf0, p.sf1) - p.pixel_offset;
|
||||
const vec2 d = fract(coord);
|
||||
const ivec2 i = ivec2(floor(coord));
|
||||
|
||||
const uint i02 = uint(i12 / p.sf2);
|
||||
const uint i03 = uint(i13 / p.sf3);
|
||||
const uint base = p.a_offset + i03 * p.nb03 + i02 * p.nb02;
|
||||
|
||||
return bicubic(
|
||||
bicubic(FETCH(-1,-1), FETCH(0,-1), FETCH(1,-1), FETCH(2,-1), d.x),
|
||||
bicubic(FETCH(-1, 0), FETCH(0, 0), FETCH(1, 0), FETCH(2, 0), d.x),
|
||||
bicubic(FETCH(-1, 1), FETCH(0, 1), FETCH(1, 1), FETCH(2, 1), d.x),
|
||||
bicubic(FETCH(-1, 2), FETCH(0, 2), FETCH(1, 2), FETCH(2, 2), d.x), d.y);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
@@ -81,6 +115,9 @@ void main() {
|
||||
case BILINEAR:
|
||||
result = interpolate_bilinear(i10, i11, i12, i13);
|
||||
break;
|
||||
case BICUBIC:
|
||||
result = interpolate_bicubic(i10, i11, i12, i13);
|
||||
break;
|
||||
}
|
||||
|
||||
data_d[p.d_offset + idx] = D_TYPE(result);
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include <algorithm>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <filesystem>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define NOMINMAX
|
||||
@@ -1080,6 +1081,11 @@ int main(int argc, char** argv) {
|
||||
|
||||
if (args.find("--glslc") != args.end()) {
|
||||
GLSLC = args["--glslc"]; // Path to glslc
|
||||
|
||||
if (!std::filesystem::exists(GLSLC) || !std::filesystem::is_regular_file(GLSLC)) {
|
||||
std::cerr << "Error: glslc not found at " << GLSLC << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
if (args.find("--source") != args.end()) {
|
||||
input_filepath = args["--source"]; // The shader source file to compile
|
||||
|
||||
@@ -48,13 +48,18 @@ class LazyMeta(ABCMeta):
|
||||
# NOTE: doing this from a metaclass is very convenient
|
||||
# TODO: make this even more comprehensive
|
||||
for binary_op in (
|
||||
"lt", "le", "eq", "ne", "ge", "gt", "not"
|
||||
"abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
|
||||
"neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
|
||||
"lt", "le", "eq", "ne", "ge", "gt",
|
||||
"add", "and", "floordiv", "lshift", "mod", "mul", "matmul",
|
||||
"or", "pow", "rshift", "sub", "truediv", "xor",
|
||||
"iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
|
||||
"radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
|
||||
):
|
||||
attr_name = f"__{binary_op}__"
|
||||
# evaluation on the meta tensor is needed in case there's broadcasting
|
||||
namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
|
||||
|
||||
for unary_op in ("not", "abs", "invert", "neg", "pos"):
|
||||
attr_name = f"__{unary_op}__"
|
||||
# the result of these operators usually has the same shape and dtype as the input,
|
||||
# so evaluation on the meta tensor can be skipped.
|
||||
namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
import os
|
||||
import json
|
||||
import numpy as np
|
||||
|
||||
|
||||
def fill_templated_filename(filename: str, output_type: str | None) -> str:
|
||||
@@ -177,6 +179,10 @@ class SafetensorRemote:
|
||||
except KeyError as e:
|
||||
raise ValueError(f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}")
|
||||
|
||||
# order by name (same as default safetensors behavior)
|
||||
# ref: https://github.com/huggingface/safetensors/blob/0816a1ae1d6b731cefd67f061d80d1cadd0dd7bb/bindings/python/src/lib.rs#L606
|
||||
res = dict(sorted(res.items(), key=lambda t: t[0]))
|
||||
|
||||
return res
|
||||
|
||||
@classmethod
|
||||
@@ -266,3 +272,77 @@ class SafetensorRemote:
|
||||
if os.environ.get("HF_TOKEN"):
|
||||
headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}"
|
||||
return headers
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalTensorRange:
|
||||
filename: Path
|
||||
offset: int
|
||||
size: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalTensor:
|
||||
dtype: str
|
||||
shape: tuple[int, ...]
|
||||
data_range: LocalTensorRange
|
||||
|
||||
def mmap_bytes(self) -> np.ndarray:
|
||||
return np.memmap(self.data_range.filename, offset=self.data_range.offset, shape=self.data_range.size)
|
||||
|
||||
|
||||
class SafetensorsLocal:
|
||||
"""
|
||||
Read a safetensors file from the local filesystem.
|
||||
|
||||
Custom parsing gives a bit more control over the memory usage.
|
||||
The official safetensors library doesn't expose file ranges.
|
||||
"""
|
||||
ALIGNMENT = 8 # bytes
|
||||
|
||||
tensors: dict[str, LocalTensor]
|
||||
|
||||
def __init__(self, filename: Path):
|
||||
with open(filename, "rb") as f:
|
||||
metadata_length = int.from_bytes(f.read(8), byteorder='little')
|
||||
file_size = os.stat(filename).st_size
|
||||
if file_size < 8 + metadata_length:
|
||||
raise ValueError(f"Could not read complete metadata. Need {8 + metadata_length} bytes, got {file_size}")
|
||||
|
||||
metadata_str = f.read(metadata_length).decode('utf-8')
|
||||
try:
|
||||
metadata = json.loads(metadata_str)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Failed to parse safetensors metadata as JSON: {e}")
|
||||
|
||||
data_start_offset = f.tell()
|
||||
alignment = self.ALIGNMENT
|
||||
if data_start_offset % alignment != 0:
|
||||
data_start_offset += alignment - (data_start_offset % alignment)
|
||||
|
||||
tensors: dict[str, LocalTensor] = {}
|
||||
for name, meta in metadata.items():
|
||||
if name == "__metadata__":
|
||||
# ignore metadata, it's not a tensor
|
||||
continue
|
||||
|
||||
tensors[name] = LocalTensor(
|
||||
dtype=meta["dtype"],
|
||||
shape=tuple(meta["shape"]),
|
||||
data_range=LocalTensorRange(
|
||||
filename,
|
||||
data_start_offset + meta["data_offsets"][0],
|
||||
meta["data_offsets"][1] - meta["data_offsets"][0],
|
||||
),
|
||||
)
|
||||
|
||||
# order by name (same as default safetensors behavior)
|
||||
# ref: https://github.com/huggingface/safetensors/blob/0816a1ae1d6b731cefd67f061d80d1cadd0dd7bb/bindings/python/src/lib.rs#L606
|
||||
self.tensors = dict(sorted(tensors.items(), key=lambda t: t[0]))
|
||||
|
||||
def __enter__(self, *args, **kwargs):
|
||||
del args, kwargs # unused
|
||||
return self.tensors
|
||||
|
||||
def __exit__(self, *args, **kwargs):
|
||||
del args, kwargs # unused
|
||||
|
||||
@@ -132,6 +132,11 @@ add_library(llama
|
||||
models/graph-context-mamba.cpp
|
||||
)
|
||||
|
||||
set_target_properties(llama PROPERTIES
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
SOVERSION 0
|
||||
)
|
||||
|
||||
target_include_directories(llama PRIVATE .)
|
||||
target_include_directories(llama PUBLIC ../include)
|
||||
target_compile_features (llama PRIVATE cxx_std_17) # don't bump
|
||||
|
||||
@@ -151,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
||||
p1 = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
// models like Mamba or RWKV can't have a state partially erased
|
||||
// models like Mamba or RWKV can't have a state partially erased at the end
|
||||
// of the sequence because their state isn't preserved for previous tokens
|
||||
if (seq_id >= (int64_t) size) {
|
||||
// could be fatal
|
||||
return false;
|
||||
@@ -160,8 +161,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
||||
int32_t & tail_id = cells[seq_id].tail;
|
||||
if (tail_id >= 0) {
|
||||
const auto & cell = cells[tail_id];
|
||||
// partial intersection is invalid
|
||||
if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
|
||||
// partial intersection is invalid if it includes the final pos
|
||||
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
|
||||
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@@ -19,6 +17,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
@@ -67,9 +67,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
|
||||
}
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
@@ -11,6 +11,8 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_iswa();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
@@ -69,7 +71,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
|
||||
}
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
@@ -272,6 +272,10 @@ static double mean_abs_asymm(const float * a, const float * b, const size_t n, c
|
||||
|
||||
// utils for printing the variables of the test cases
|
||||
|
||||
static std::string var_to_str(const std::string & x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static std::string var_to_str(const T & x) {
|
||||
return std::to_string(x);
|
||||
@@ -323,7 +327,8 @@ static std::string var_to_str(ggml_scale_mode mode) {
|
||||
switch (mode) {
|
||||
case GGML_SCALE_MODE_NEAREST: return "nearest";
|
||||
case GGML_SCALE_MODE_BILINEAR: return "bilinear";
|
||||
default: return std::to_string(mode);
|
||||
case GGML_SCALE_MODE_BICUBIC: return "bicubic";
|
||||
default: return std::to_string(mode);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5218,7 +5223,9 @@ struct test_interpolate : public test_case {
|
||||
const uint32_t mode = GGML_SCALE_MODE_NEAREST;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR4(type, ne, ne_tgt, mode);
|
||||
ggml_scale_mode mode = (ggml_scale_mode)(this->mode & 0xFF);
|
||||
std::string flags = (this->mode & GGML_SCALE_FLAG_ALIGN_CORNERS) ? "align_corners" : "none";
|
||||
return VARS_TO_STR5(type, ne, ne_tgt, mode, flags);
|
||||
}
|
||||
|
||||
test_interpolate(ggml_type type = GGML_TYPE_F32,
|
||||
@@ -7288,15 +7295,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2, 8, 8192, 1}, order)); // bailingmoe2 (group selection)
|
||||
}
|
||||
|
||||
for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) {
|
||||
for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC}) {
|
||||
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode));
|
||||
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true));
|
||||
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode));
|
||||
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {5, 7, 11, 13}, {2, 5, 7, 11}, mode));
|
||||
}
|
||||
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS));
|
||||
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {1, 4, 3, 2}, {2, 8, 3, 2}, GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS));
|
||||
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {4, 1, 3, 2}, {1, 1, 3, 2}, GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS));
|
||||
for (ggml_scale_mode mode : {GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC}) {
|
||||
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode | GGML_SCALE_FLAG_ALIGN_CORNERS));
|
||||
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {1, 4, 3, 2}, {2, 8, 3, 2}, mode | GGML_SCALE_FLAG_ALIGN_CORNERS));
|
||||
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {4, 1, 3, 2}, {1, 1, 3, 2}, mode | GGML_SCALE_FLAG_ALIGN_CORNERS));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_sum());
|
||||
test_cases.emplace_back(new test_sum_rows());
|
||||
|
||||
@@ -23,7 +23,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
int is_pp_shared = params.is_pp_shared;
|
||||
int is_pp_shared = params.is_pp_shared;
|
||||
int is_tg_separate = params.is_tg_separate;
|
||||
|
||||
std::vector<int> n_pp = params.n_pp;
|
||||
std::vector<int> n_tg = params.n_tg;
|
||||
@@ -72,8 +73,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// decode in batches of ctx_params.n_batch tokens
|
||||
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch, bool synchronize) {
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
@@ -113,7 +114,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (!params.batched_bench_output_jsonl) {
|
||||
LOG("\n");
|
||||
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, int(params.flash_attn_type), params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, is_tg_separate = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, int(params.flash_attn_type), is_pp_shared, is_tg_separate, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG("\n");
|
||||
LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
@@ -172,16 +173,35 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_tg_start = ggml_time_us();
|
||||
|
||||
for (int i = 0; i < tg; ++i) {
|
||||
common_batch_clear(batch);
|
||||
|
||||
if (is_tg_separate) {
|
||||
// decode pattern:
|
||||
// 0 0 0 ... 1 1 1 ... 2 2 2 ... 3 3 3 ...
|
||||
for (int j = 0; j < pl; ++j) {
|
||||
common_batch_add(batch, get_token_rand(), pp + i, { j }, true);
|
||||
}
|
||||
for (int i = 0; i < tg; ++i) {
|
||||
common_batch_clear(batch);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
common_batch_add(batch, get_token_rand(), pp + i, { j }, true);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// decode pattern:
|
||||
// 0123 0123 0123 ...
|
||||
for (int i = 0; i < tg; ++i) {
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (int j = 0; j < pl; ++j) {
|
||||
common_batch_add(batch, get_token_rand(), pp + i, { j }, true);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+5
-1
@@ -354,7 +354,11 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// remove any "future" tokens that we might have inherited from the previous session
|
||||
llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1);
|
||||
if (!llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1)) {
|
||||
LOG_INF("%s: unable to resuse common prefix\n", __func__);
|
||||
n_matching_session_tokens = 0;
|
||||
llama_memory_seq_rm(mem, -1, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
|
||||
|
||||
@@ -13,6 +13,11 @@ add_library(mtmd
|
||||
mtmd-helper.h
|
||||
)
|
||||
|
||||
set_target_properties(mtmd PROPERTIES
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
SOVERSION 0
|
||||
)
|
||||
|
||||
target_link_libraries (mtmd PUBLIC ggml llama)
|
||||
target_link_libraries (mtmd PRIVATE Threads::Threads)
|
||||
target_include_directories(mtmd PUBLIC .)
|
||||
|
||||
+10
-7
@@ -160,13 +160,13 @@ enum patch_merge_type {
|
||||
};
|
||||
|
||||
struct clip_hparams {
|
||||
int32_t image_size;
|
||||
int32_t patch_size;
|
||||
int32_t n_embd;
|
||||
int32_t n_ff;
|
||||
int32_t projection_dim;
|
||||
int32_t n_head;
|
||||
int32_t n_layer;
|
||||
int32_t image_size = 0;
|
||||
int32_t patch_size = 0;
|
||||
int32_t n_embd = 0;
|
||||
int32_t n_ff = 0;
|
||||
int32_t projection_dim = 0;
|
||||
int32_t n_head = 0;
|
||||
int32_t n_layer = 0;
|
||||
// idefics3
|
||||
int32_t image_longest_edge = 0;
|
||||
int32_t image_min_pixels = -1;
|
||||
@@ -2683,6 +2683,9 @@ struct clip_model_loader {
|
||||
}
|
||||
} else if (is_audio) {
|
||||
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
|
||||
// some hparams are unused, but still need to set to avoid issues
|
||||
hparams.image_size = 0;
|
||||
hparams.patch_size = 1;
|
||||
|
||||
} else {
|
||||
GGML_ASSERT(false && "unknown modality");
|
||||
|
||||
@@ -182,7 +182,7 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
}
|
||||
|
||||
const llama_model * model = llama_get_model(lctx);
|
||||
int n_mmproj_embd = llama_model_n_embd(model);
|
||||
int n_mmproj_embd = llama_model_n_embd_inp(model);
|
||||
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
||||
|
||||
int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
|
||||
|
||||
@@ -2,3 +2,7 @@ set(TARGET rpc-server)
|
||||
add_executable(${TARGET} rpc-server.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE ggml)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
|
||||
Reference in New Issue
Block a user