Compare commits

...

16 Commits

Author SHA1 Message Date
Jeff Bolz 10a0351a97 vulkan: add RTE variants for glu/add/sub/mul/div (#14653) 2025-07-15 21:32:11 +02:00
Shunta Saito 68e37a61a7 model : add PLaMo-2 support (#14560)
* Add PLaMo-2 model using hybrid memory module

* Fix z shape

* Add cmath to include from llama-vocab.h

* Explicitly dequantize normalization weights before RoPE apply

* Revert unnecessary cast because the problem can be solved by excluding attn_k, attn_q when quantizing

* Use ATTN_K/Q_NORM for k,q weights to prevent quantization

* Remove SSM_BCDT that is not used from anywhere

* Do not duplicate embedding weights for output.weight

* Fix tokenizer encoding problem for multibyte strings

* Apply suggestion from @CISC

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Use LLM_FFN_SWIGLU instead of splitting ffn_gate and ffn_up

* Remove unnecessary part for Grouped Query Attention

* Fix how to load special token id to gguf

* Remove unused tensor mapping

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Remove llama_vocab_plamo2 class and replace it with llm_tokenizer_plamo2_session to follow the other tokenizer implementations

* Update src/llama-vocab.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Fix plamo2 tokenizer session to prevent multiple calls of build()

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-07-15 18:11:42 +02:00
R0CKSTAR cbc68be51d cuda: fix build warnings in set-rows.cu (unused variable) (#14687)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2025-07-15 15:28:53 +08:00
Anton Mitkov bdca38376f sycl: Hotfix for non dnnl codepath (#14677) 2025-07-14 18:12:42 +01:00
shalinib-ibm 55c509daf5 ggml : refactor llamafile_sgemm PPC code (#14673)
Remove un-necessary templates from class definition and packing functions
Reduce deeply nested conditionals, if-else switching in mnapck function
Replace repetitive code with inline functions in Packing functions

2 ~ 7% improvement in Q8 Model
15 ~ 50% improvement in Q4 Model

Signed-off-by: Shalini Salomi Bodapati <Shalini.Salomi.Bodapati@ibm.com>
2025-07-14 16:16:42 +03:00
Aman Gupta 9c9e4fc635 llama-context: add ability to get logits (#14672) 2025-07-14 21:01:41 +08:00
Johannes Gäßler 494c5899cb scripts: benchmark for HTTP server throughput (#14668)
* scripts: benchmark for HTTP server throughput

* fix server connection reset
2025-07-14 13:14:30 +02:00
Akarshan Biswas 0f4c6ec0f1 SYCL: use 1D kernel for set_rows (#14618)
* SYCL: Use 1D kernel for set_rows

* Remove dangling comment

* Refactor and use ceil_div
2025-07-14 10:37:55 +01:00
Anton Mitkov 65a3ebb0aa sycl: Batched mulmat rework for oneDNN dispatch (#14617) 2025-07-14 10:37:35 +01:00
Molly Sophia 0d9226763c llama : add jinja template for rwkv-world (#14665)
* llama : add jinja template for rwkv-world

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-07-14 07:43:43 +08:00
Ed Addario 982e347255 quantize : fix minor logic flaw in --tensor-type (#14572) 2025-07-13 18:02:17 +02:00
Sigbjørn Skjæret 923e3ea2e3 cuda : add set rows for bf16 (#14664) 2025-07-13 15:01:24 +02:00
Yavor Ivanov e743cddb60 cuda : add ELU support (#14657) 2025-07-13 11:33:16 +02:00
Georgi Gerganov 05fec5bd29 ggml : add build-time message to remind about ggml_set_rows (#14661)
ggml-ci
2025-07-13 10:36:33 +03:00
Yavor Ivanov dcf7f2ea3c metal : Add missing unary ops Metal support (#14660) 2025-07-13 08:38:13 +03:00
Yavor Ivanov 84b396e051 cmake : Add CMake presets for Linux and GCC (#14656) 2025-07-13 08:12:36 +03:00
37 changed files with 2119 additions and 1286 deletions
+11
View File
@@ -55,6 +55,17 @@
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
}
},
{
"name": "x64-linux-gcc", "hidden": true,
"cacheVariables": {
"CMAKE_C_COMPILER": "gcc",
"CMAKE_CXX_COMPILER": "g++"
}
},
{ "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
{ "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
{ "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
{ "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+177 -1
View File
@@ -1082,7 +1082,14 @@ class TextModel(ModelBase):
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.chat_template = "rwkv-world"
if special_vocab.chat_template is None:
template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
if template_path.is_file():
with open(template_path, "r", encoding="utf-8") as f:
template = f.read()
else:
template = "rwkv-world"
special_vocab.chat_template = template
# hack: Add '\n\n' as the EOT token to make it chat normally
special_vocab._set_special_token("eot", 261)
# hack: Override these as they have already been set (incorrectly)
@@ -3501,6 +3508,175 @@ class PlamoModel(TextModel):
return [(new_name, data_torch)]
@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM")
class Plamo2Model(TextModel):
model_arch = gguf.MODEL_ARCH.PLAMO2
def set_vocab(self):
# PLaMo 2 uses a custom tokenizer with a .jsonl file
# We need to handle this specially
tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
tokenizer_config_path = self.dir_model / "tokenizer_config.json"
if not tokenizer_jsonl_path.is_file():
raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
# Load tokenizer config
with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
tokenizer_config = json.load(f)
# Load tokens from JSONL file (actually a list format)
tokens = []
scores = []
toktypes = []
with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f):
if line.strip():
token_data = json.loads(line)
# Format: [token, score, type, ?, ?, ?, ?]
token = token_data[0].encode("utf-8")
score = float(token_data[1])
token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
tokens.append(token)
scores.append(score)
# Map token type strings to GGUF token types
if token_type_str == "UNKNOWN":
toktypes.append(gguf.TokenType.UNKNOWN)
elif token_type_str == "CONTROL":
toktypes.append(gguf.TokenType.CONTROL)
elif token_type_str == "BYTE":
toktypes.append(gguf.TokenType.BYTE)
else:
# Check for PLaMo-2 special tokens
token_str = token_data[0]
if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.NORMAL)
vocab_size = self.hparams["vocab_size"]
if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens)
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
for i in range(1, pad_count + 1):
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
scores.append(-1000.0)
toktypes.append(gguf.TokenType.UNUSED)
# Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
self.gguf_writer.add_tokenizer_model("plamo2")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
# Add special tokens from config
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
self.gguf_writer.add_bos_token_id(token_id)
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
self.gguf_writer.add_eos_token_id(token_id)
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
self.gguf_writer.add_pad_token_id(token_id)
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
self.gguf_writer.add_sep_token_id(token_id)
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
self.gguf_writer.add_unk_token_id(token_id)
# Add <|plamo:op|> as EOT to ensure appropriate end of generation
self.gguf_writer.add_eot_token_id(4)
self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
# Which layers are Mamba layers
# PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer)
# This logic matches modeling_plamo.py's is_mamba function
mamba_step = hparams.get("mamba_step", 2)
mamba_enabled = hparams.get("mamba_enabled", True)
mamba_layers = []
if mamba_enabled:
for i in range(block_count):
if block_count <= (mamba_step // 2):
# use attention in last layer
is_mamba = (i != block_count - 1)
else:
is_mamba = (i % mamba_step) != (mamba_step // 2)
if is_mamba:
mamba_layers.append(0)
else:
mamba_layers.append(hparams.get("num_key_value_heads", 4))
if mamba_layers:
self.gguf_writer.add_head_count_kv(mamba_layers)
self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
# Mamba parameters
self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64))
intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128)
self.gguf_writer.add_ssm_inner_size(intermediate_size)
self.gguf_writer.add_ssm_group_count(0)
# MLP feed forward parameters (for attention layers)
self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.endswith(".A_log"):
data_torch = -torch.exp(data_torch)
elif name.endswith(".dt_bias"):
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
elif name.endswith(".dt_norm_weight"):
name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight"
elif name.endswith(".B_norm_weight"):
name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight"
elif name.endswith(".C_norm_weight"):
name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight"
elif name.endswith(".k_weight"):
name = name.rpartition(".k_weight")[0] + ".k.weight"
elif name.endswith(".q_weight"):
name = name.rpartition(".q_weight")[0] + ".q.weight"
elif name.endswith(".conv1d.weight"):
data_torch = torch.squeeze(data_torch) # remove (, 1, )
assert data_torch.ndim == 2
elif name.endswith(".pre_mixer_norm.weight"):
data_torch += 1.0
elif name.endswith(".post_mixer_norm.weight"):
data_torch += 1.0 / 5
elif name.endswith(".pre_mlp_norm.weight"):
data_torch += 1.0
elif name.endswith(".post_mlp_norm.weight"):
data_torch += 1.0 / (5**1.5)
elif name.endswith(".norm.weight"):
data_torch += 1.0
new_name = self.map_tensor_name(name)
return [(new_name, data_torch)]
@ModelBase.register("CodeShellForCausalLM")
class CodeShellModel(TextModel):
model_arch = gguf.MODEL_ARCH.CODESHELL
+1
View File
@@ -2090,6 +2090,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
{
// TODO: add support
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
return false;
} break;
case GGML_OP_CPY: {
File diff suppressed because it is too large Load Diff
+6 -1
View File
@@ -2303,6 +2303,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_UNARY_OP_EXP:
ggml_cuda_op_exp(ctx, dst);
break;
case GGML_UNARY_OP_ELU:
ggml_cuda_op_elu(ctx, dst);
break;
default:
return false;
}
@@ -3116,6 +3119,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_ELU:
return ggml_is_contiguous(op->src[0]);
default:
return false;
@@ -3222,7 +3226,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
} break;
case GGML_OP_SET_ROWS:
{
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_I64;
} break;
+22 -1
View File
@@ -3,13 +3,21 @@
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
template<typename src_t, typename dst_t>
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
GGML_UNUSED(src_f);
GGML_UNUSED(dst_f);
}
template<>
__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
*dst_h = __float2half(*src_f);
}
template<>
__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
*dst_b = *src_f;
}
template<>
__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
*dst_f = *src_f;
@@ -48,6 +56,9 @@ static __global__ void k_set_rows(
const src_t* src_elem = src0_row + i00;
dst_t* dst_elem = dst_row_ptr + i00;
set_rows_1(src_elem, dst_elem);
GGML_UNUSED(ne10);
GGML_UNUSED(ne13);
}
template<typename src_t, typename dst_t>
@@ -124,6 +135,16 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
nb1, nb2, nb3,
stream
);
} else if (dst->type == GGML_TYPE_BF16) {
set_rows_cuda(
src0_d, src1_d, (nv_bfloat16*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else {
GGML_ABORT("unsupported type");
}
+7
View File
@@ -83,6 +83,10 @@ static __device__ __forceinline__ float op_log(float x) {
return logf(x);
}
static __device__ __forceinline__ float op_elu(float x) {
return (x > 0.f) ? x : expm1f(x);
}
template <float (*op)(float), typename T>
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -196,6 +200,9 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_log>(ctx, dst);
}
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_elu>(ctx, dst);
}
/* gated ops */
template <float (*op)(float), typename T>
+2
View File
@@ -59,6 +59,8 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+90
View File
@@ -173,6 +173,12 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_SILU,
GGML_METAL_KERNEL_TYPE_SILU_4,
GGML_METAL_KERNEL_TYPE_ELU,
GGML_METAL_KERNEL_TYPE_ABS,
GGML_METAL_KERNEL_TYPE_SGN,
GGML_METAL_KERNEL_TYPE_STEP,
GGML_METAL_KERNEL_TYPE_HARDSWISH,
GGML_METAL_KERNEL_TYPE_HARDSIGMOID,
GGML_METAL_KERNEL_TYPE_EXP,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
@@ -1155,6 +1161,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU, elu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ABS, abs, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SGN, sgn, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_STEP, step, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSWISH, hardswish, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSIGMOID, hardsigmoid, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_EXP, exp, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, has_simdgroup_reduction);
@@ -1688,6 +1700,12 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_ABS:
case GGML_UNARY_OP_SGN:
case GGML_UNARY_OP_STEP:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_EXP:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
@@ -2439,6 +2457,78 @@ static bool ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_ABS:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ABS].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_SGN:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SGN].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_STEP:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_STEP].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_HARDSWISH:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSWISH].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_HARDSIGMOID:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSIGMOID].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_EXP:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_EXP].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
default:
{
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+45
View File
@@ -1199,6 +1199,51 @@ kernel void kernel_neg(
dst[tpig] = -src0[tpig];
}
kernel void kernel_abs(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = fabs(src0[tpig]);
}
kernel void kernel_sgn(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
device const float & x = src0[tpig];
dst[tpig] = (x > 0.0f) ? 1.0f : ((x < 0.0f) ? -1.0f : 0.0f);
}
kernel void kernel_step(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] > 0.0f ? 1.0f : 0.0f;
}
kernel void kernel_hardswish(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
device const float & x = src0[tpig];
dst[tpig] = x * fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f));
}
kernel void kernel_hardsigmoid(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
device const float & x = src0[tpig];
dst[tpig] = fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f));
}
kernel void kernel_exp(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = exp(src0[tpig]);
}
kernel void kernel_reglu(
device const char * src0,
device const char * src1,
+1
View File
@@ -2280,6 +2280,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
{
// TODO: add support
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
if (op->src[0]->type != GGML_TYPE_F32) {
return false;
}
+15 -27
View File
@@ -32,39 +32,28 @@ public:
else static_assert(0);
}
// matrix A has m rows, k columns
// matrix B has k rows, n columns
// nra - number of elements to skip when moving into next row in A
// nrb - number of elements to skip when moving into next row in B
// nca - number of elements to skip when moving into next column in A
// ncb - number of elements to skip when moving into next column in B
// stride_a - number of elements to skip when moving to next A matrix
// stride_b - number of elements to skip when moving to next B matrix
// batches_a - number of A matrices
// batches_b - number of B matrices
static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a,
const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b,
const void * a, dt at, dnnl_dim_t stra0, dnnl_dim_t stra1, dnnl_dim_t stra2,
const void * b, dt bt, dnnl_dim_t strb0, dnnl_dim_t strb1, dnnl_dim_t strb2,
void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) {
auto stream = ctx.stream_dnnl(q);
auto eng = ctx.engine_dnnl(q);
// { # strides, # rows, # columns }
dnnl::memory::dims a_dims = { batches_a, m, k };
dnnl::memory::dims b_dims = { batches_b, k, n };
dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n };
// { # elements to skip to next stride, # elements to skip to next row, # elements to skip to next column }
dnnl::memory::dims a_strides = { stride_a, nra, nca };
dnnl::memory::dims b_strides = { stride_b, nrb, ncb };
dnnl::memory::dims a_dims = {batches_a, m, k };
dnnl::memory::dims a_strides = {stra2, stra1, stra0};
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides);
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::abc);
dnnl::memory::dims b_dims = {batches_b, k, n };
dnnl::memory::dims b_strides = {strb2, strb0, strb1};
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n};
dnnl::memory::dims c_strides = {m*n, 1, m };
const auto c_md = dnnl::memory::desc(c_dims, ct, c_strides);
dnnl::primitive_attr primitive_attr;
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
#ifdef GGML_SYCL_F16
primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
#endif
@@ -76,24 +65,23 @@ public:
auto scratchpad_md = matmul_pd.scratchpad_desc();
auto scratchpad_mem = ctx.get_scratchpad_mem(scratchpad_md, eng, q);
auto matmul_prim = dnnl::matmul(matmul_pd);
std::unordered_map<int, dnnl::memory> matmul_args;
matmul_args.insert({ DNNL_ARG_SRC, a_mem });
matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
matmul_args.insert({ DNNL_ARG_DST, c_mem });
matmul_args.insert({ DNNL_ARG_SCRATCHPAD, scratchpad_mem });
matmul_prim.execute(stream, matmul_args);
}
// matrices A and B are column major, both having k rows
// matrix A has m column, matrix B has n columns
// output: column major matrix C = A transposed * B
static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
gemm(ctx, m, n, k, a, at, 1, k, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
}
};
+125 -43
View File
@@ -1546,7 +1546,7 @@ static void mul_mat_p021_f16_f32(
static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
const int row_stride_x, const int channel_stride_x,const int channel_stride_y, const int channel_x_divisor,
const sycl::nd_item<3> &item_ct1) {
const sycl::half *x = (const sycl::half *)vx;
@@ -1557,7 +1557,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
item_ct1.get_local_id(0);
const int channel_x = channel / channel_x_divisor;
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
@@ -1576,7 +1575,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
const int row_y = col_x;
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
const int iy = channel*nrows_y + row_y;
const int iy = channel * channel_stride_y + row_y;
const float xi =
sycl::vec<sycl::half, 1>(x[ix])
@@ -1823,7 +1822,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
static void ggml_mul_mat_vec_nc_f16_f32_sycl(
const void *vx, const float *y, float *dst, const int ncols_x,
const int nrows_x, const int row_stride_x, const int nchannels_x,
const int nchannels_y, const int channel_stride_x, queue_ptr stream) {
const int nchannels_y, const int channel_stride_x, const int channel_stride_y, queue_ptr stream) {
const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -1835,7 +1834,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
row_stride_x, channel_stride_x,
row_stride_x, channel_stride_x, channel_stride_y,
nchannels_y / nchannels_x, item_ct1);
});
}
@@ -2124,8 +2123,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
#if GGML_SYCL_DNNL
if (!g_ggml_sycl_disable_dnn) {
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
DnnlGemmWrapper::row_gemm(ctx,row_diff, src1_ncols , ne10, src0_ptr,
DnnlGemmWrapper::to_dt<sycl::half>(), src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
}
else
@@ -2171,8 +2170,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
#if GGML_SYCL_DNNL
if (!g_ggml_sycl_disable_dnn) {
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
}
else
@@ -2776,6 +2775,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
const int64_t nb02 = src0->nb[2];
const int64_t ne12 = src1->ne[2];
const int64_t nb11 = src1->nb[1];
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
queue_ptr main_stream = ctx.stream();
@@ -2786,8 +2786,9 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
const int64_t row_stride_x = nb01 / sizeof(sycl::half);
const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
const int64_t channel_stride_y = nb11 / sizeof(float);
ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,channel_stride_y, main_stream);
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2841,8 +2842,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
float * dst_ddf = static_cast<float *>(dst->data);
const sycl::half * src1_f16 = static_cast<const sycl::half *>(src1->data);
const size_t type_size_src0 = ggml_type_size(src0->type);
const size_t type_size_src1 = ggml_type_size(src1->type);
GGML_ASSERT(nb10 == type_size_src1);
// SRC1 strides
int64_t s11 = nb11 / type_size_src1;
@@ -2854,11 +2855,40 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
if (src1->type != GGML_TYPE_F16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
" : converting src1 to fp16");
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
// iterate tensor dims and find the slowest moving dim and stride
int64_t last_dim=0;
int64_t last_str=0;
int64_t largest_str=0;
for(int i = 0; i< 4; i++){
// last stride is always the largest
if(src1->nb[i] == largest_str){
if(src1->ne[last_dim] == 1){
last_str = i;
last_dim = i;
}
}
if(src1->nb[i] > largest_str){
largest_str = src1->nb[i];
last_str = i;
last_dim = i;
}
}
#if GGML_SYCL_DNNL
// oneDNN handles strided data and does not need overhead of get_to_fp16_nc_sycl
const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1;
src1_f16_alloc.alloc(ne_src1);
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
GGML_ASSERT(to_fp16_sycl != nullptr);
to_fp16_sycl(src1_f16, src1_f16_alloc.get(), ne_src1, queue);
# else
const int64_t ne_src1 = ggml_nelements(src1);
src1_f16_alloc.alloc(ne_src1);
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
#endif
src1_f16 = src1_f16_alloc.get();
s11 = ne10;
@@ -2892,38 +2922,89 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
#if GGML_SYCL_DNNL
if (!g_ggml_sycl_disable_dnn) {
auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
(const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {
int64_t str_a0 = nb00 / type_size_src0;
int64_t str_a1 = nb01 / type_size_src0;
int64_t str_a2 = nb02 / type_size_src0;
DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
};
int64_t str_b0 = nb10 / type_size_src1;
int64_t str_b1 = nb11 / type_size_src1;
int64_t str_b2 = nb12 / type_size_src1;
if (r2 == 1 && r3 == 1) {
if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
}
else {
for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
auto launch_gemm_for_batches = [&ctx, queue](const sycl::half *src0,
const sycl::half *src1, float *dst,
int64_t a0, int64_t a1, int64_t batcha,
int64_t b0, int64_t b1, int64_t batchb,
int64_t sa0, int64_t sa1, int64_t sa2,
int64_t sb0, int64_t sb1, int64_t sb2,
int64_t sd2) {
bool supported_broadcast = batchb == batcha ? true
: batchb == 1 || batcha == 1 ? true
: false;
if (supported_broadcast) {
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0,
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2, src1,
DnnlGemmWrapper::to_dt<sycl::half>(), sb0, sb1, sb2, dst,
DnnlGemmWrapper::to_dt<float>(), queue, batcha, batchb);
} else {
// iterate over batches from smaller set of matrices (matrix 0)
int64_t batches0 = batcha;
int64_t batches1 = batchb;
if (batches0 > batches1) {
int64_t num_mul_mats = batches1;
int64_t sub_batch = batches0 / num_mul_mats;
// src0 is batched and bigger, shift and multiply with src1
for (int64_t i0 = 0; i0 < num_mul_mats; i0++) {
const sycl::half *src0_shifted = src0 + (sa2 * i0 * sub_batch);
const sycl::half *src1_shifted = src1 + (sb2 * i0);
float *dst_shifted = dst + (sd2 * i0 * sub_batch);
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
queue, sub_batch, 1);
}
} else {
int64_t num_mul_mats = batches0;
int64_t sub_batch = batches1 / num_mul_mats;
// src1 is batched and bigger, shift and multiply with src0
for (int64_t i1 = 0; i1 < num_mul_mats; i1++) {
const sycl::half *src0_shifted = src0 + (sa2 * i1);
const sycl::half *src1_shifted = src1 + (sb2 * i1 * sub_batch);
float *dst_shifted = dst + (sd2 * i1 * sub_batch);
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
queue, 1, sub_batch);
}
}
}
};
bool cont_batches_a = nb02 * ne02 == nb03;
bool cont_batches_b = nb12 * ne12 == nb13;
if (cont_batches_a && cont_batches_b) {
int64_t batches0 = ne02 * ne03;
int64_t batches1 = ne12 * ne13;
launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
ne10, ne11, batches1, str_a0, str_a1, str_a2, str_b0, str_b1,
str_b2, nb2 / sizeof(float));
} else {
for (int64_t b_a = 0; b_a < ne03; b_a++) {
const sycl::half *src0_f16_shifted
= src0_f16 + (nb03 * b_a / type_size_src0);
const sycl::half *src1_f16_shifted
= src1_f16 + (nb13 * b_a / type_size_src1);
float *dst_shifted = dst_ddf + (nb3 * b_a / sizeof(float));
int64_t batches0 = ne02;
int64_t batches1 = ne12;
launch_gemm_for_batches(src0_f16_shifted, src1_f16_shifted, dst_shifted,
ne00, ne01, batches0, ne10, ne11, batches1, str_a0, str_a1,
str_a2, str_b0, str_b1, str_b2, nb2 / sizeof(float));
}
}
} else {
// iterate over batches from smaller set of matrices (matrix 0)
for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
}
}
}
}
else
#endif
@@ -3263,10 +3344,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
}
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2] * src1->ne[3] > 1) {
// KQ + KQV multi-batch
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {
@@ -4303,6 +4384,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
{
// TODO: add support
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
} break;
case GGML_OP_CPY:
+41 -41
View File
@@ -6,46 +6,49 @@ static constexpr bool is_arithmetic_v() {
return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
}
}
template<typename TIn, typename TOut>
static inline std::enable_if_t<utils::is_arithmetic_v<TIn>() && utils::is_arithmetic_v<TOut>(), void>
convert (const char* src, char* dst) {
auto src_val = *reinterpret_cast<const TIn*>(src);
auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut, sycl::rounding_mode::automatic>()[0];
*reinterpret_cast<TOut*>(dst) = dst_val;;
*reinterpret_cast<TOut*>(dst) = dst_val;
}
template<typename TIn, typename TOut>
static void k_set_rows(
const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12,
const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t ne11, const int64_t ne12,
const size_t nb01, const size_t nb02, const size_t nb03,
const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
const size_t src_type_size, const size_t dst_type_size,
const sycl::nd_item<3> & item_ct1) {
const int64_t total_elements,
const sycl::nd_item<1> & item_ct1) {
const int i03 = item_ct1.get_group(0);
const int i02 = item_ct1.get_group(1);
const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); // Row index
if (i01 >= ne01) {
const int64_t i = item_ct1.get_global_linear_id();
if (i >= total_elements) {
return;
}
const int i12 = i03 % ne12;
const int i11 = i02 % ne11;
const int i10 = i01;
const int64_t i03 = i / (ne00 * ne01 * ne02);
const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
const int64_t i12 = i03 % ne12;
const int64_t i11 = i02 % ne11;
const int64_t i10 = i01;
const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
const char * src_elem = src0_row + i00 * src_type_size;
char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
char * dst_elem = dst_row_ptr + i00 * dst_type_size;
for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
const char * src_elem = src0_row + col * src_type_size;
char * dst_elem = dst_row_ptr + col * dst_type_size;
convert<TIn, TOut>(src_elem, dst_elem);
}
convert<TIn, TOut>(src_elem, dst_elem);
}
template<typename TIn, typename TOut>
@@ -58,33 +61,30 @@ static void set_rows_sycl(
const size_t src_type_size, const size_t dst_type_size,
queue_ptr stream) {
constexpr int max_threads_per_row = 64; // KEEPING 64 for now
const int threads_per_row = std::min((int)ne00, max_threads_per_row);
const int64_t total_elements = ne00 * ne01 * ne02 * ne03;
constexpr int max_threads_per_block = 64;
const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row);
constexpr int block_size = 64;
const int64_t grid_size = ceil_div(total_elements, block_size);
const sycl::range<3> block_size(1, rows_per_block, threads_per_row);
const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block);
sycl_parallel_for(
stream,
sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
k_set_rows<TIn, TOut>(
src0_d, src1_d, dst_d,
ne00, ne01, ne11, ne12,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
src_type_size, dst_type_size,
item_ct1
);
}
);
sycl_parallel_for(
stream,
sycl::nd_range<1>(grid_size * block_size, block_size),
[=](sycl::nd_item<1> item_ct1) {
k_set_rows<TIn, TOut>(
src0_d, src1_d, dst_d,
ne00, ne01, ne02,
ne11, ne12,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
src_type_size, dst_type_size,
total_elements,
item_ct1
);
}
);
}
void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * src0 = dst->src[0];
@@ -122,7 +122,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
nb1, nb2, nb3,
sizeof(float), sizeof(sycl::half),
stream
);
);
break;
default:
GGML_ABORT("Unsupported tensor type!");
+9 -3
View File
@@ -2835,10 +2835,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
return s;
};
bool rte = device->float_controls_rte_fp16;
#define CREATE_BINARY(name, namemod, spec) \
for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \
ggml_vk_create_pipeline(device, device->pipeline_ ## name ## namemod[s0][s1][d], \
#name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d], name ## _data[s0][s1][d], \
#name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d][rte], name ## _data[s0][s1][d][rte], \
"main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1);
CREATE_BINARY(add, , {0})
@@ -2890,8 +2891,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
#undef CREATE_UNARY
#define CREATE_GLU(name) \
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true);
if (device->float_controls_rte_fp16) { \
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16_rte", name ## _f16_rte_len, name ## _f16_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
} else { \
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
}
CREATE_GLU(geglu)
CREATE_GLU(reglu)
@@ -1,10 +1,6 @@
#version 450
#if RTE16
#extension GL_EXT_spirv_intrinsics : enable
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
#endif // RTE16
#include "rte.comp"
#include "types.comp"
#if defined(SET_ROWS) && QUANT_K == 1
@@ -1,6 +1,8 @@
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_control_flow_attributes : require
#include "rte.comp"
layout (push_constant) uniform parameter
{
uint ne;
@@ -1,5 +1,7 @@
#extension GL_EXT_shader_16bit_storage : require
#include "rte.comp"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -1,12 +1,9 @@
#version 450
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_spirv_intrinsics: enable
#extension GL_EXT_control_flow_attributes : require
#if RTE16
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
#endif
#include "rte.comp"
layout (push_constant) uniform parameter
{
@@ -1,11 +1,8 @@
#include "types.comp"
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_spirv_intrinsics: enable
#if RTE16
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
#endif
#include "rte.comp"
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
@@ -0,0 +1,5 @@
#if RTE16
#extension GL_EXT_spirv_intrinsics : enable
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
#endif // RTE16
@@ -537,8 +537,10 @@ void process_shaders() {
for (auto src0_f16 : {false, true}) {
for (auto src1_f16 : {false, true}) {
for (auto dst_f16 : {false, true}) {
auto name = op + get_suffix(src0_f16, src1_f16, dst_f16);
string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}});
for (auto rte : {false, true}) {
auto name = op + get_suffix(src0_f16, src1_f16, dst_f16) + (rte ? "_rte" : "");
string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
}
}
}
}
@@ -592,16 +594,19 @@ void process_shaders() {
string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("geglu_f16", "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("geglu_f32", "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("reglu_f16", "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("reglu_f32", "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("swiglu_f16", "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("swiglu_f32", "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("geglu_erf_f16", "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("geglu_erf_f32", "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("geglu_quick_f16","geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("geglu_quick_f32","geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
for (auto rte : {false, true}) {
std::string suffix = rte ? "_rte" : "";
string_to_spv("geglu_f16" + suffix, "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("geglu_f32" + suffix, "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("reglu_f16" + suffix, "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("reglu_f32" + suffix, "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("swiglu_f16" + suffix, "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("swiglu_f32" + suffix, "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("geglu_erf_f16" + suffix, "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("geglu_erf_f32" + suffix, "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("geglu_quick_f16" + suffix,"geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
string_to_spv("geglu_quick_f32" + suffix,"geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
}
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
@@ -709,11 +714,59 @@ void write_output_files() {
std::remove(path.c_str());
}
}
std::string suffixes[2] = {"_f32", "_f16"};
for (const char *op : {"add", "sub", "mul", "div"}) {
fprintf(hdr, "extern unsigned char *%s_data[2][2][2];\n", op);
fprintf(hdr, "extern uint64_t %s_len[2][2][2];\n", op);
fprintf(src, "unsigned char *%s_data[2][2][2] = {{{%s_f32_f32_f32_data, %s_f32_f32_f16_data}, {%s_f32_f16_f32_data, %s_f32_f16_f16_data}}, {{%s_f16_f32_f32_data, %s_f16_f32_f16_data}, {%s_f16_f16_f32_data, %s_f16_f16_f16_data}}};\n", op, op, op, op, op, op, op, op, op);
fprintf(src, "uint64_t %s_len[2][2][2] = {{{%s_f32_f32_f32_len, %s_f32_f32_f16_len}, {%s_f32_f16_f32_len, %s_f32_f16_f16_len}}, {{%s_f16_f32_f32_len, %s_f16_f32_f16_len}, {%s_f16_f16_f32_len, %s_f16_f16_f16_len}}};\n", op, op, op, op, op, op, op, op, op);
fprintf(hdr, "extern unsigned char *%s_data[2][2][2][2];\n", op);
fprintf(hdr, "extern uint64_t %s_len[2][2][2][2];\n", op);
std::string data = "unsigned char *" + std::string(op) + "_data[2][2][2][2] = ";
std::string len = "uint64_t " + std::string(op) + "_len[2][2][2][2] = ";
for (uint32_t t0 = 0; t0 < 2; ++t0) {
if (t0 == 0) {
data += "{";
len += "{";
}
for (uint32_t t1 = 0; t1 < 2; ++t1) {
if (t1 == 0) {
data += "{";
len += "{";
}
for (uint32_t t2 = 0; t2 < 2; ++t2) {
if (t2 == 0) {
data += "{";
len += "{";
}
for (uint32_t rte = 0; rte < 2; ++rte) {
if (rte == 0) {
data += "{";
len += "{";
}
data += op + suffixes[t0] + suffixes[t1] + suffixes[t2] + ((rte != 0) ? "_rte" : "");
len += op + suffixes[t0] + suffixes[t1] + suffixes[t2] + ((rte != 0) ? "_rte" : "");
data += "_data,";
len += "_len,";
if (rte == 1) {
data += "}, ";
len += "}, ";
}
}
if (t2 == 1) {
data += "}, ";
len += "}, ";
}
}
if (t1 == 1) {
data += "}, ";
len += "}, ";
}
}
if (t0 == 1) {
data += "};\n";
len += "};\n";
}
}
fprintf(src, data.c_str());
fprintf(src, len.c_str());
}
fclose(hdr);
fclose(src);
+32
View File
@@ -317,6 +317,7 @@ class MODEL_ARCH(IntEnum):
PHI3 = auto()
PHIMOE = auto()
PLAMO = auto()
PLAMO2 = auto()
CODESHELL = auto()
ORION = auto()
INTERNLM2 = auto()
@@ -631,6 +632,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PHIMOE: "phimoe",
MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.PLAMO2: "plamo2",
MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2",
@@ -1369,6 +1371,36 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.PLAMO2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_POST_NORM,
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_X,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT,
MODEL_TENSOR.SSM_DT_NORM,
MODEL_TENSOR.SSM_B_NORM,
MODEL_TENSOR.SSM_C_NORM,
],
MODEL_ARCH.GPT2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.POS_EMBD,
+52 -30
View File
@@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 granite-hybrid
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
"tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon
@@ -63,7 +63,7 @@ class TensorNameMap:
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe plamo2
"output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
@@ -77,7 +77,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe plamo2
"norm", # llama-pth
"transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2
@@ -126,6 +126,7 @@ class TensorNameMap:
"h.{bid}.ln_1", # gpt2
"transformer.h.{bid}.ln", # phi2
"model.layers.layers.{bid}.norm", # plamo
"model.layers.layers.{bid}.pre_mixer_norm", # plamo2
"model.layers.{bid}.attention_norm", # internlm2
"model.layers.{bid}.norm", # mamba-qbert
"backbone.layers.{bid}.norm", # mamba
@@ -163,6 +164,7 @@ class TensorNameMap:
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
"encoder.layers.{bid}.mixer.Wqkv", # jina
"model.layers.{bid}.self_attn.qkv_proj", # phi3
"model.layers.layers.{bid}.mixer.qkv_proj", # plamo2
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
"transformer.layers.{bid}.attn.qkv_proj", # openelm
"transformer_encoder.{bid}.qkv", # neobert
@@ -233,6 +235,7 @@ class TensorNameMap:
"h.{bid}.attn.c_proj", # gpt2
"transformer.h.{bid}.mixer.out_proj", # phi2
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
"model.layers.layers.{bid}.mixer.o_proj", # plamo2
"model.layers.{bid}.attention.wo", # internlm2
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
"encoder.layers.{bid}.mixer.out_proj", # jina
@@ -255,8 +258,9 @@ class TensorNameMap:
),
MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
"model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
"model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
"model.layers.layers.{bid}.post_mixer_norm.weight", # plamo2
),
# Rotary embeddings
@@ -286,6 +290,7 @@ class TensorNameMap:
"model.layers.{bid}.pre_moe_layernorm", # mini-jamba
"model.layers.{bid}.post_attention_layernorm", # llama4
"transformer_encoder.{bid}.ffn_norm", # neobert
"model.layers.layers.{bid}.pre_mlp_norm", # plamo2
),
# Post feed-forward norm
@@ -298,6 +303,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
"model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
"model.layers.layers.{bid}.post_mlp_norm.weight", # plamo2
"model.layers.{bid}.feed_forward.up_proj",
),
@@ -342,6 +348,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414
"model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.layers.{bid}.mlp.gate_up_proj", # plamo2
"model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
@@ -469,6 +476,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm
"model.layers.layers.{bid}.mixer.q", # plamo2
),
MODEL_TENSOR.ATTN_K_NORM: (
@@ -479,6 +487,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm
"model.layers.layers.{bid}.mixer.k", # plamo2
),
MODEL_TENSOR.ROPE_FREQS: (
@@ -559,27 +568,31 @@ class TensorNameMap:
),
MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj", # mamba-hf
"backbone.layers.{bid}.mixer.in_proj", # mamba
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
"model.layers.{bid}.in_proj", # mamba-hf
"backbone.layers.{bid}.mixer.in_proj", # mamba
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.in_proj", # plamo2
),
MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d", # mamba-hf
"backbone.layers.{bid}.mixer.conv1d", # mamba
"model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid
"model.layers.{bid}.conv1d", # mamba-hf
"backbone.layers.{bid}.mixer.conv1d", # mamba
"model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.conv1d", # plamo2
),
MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj", # mamba-hf
"backbone.layers.{bid}.mixer.x_proj", # mamba
"model.layers.{bid}.mamba.x_proj", # jamba
"model.layers.{bid}.x_proj", # mamba-hf
"backbone.layers.{bid}.mixer.x_proj", # mamba
"model.layers.{bid}.mamba.x_proj", # jamba
"model.layers.layers.{bid}.mixer.bcdt_proj", # plamo2
),
MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj", # mamba-hf
"backbone.layers.{bid}.mixer.dt_proj", # mamba
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
"model.layers.{bid}.dt_proj", # mamba-hf
"backbone.layers.{bid}.mixer.dt_proj", # mamba
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.dt_proj", # plamo2
),
MODEL_TENSOR.SSM_DT_NORM: (
@@ -587,25 +600,33 @@ class TensorNameMap:
),
MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log", # mamba-hf
"backbone.layers.{bid}.mixer.A_log", # mamba
"model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid
"model.layers.{bid}.A_log", # mamba-hf
"backbone.layers.{bid}.mixer.A_log", # mamba
"model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.A_log", # plamo2
),
MODEL_TENSOR.SSM_B_NORM: (
"model.layers.{bid}.mamba.b_layernorm", # jamba
"model.layers.{bid}.mamba.B_layernorm", # mini-jamba
"model.layers.{bid}.mamba.b_layernorm", # jamba
"model.layers.{bid}.mamba.B_layernorm", # mini-jamba
"model.layers.layers.{bid}.mixer.B_norm.weight", # plamo2
),
MODEL_TENSOR.SSM_C_NORM: (
"model.layers.{bid}.mamba.c_layernorm", # jamba
"model.layers.{bid}.mamba.C_layernorm", # mini-jamba
"model.layers.{bid}.mamba.c_layernorm", # jamba
"model.layers.{bid}.mamba.C_layernorm", # mini-jamba
"model.layers.layers.{bid}.mixer.C_norm.weight", # plamo2
),
MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D", # mamba-hf
"backbone.layers.{bid}.mixer.D", # mamba
"model.layers.{bid}.mamba.D", # jamba falcon-h1 granite-hybrid
"model.layers.{bid}.D", # mamba-hf
"backbone.layers.{bid}.mixer.D", # mamba
"model.layers.{bid}.mamba.D", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.D", # plamo2
),
MODEL_TENSOR.SSM_DT_NORM: (
"model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
),
MODEL_TENSOR.SSM_NORM: (
@@ -614,9 +635,10 @@ class TensorNameMap:
),
MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj", # mamba-hf
"backbone.layers.{bid}.mixer.out_proj", # mamba
"model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid
"model.layers.{bid}.out_proj", # mamba-hf
"backbone.layers.{bid}.mixer.out_proj", # mamba
"model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.out_proj", # plamo2
),
MODEL_TENSOR.TIME_MIX_W0: (
+7 -6
View File
@@ -71,12 +71,13 @@ extern "C" {
typedef int32_t llama_seq_id;
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
};
enum llama_rope_type {
@@ -0,0 +1,34 @@
{%- if not add_generation_prompt is defined -%}
{%- set add_generation_prompt = true -%}
{%- endif -%}
{%- set ns = namespace(system_prompt='') -%}
{%- for message in messages -%}
{%- if message['role'] == 'system' -%}
{%- set ns.system_prompt = message['content'] -%}
{%- endif -%}
{%- endfor -%}
{{bos_token}}
{%- if ns.system_prompt != '' -%}
{{- 'System: ' + ns.system_prompt + '\n\n' -}}
{%- endif -%}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- 'User: ' + message['content']|trim + '\n\n' -}}
{%- endif -%}
{%- if message['role'] == 'assistant' and message['content'] is not none -%}
{%- set content = message['content'] -%}
{%- if '</think>' in content -%}
{%- set content = content.split('</think>')[-1] -%}
{%- endif -%}
{{- 'Assistant: ' + content|trim + '\n\n' -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- 'Assistant:' -}}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- ' <think>\n</think>' }}
{%- endif %}
{%- if enable_thinking is defined and enable_thinking is true %}
{{- ' <think>' }}
{%- endif %}
{%- endif -%}
+1
View File
@@ -3,6 +3,7 @@
-r ../tools/server/tests/requirements.txt
-r ./requirements-compare-llama-bench.txt
-r ./requirements-server-bench.txt
-r ./requirements-pydantic.txt
-r ./requirements-test-tokenizer-random.txt
@@ -0,0 +1,5 @@
datasets~=3.2.0
matplotlib~=3.10.0
numpy~=1.26.4
requests~=2.32.3
tqdm~=4.67.1
+210
View File
@@ -0,0 +1,210 @@
#!/usr/bin/env python3
import argparse
import json
import subprocess
from time import sleep, time
from typing import Optional
import datasets
import logging
import matplotlib.pyplot as plt
import numpy as np
import requests
from tqdm.contrib.concurrent import thread_map
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger("server-bench")
def get_prompts(n_prompts: int) -> list[str]:
logger.info("Loading MMLU dataset...")
ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"] # type: ignore
if n_prompts >= 0:
ret = ret[:n_prompts]
return ret
def get_server(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int) -> dict:
logger.info("Starting the llama.cpp server...")
address = f"http://localhost:{port}"
popen_args: list[str] = [
path_server,
"--flash-attn",
"--n-gpu-layers", str(n_gpu_layers),
"--parallel", str(parallel),
"--ctx-size", str(parallel * ctx_size),
"--model", path_model,
"--port", str(port),
"--swa-full", # FIXME performance bad otherwise
# "--attn-streams",
]
fout = open("bench.log", "w") if path_log is not None else subprocess.DEVNULL
process = subprocess.Popen(popen_args, stdout=fout, stderr=subprocess.STDOUT)
n_failures: int = 0
while True:
try:
sleep(1.0)
exit_code = process.poll()
if exit_code is not None:
raise RuntimeError(f"llama.cpp server for {path_model} exited unexpectedly with exit code {exit_code}")
response = requests.get(f"{address}/health")
if response.status_code == 200:
break
except requests.ConnectionError:
n_failures += 1
if n_failures >= 10:
raise RuntimeError(f"llama.cpp server for {path_model} is not healthy after 10 seconds")
return {"process": process, "address": address, "fout": fout}
def get_prompt_length(data: dict) -> int:
session = data["session"]
server_address: str = data["server_address"]
response = session.post(
f"{server_address}/apply-template",
json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
)
if response.status_code != 200:
raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
prompt: str = json.loads(response.text)["prompt"]
response = session.post(
f"{server_address}/tokenize",
json={"content": prompt, "add_special": True}
)
if response.status_code != 200:
raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
tokens: list[str] = json.loads(response.text)["tokens"]
return len(tokens)
def send_prompt(data: dict) -> tuple[float, list[float]]:
session = data["session"]
server_address: str = data["server_address"]
response = session.post(
f"{server_address}/apply-template",
json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
)
if response.status_code != 200:
raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
prompt: str = json.loads(response.text)["prompt"]
json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
response = session.post(f"{server_address}/completion", json=json_data, stream=True)
last_valid_line: str = ""
token_arrival_times: list[float] = []
for line in response.iter_lines(decode_unicode=True):
if not line.startswith("data: "):
continue
last_valid_line = line
token_arrival_times.append(time())
token_arrival_times = token_arrival_times[:-1]
if response.status_code != 200:
raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
timings: dict = json.loads(last_valid_line[6:])["timings"]
return (timings["prompt_ms"], token_arrival_times)
def benchmark(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int, n_prompts: int, n_predict: int):
num_workers: int = parallel + 1
prompts: list[str] = get_prompts(n_prompts)
server: Optional[dict] = None
session = None
try:
server = get_server(path_server, path_model, path_log, port, n_gpu_layers, parallel, ctx_size)
server_address: str = server["address"]
adapter = requests.adapters.HTTPAdapter(pool_connections=num_workers, pool_maxsize=num_workers) # type: ignore
session = requests.Session()
session.mount("http://", adapter)
session.mount("https://", adapter)
data: list[dict] = []
for i, p in enumerate(prompts):
data.append({"session": session, "server_address": server_address, "prompt": p, "n_predict": n_predict, "seed": i})
logger.info("Getting the prompt lengths...")
prompt_n = [get_prompt_length(d) for d in data]
logger.info("Starting the benchmark...\n")
t0 = time()
results: list[tuple[int, list[float]]] = thread_map(send_prompt, data, max_workers=num_workers, chunksize=1)
finally:
if server is not None:
server["process"].terminate()
server["process"].wait()
if session is not None:
session.close()
prompt_ms = []
token_t = []
depth_sum: int = 0
for pn, (pms, tat) in zip(prompt_n, results):
prompt_ms.append(pms)
token_t += tat
n_tokens: int = len(tat)
depth_sum += n_tokens * pn
depth_sum += n_tokens * (n_tokens + 1) // 2
prompt_n = np.array(prompt_n, dtype=np.int64)
prompt_ms = np.array(prompt_ms, dtype=np.float64)
token_t = np.array(token_t, dtype=np.float64)
token_t -= t0
token_t_last = np.max(token_t)
logger.info("")
logger.info(f"Benchmark duration: {token_t_last:.2f} s")
logger.info(f"Request throughput: {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last/60):.2f} requests/min")
logger.info(f"Total prompt length: {np.sum(prompt_n)} tokens")
logger.info(f"Average prompt length: {np.mean(prompt_n):.2f} tokens")
logger.info(f"Average prompt latency: {np.mean(prompt_ms):.2f} ms")
logger.info(f"Average prompt speed: {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
logger.info(f"Total generated tokens: {token_t.shape[0]}")
logger.info(f"Average generation depth: {depth_sum / token_t.shape[0]:.2f} tokens")
logger.info(f"Average total generation speed: {token_t.shape[0] / token_t_last:.2f} tokens/s")
logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")
plt.figure()
plt.scatter(prompt_n, prompt_ms, s=10.0, marker=".", alpha=0.25)
plt.xlim(0, 1.05 * np.max(prompt_n))
plt.ylim(0, 1.05 * np.max(prompt_ms))
plt.title(path_model)
plt.xlabel("Prompt length [tokens]")
plt.ylabel("Time to first token [ms]")
plt.savefig("prompt_time.png", dpi=240)
bin_max = np.ceil(token_t_last) + 1
plt.figure()
plt.hist(token_t, np.arange(0, bin_max))
plt.xlim(0, bin_max + 1)
plt.title(path_model)
plt.xlabel("Time [s]")
plt.ylabel("Num. tokens generated per second")
plt.savefig("gen_rate.png", dpi=240)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Tool for benchmarking the throughput of the llama.cpp HTTP server. "
"Results are printed to console and visualized as plots (saved to current working directory).")
parser.add_argument("--path_server", type=str, default="llama-server", help="Path to the llama.cpp server binary")
parser.add_argument("--path_model", type=str, required=True, help="Path to the model to use for the benchmark")
parser.add_argument("--path_log", type=str, default=None, help="Path to the model to use for the benchmark")
parser.add_argument("--port", type=int, default=18725, help="Port to use for the server during the benchmark")
parser.add_argument("--n_gpu_layers", type=int, default=999, help="Number of GPU layers for the server")
parser.add_argument("--parallel", type=int, default=16, help="Number of slots for the server")
parser.add_argument("--ctx_size", type=int, default=4096, help="Server context size per slot")
parser.add_argument("--n_prompts", type=int, default=1000, help="Number of prompts to evaluate")
parser.add_argument("--n_predict", type=int, default=2048, help="Max. number of tokens to predict per prompt")
args = parser.parse_args()
benchmark(**vars(args))
+32
View File
@@ -34,6 +34,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PHIMOE, "phimoe" },
{ LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_PLAMO2, "plamo2" },
{ LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" },
@@ -784,6 +785,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_PLAMO2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
{ LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
{ LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
{ LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_CODESHELL,
{
@@ -2094,6 +2125,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_JAMBA:
case LLM_ARCH_FALCON_H1:
case LLM_ARCH_PLAMO2:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_LFM2:
return true;
+1
View File
@@ -38,6 +38,7 @@ enum llm_arch {
LLM_ARCH_PHI3,
LLM_ARCH_PHIMOE,
LLM_ARCH_PLAMO,
LLM_ARCH_PLAMO2,
LLM_ARCH_CODESHELL,
LLM_ARCH_ORION,
LLM_ARCH_INTERNLM2,
+1 -1
View File
@@ -170,7 +170,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
// EXAONE-3.0-7.8B-Instruct
return LLM_CHAT_TEMPLATE_EXAONE_3;
} else if (tmpl_contains("rwkv-world")) {
} else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
} else if (tmpl_contains("<|start_of_role|>")) {
return LLM_CHAT_TEMPLATE_GRANITE;
+13 -2
View File
@@ -731,7 +731,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
const auto & hparams = model.hparams;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
const int32_t n_vocab = model.vocab.n_tokens();
// note: during encode, we always pass the full sequence starting from pos = 0
if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
@@ -791,10 +792,20 @@ int llama_context::encode(const llama_batch & batch_inp) {
}
}
auto * t_logits = res->get_logits();
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
// extract logits
if (logits && t_logits) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float));
}
// extract embeddings
if (t_embd) {
if (embd && t_embd) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
+414
View File
@@ -935,6 +935,33 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_PLAMO2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
// Load Mamba SSM parameters
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
}
switch (hparams.n_layer) {
case 16: type = LLM_TYPE_1B; break;
case 32:
if (hparams.n_embd == 2048) {
type = LLM_TYPE_2B;
} else if (hparams.n_embd == 4096) {
type = LLM_TYPE_8B;
}
break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GPT2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2938,6 +2965,73 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
case LLM_ARCH_PLAMO2:
{
const uint32_t d_conv = hparams.ssm_d_conv;
const uint32_t d_state = hparams.ssm_d_state;
const uint32_t num_heads = hparams.ssm_dt_rank;
const uint32_t intermediate_size = hparams.ssm_d_inner;
const uint32_t head_dim = intermediate_size / num_heads;
const uint32_t qk_dim = head_dim;
const uint32_t v_dim = head_dim;
const int64_t num_attention_heads = hparams.n_head();
const int64_t q_num_heads = num_attention_heads;
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
bool is_mamba_layer = hparams.is_recurrent(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
if (is_mamba_layer) {
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
} else {
const int64_t num_key_value_heads = hparams.n_head_kv(i);
const int64_t k_num_heads = num_key_value_heads;
const int64_t v_num_heads = num_key_value_heads;
const int64_t q_proj_dim = q_num_heads * qk_dim;
const int64_t k_proj_dim = k_num_heads * qk_dim;
const int64_t v_proj_dim = v_num_heads * v_dim;
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
}
// All layers have post-attention norm, FFN norm, and FFN tensors
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
}
} break;
case LLM_ARCH_GPT2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5209,6 +5303,7 @@ void llama_model::print_info() const {
arch == LLM_ARCH_MAMBA2 ||
arch == LLM_ARCH_JAMBA ||
arch == LLM_ARCH_FALCON_H1 ||
arch == LLM_ARCH_PLAMO2 ||
arch == LLM_ARCH_GRANITE_HYBRID) {
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -15476,6 +15571,320 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
}
};
struct llm_build_plamo2 : public llm_graph_context_mamba {
llm_build_plamo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
// {n_embd, n_tokens}
inpL = build_inp_embd(model.tok_embd);
cb(inpL, "embedding_output", -1);
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_hybrid = build_inp_mem_hybrid();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * residual = inpL;
// ggml_graph_add_node(gf, model.layers[il].attn_norm);
// cb(model.layers[il].attn_norm, "attn_norm", il);
// pre_mixer_norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// check if this layer is Mamba or Attention
bool is_mamba_layer = hparams.is_recurrent(il);
if (is_mamba_layer) {
// PLaMo-2 Mamba layer
cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il);
} else {
// PLaMo-2 Attention layer
cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, gf, cur, model, il);
}
// post_mixer_norm
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
// residual connection
cur = ggml_add(ctx0, cur, residual);
cb(cur, "attn_residual", il);
residual = cur;
// pre-ffn norm
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_pre_norm", il);
// feed-forward network
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
// post ffn norm
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_post_norm", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
// residual connection
cur = ggml_add(ctx0, cur, residual);
cb(cur, "ffn_residual", il);
inpL = cur;
}
cur = inpL;
// final norm
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
// Explicitly mark as output tensor to ensure proper backend assignment
ggml_set_output(cur);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
private:
ggml_tensor * build_plamo2_attn_layer(
llm_graph_input_attn_kv_unified * inp,
ggml_tensor * inp_pos,
ggml_cgraph * gf,
ggml_tensor * cur,
const llama_model & model,
int il) {
// self-attention
{
// PLaMo-2 uses combined QKV tensor
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
cb(qkv, "qkv", il);
// split QKV tensor into Q, K, V
const int64_t n_embd_head_q = hparams.n_embd_head_k;
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_head_v = hparams.n_embd_head_v;
int32_t n_head_kv = hparams.n_head_kv(il);
const int64_t q_offset = 0;
const int64_t k_offset = n_embd_head_q * n_head;
const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cur = build_attn(inp, gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
}
cb(cur, "attn_out", il);
return cur;
}
ggml_tensor * build_plamo2_mamba_layer(
llm_graph_input_rs * inp,
ggml_cgraph * gf,
ggml_tensor * cur,
const llama_model & model,
const llama_ubatch & ubatch,
int il) {
const auto * mctx_cur = inp->mctx;
const auto kv_head = mctx_cur->get_head();
const int64_t d_conv = hparams.ssm_d_conv;
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
const int64_t n_heads = hparams.ssm_dt_rank;
const int64_t head_dim = d_inner / n_heads;
const int64_t n_group = hparams.ssm_n_group;
const int64_t n_seqs = ubatch.n_seqs;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs);
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
// in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
cb(zx, "mamba_in_proj", il);
// {8192, 5, 1, 1} -> {8192, 1, 5, 1}
zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
cb(zx, "mamba_in_proj_out", il);
// split into z and x
// => {head_dim * n_heads, n_seq_tokens, n_seqs}
ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
x = ggml_cont(ctx0, x);
x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
// x = ggml_permute(ctx0, x, 0, 2, 1, 3);
cb(x, "mamba_x_split", il);
ggml_tensor * z = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0);
cb(z, "mamba_z_split", il);
// conv1d
{
// => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
x = ggml_view_2d(ctx0, x, d_inner, n_seq_tokens * n_seqs, d_inner * x->nb[0], 0);
ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
cb(conv_x, "mamba_conv1d_input", il);
// copy last (d_conv - 1) columns back into the state cache
ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
ggml_build_forward_expand(gf,
ggml_cpy(ctx0, last_conv,
ggml_view_1d(ctx0, conv_states_all,
(d_conv - 1)*(d_inner)*(n_seqs),
kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
// 1D convolution
x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
cb(x, "mamba_conv1d", il);
x = ggml_silu(ctx0, x);
cb(x, "mamba_conv1d_silu", il);
}
// SSM
{
// bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
cb(x_bcdt, "mamba_bcdt_proj", il);
// split into dt, B, C
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
ggml_tensor * B = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
ggml_tensor * C = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state);
ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state));
cb(B, "mamba_B_raw", il);
cb(C, "mamba_C_raw", il);
cb(dt, "mamba_dt_raw", il);
// Apply RMS norm to dt, B, C (PLaMo-2 specific)
B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il);
C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il);
dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
cb(B, "mamba_B_normed", il);
cb(C, "mamba_C_normed", il);
cb(dt, "mamba_dt_normed", il);
// dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
dt = build_lora_mm(model.layers[il].ssm_dt, dt);
dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
cb(dt, "mamba_dt_proj", il);
ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads);
cb(A, "mamba_A", il);
x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);
// use the states and the indices provided by build_recurrent_state
// (this is necessary in order to properly use the states before they are overwritten,
// while avoiding to make unnecessary copies of the states)
auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
// Custom operator to optimize the parallel associative scan
// as described in the Annex D of the Mamba paper.
// => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
};
ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
cb(y_ssm, "mamba_ssm_scan", il);
// store last states
ggml_build_forward_expand(gf,
ggml_cpy(ctx0,
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
cb(y, "mamba_y_view", il);
// Add D parameter and apply gating with z
// {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
cb(y, "mamba_y_add_d", il);
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
cb(y, "mamba_y_swiglu_z", il);
// out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
cur = build_lora_mm(model.layers[il].ssm_out, y);
cb(cur, "mamba_out_proj", il);
}
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
cb(cur, "mamba_out", il);
return cur;
}
};
struct llm_build_arcee : public llm_graph_context {
llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -16262,6 +16671,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_plamo>(*this, params, gf);
} break;
case LLM_ARCH_PLAMO2:
{
llm = std::make_unique<llm_build_plamo2>(*this, params, gf);
} break;
case LLM_ARCH_GPT2:
{
llm = std::make_unique<llm_build_gpt2>(*this, params, gf);
@@ -16651,6 +17064,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_PHI3:
case LLM_ARCH_PHIMOE:
case LLM_ARCH_PLAMO:
case LLM_ARCH_PLAMO2:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_GEMMA3:
+1 -2
View File
@@ -884,8 +884,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) {
LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
new_type = qtype;
break; // if two or more types are specified for the tensor, first match wins
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
}
}
}
+341 -8
View File
@@ -11,6 +11,7 @@
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cmath>
#include <cstdarg>
#include <cstring>
#include <forward_list>
@@ -1196,6 +1197,284 @@ private:
const llm_tokenizer_rwkv & tokenizer;
};
struct llm_tokenizer_plamo2 : llm_tokenizer {
llm_tokenizer_plamo2(const llama_vocab & vocab) {
build(vocab);
}
void build(const llama_vocab & vocab) {
// Reset internal structures
tokens_.clear();
bytes_.assign(256, 0);
to_suffix_id_.clear();
table_.clear();
// Build token list and byte mapping
std::unordered_map<std::string, float> suffix_to_score;
std::unordered_map<std::string, llama_token> token_to_id;
for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
const auto & entry = vocab.get_token_data(token_id);
tokens_.push_back(entry.text);
token_to_id[entry.text] = static_cast<llama_token>(token_id);
// Handle byte tokens
if (vocab.is_byte(token_id)) {
if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
std::string hex_str = entry.text.substr(3, 2);
int byte_val = std::stoi(hex_str, nullptr, 16);
bytes_[byte_val] = static_cast<llama_token>(token_id);
}
continue;
}
// Add token and all its suffixes to suffix_to_score
suffix_to_score[entry.text] = entry.score;
// Extract suffixes character by character (UTF-8 aware)
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
for (size_t i = 1; i < cpts.size(); ++i) {
std::string suffix;
for (size_t j = i; j < cpts.size(); ++j) {
suffix += unicode_cpt_to_utf8(cpts[j]);
}
if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
}
}
}
// Check that all byte tokens are set
for (int i = 0; i < 256; ++i) {
if (bytes_[i] == 0) {
throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
}
}
// Build suffix list in lexicographical order of reversed strings
std::vector<std::string> suffixes;
for (const auto & pair : suffix_to_score) {
suffixes.push_back(pair.first);
}
suffixes.push_back(""); // Empty suffix
std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
std::string rev_a(a.rbegin(), a.rend());
std::string rev_b(b.rbegin(), b.rend());
return rev_a < rev_b;
});
// Build suffix_to_id and to_suffix_id_
std::unordered_map<std::string, int32_t> suffix_to_id;
int32_t num_pieces = 0;
for (const auto & suffix : suffixes) {
suffix_to_id[suffix] = num_pieces;
if (!suffix.empty()) {
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
std::string remaining;
for (size_t i = 1; i < cpts.size(); ++i) {
remaining += unicode_cpt_to_utf8(cpts[i]);
}
int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
to_suffix_id_[piece_code] = num_pieces;
// Count number of pieces for this suffix
int32_t pieces_for_suffix = 1; // sentinel row
for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
std::string piece;
for (int32_t i = 0; i < piece_length; ++i) {
piece += unicode_cpt_to_utf8(cpts[i]);
}
if (suffix_to_score.find(piece) != suffix_to_score.end()) {
pieces_for_suffix++;
}
}
num_pieces += pieces_for_suffix;
} else {
num_pieces++; // Empty suffix contributes one piece (sentinel row)
}
}
// Build flattened table
table_.resize(num_pieces, std::vector<int32_t>(4, 0));
int32_t table_idx = 0;
for (const auto & suffix : suffixes) {
// Add all prefixes of the suffix to the table (in decreasing order of length)
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
std::string piece;
for (int32_t i = 0; i < piece_length; ++i) {
piece += unicode_cpt_to_utf8(cpts[i]);
}
auto score_it = suffix_to_score.find(piece);
if (score_it == suffix_to_score.end()) {
continue;
}
table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
auto token_it = token_to_id.find(piece);
table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
float score = score_it->second;
table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
table_idx++;
}
// Add sentinel row
table_[table_idx][TABLE_PIECE_LENGTH] = 1;
table_[table_idx][TABLE_TOKEN_ID] = -1;
table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
table_idx++;
}
}
std::vector<llama_token> encode(const std::string & text) const {
std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
// Skip the first code point if it is a BOM (Byte Order Mark)
if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
unicode_data.erase(unicode_data.begin());
}
if (unicode_data.empty()) {
return {};
}
const size_t data_len = unicode_data.size();
// Initialize scores array (dynamic programming)
std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
scores[data_len] = 0;
// Path array to track best tokenization
std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
int32_t suffix_id = 0;
// Process from end to beginning
for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
uint32_t c = unicode_data[i];
// Find next suffix ID
for (size_t p = suffix_id; p < table_.size(); ++p) {
int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
auto it = to_suffix_id_.find(piece_code);
suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
break;
}
}
// Update best path
for (size_t p = suffix_id; p < table_.size(); ++p) {
int32_t score = table_[p][TABLE_SCORE];
if (score > INVALID_SCORE) {
int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
int64_t s = scores[i + piece_length] - score;
if (s < scores[i]) {
scores[i] = s;
path[i][PATH_TOKEN_LENGTH] = piece_length;
path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
if (score == UNKNOWN_SCORE) {
// Add UTF-8 byte count
path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
}
}
}
if (score == UNKNOWN_SCORE) {
break;
}
}
}
// Decode the best path
std::vector<llama_token> token_ids;
token_ids.reserve(path[0][PATH_NUM_TOKENS]);
int pos = 0;
while (pos < static_cast<int>(data_len)) {
if (path[pos][PATH_TOKEN_ID] >= 0) {
token_ids.push_back(path[pos][PATH_TOKEN_ID]);
} else {
// Fall back to byte tokens
uint32_t c = unicode_data[pos];
int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
for (int i = 0; i < s; ++i) {
uint8_t b;
if (s == 1) {
b = c;
} else {
if (i == 0) {
b = (0xF00 >> s) & 0xFF;
} else {
b = 0x80;
}
}
token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
}
}
assert(path[pos][PATH_TOKEN_LENGTH] > 0);
pos += path[pos][PATH_TOKEN_LENGTH];
}
return token_ids;
}
private:
// Constants for table structure
static constexpr int32_t TABLE_PIECE_LENGTH = 0;
static constexpr int32_t TABLE_TOKEN_ID = 1;
static constexpr int32_t TABLE_SCORE = 2;
static constexpr int32_t TABLE_PIECE_ID = 3;
// Constants for path array
static constexpr int32_t PATH_TOKEN_LENGTH = 0;
static constexpr int32_t PATH_TOKEN_ID = 1;
static constexpr int32_t PATH_NUM_TOKENS = 2;
// Score constants
static constexpr int32_t INVALID_SCORE = -20000000;
static constexpr int32_t UNKNOWN_SCORE = -10000000;
// List of tokens in the vocabulary
std::vector<std::string> tokens_;
// Mapping from byte code point to token ID (for byte fallback)
std::vector<llama_token> bytes_;
// Mapping from piece code to suffix ID
std::unordered_map<int64_t, int32_t> to_suffix_id_;
// Flattened table representing the Trie structure
// Each row contains: [piece_length, token_id, score, piece_id]
std::vector<std::vector<int32_t>> table_;
};
struct llm_tokenizer_plamo2_session {
llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
void tokenize(const std::string & text, std::vector<llama_token> & output) {
std::vector<llama_token> tokens = tokenizer.encode(text);
output.insert(output.end(), tokens.begin(), tokens.end());
}
private:
const llm_tokenizer_plamo2 & tokenizer;
};
//
// impl
//
@@ -1499,6 +1778,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "plamo2") {
type = LLAMA_VOCAB_TYPE_PLAMO2;
// PLaMo-2 default special tokens (these will be overridden by model config)
special_bos_id = 1; // <|plamo:bos|>
special_eos_id = 2; // <|plamo:eos|>
special_unk_id = 0; // <|plamo:unk|>
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = 3; // <|plamo:pad|>
special_mask_id = LLAMA_TOKEN_NULL;
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
@@ -2145,13 +2434,14 @@ enum llama_vocab_type llama_vocab::impl::get_type() const {
std::string llama_vocab::impl::type_name() const{
switch (type) {
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
default: return "unknown";
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
default: return "unknown";
}
}
@@ -2234,6 +2524,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
case LLAMA_VOCAB_TYPE_RWKV:
tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
break;
case LLAMA_VOCAB_TYPE_PLAMO2:
tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
break;
default:
GGML_ABORT("unsupported vocab type");
}
@@ -2566,6 +2859,23 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
session.tokenize(text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
}
}
} break;
case LLAMA_VOCAB_TYPE_PLAMO2:
{
llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
@@ -2664,6 +2974,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
memcpy(buf, result.data(), result.size());
return (int)result.size();
}
case LLAMA_VOCAB_TYPE_PLAMO2: {
// PLaMo-2 uses similar token handling as BPE/SPM
if (vocab.is_byte(token)) {
// Handle byte tokens like <0xXX>
if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
if (length < 1) {
return -1;
}
buf[0] = static_cast<char>(hex_val);
return 1;
}
}
// Normal token - just copy the text
std::string result = token_text;
return _try_copy(result.data(), result.size());
}
default:
GGML_ABORT("fatal error");
}
@@ -2908,6 +3236,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
case LLAMA_VOCAB_TYPE_BPE: {
return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
}
case LLAMA_VOCAB_TYPE_PLAMO2: {
// PLaMo-2 uses byte tokens in format <0xXX>
char hex_str[8];
snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
return pimpl->token_to_id.at(hex_str);
}
default:
GGML_ABORT("fatal error");
}
@@ -3385,4 +3719,3 @@ int32_t llama_detokenize(
bool unparse_special) {
return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
+2
View File
@@ -11,6 +11,8 @@
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
// increase backlog size to avoid connection resets for >> 1 slots
#define CPPHTTPLIB_LISTEN_BACKLOG 512
// disable Nagle's algorithm
#define CPPHTTPLIB_TCP_NODELAY true
#include <cpp-httplib/httplib.h>