mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 10a0351a97 | |||
| 68e37a61a7 | |||
| cbc68be51d | |||
| bdca38376f | |||
| 55c509daf5 | |||
| 9c9e4fc635 | |||
| 494c5899cb | |||
| 0f4c6ec0f1 | |||
| 65a3ebb0aa | |||
| 0d9226763c | |||
| 982e347255 | |||
| 923e3ea2e3 | |||
| e743cddb60 | |||
| 05fec5bd29 | |||
| dcf7f2ea3c | |||
| 84b396e051 |
@@ -55,6 +55,17 @@
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "x64-linux-gcc", "hidden": true,
|
||||
"cacheVariables": {
|
||||
"CMAKE_C_COMPILER": "gcc",
|
||||
"CMAKE_CXX_COMPILER": "g++"
|
||||
}
|
||||
},
|
||||
{ "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
|
||||
{ "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
|
||||
{ "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
|
||||
{ "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||
|
||||
+177
-1
@@ -1082,7 +1082,14 @@ class TextModel(ModelBase):
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
||||
special_vocab.chat_template = "rwkv-world"
|
||||
if special_vocab.chat_template is None:
|
||||
template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
|
||||
if template_path.is_file():
|
||||
with open(template_path, "r", encoding="utf-8") as f:
|
||||
template = f.read()
|
||||
else:
|
||||
template = "rwkv-world"
|
||||
special_vocab.chat_template = template
|
||||
# hack: Add '\n\n' as the EOT token to make it chat normally
|
||||
special_vocab._set_special_token("eot", 261)
|
||||
# hack: Override these as they have already been set (incorrectly)
|
||||
@@ -3501,6 +3508,175 @@ class PlamoModel(TextModel):
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM")
class Plamo2Model(TextModel):
    """Converts PLaMo 2 checkpoints (mixed Mamba / attention layers) to GGUF."""

    model_arch = gguf.MODEL_ARCH.PLAMO2

    # Checkpoint tensor-name suffixes that only need to be renamed.
    _RENAMED_SUFFIXES = (
        (".dt_bias", ".dt_proj.bias"),
        (".dt_norm_weight", ".dt_norm.weight"),
        (".B_norm_weight", ".B_norm.weight"),
        (".C_norm_weight", ".C_norm.weight"),
        (".k_weight", ".k.weight"),
        (".q_weight", ".q.weight"),
    )

    # Constant added to each kind of norm weight before it is written out.
    _NORM_WEIGHT_OFFSETS = (
        (".pre_mixer_norm.weight", 1.0),
        (".post_mixer_norm.weight", 1.0 / 5),
        (".pre_mlp_norm.weight", 1.0),
        (".post_mlp_norm.weight", 1.0 / (5**1.5)),
        (".norm.weight", 1.0),
    )

    def set_vocab(self):
        """Write the PLaMo 2 vocabulary and special-token ids to the GGUF writer.

        Tokens are read from ``tokenizer.jsonl`` (one JSON list per line, laid
        out as ``[token, score, type, ...]``); special tokens are resolved from
        ``tokenizer_config.json``.

        Raises:
            FileNotFoundError: if either tokenizer file is missing.
            ValueError: if a special token named in the config is not in the
                vocabulary.
        """
        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
        tokenizer_config_path = self.dir_model / "tokenizer_config.json"

        if not tokenizer_jsonl_path.is_file():
            raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
        if not tokenizer_config_path.is_file():
            raise FileNotFoundError(f"PLaMo 2 tokenizer config not found: {tokenizer_config_path}")

        with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
            tokenizer_config = json.load(f)

        # Type strings that map one-to-one onto a GGUF token type.
        explicit_types = {
            "UNKNOWN": gguf.TokenType.UNKNOWN,
            "CONTROL": gguf.TokenType.CONTROL,
            "BYTE": gguf.TokenType.BYTE,
        }

        tokens = []
        scores = []
        toktypes = []

        with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines

                token_data = json.loads(line)
                token_str = token_data[0]
                token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"

                tokens.append(token_str.encode("utf-8"))
                scores.append(float(token_data[1]))

                if token_type_str in explicit_types:
                    toktypes.append(explicit_types[token_type_str])
                elif token_str.startswith("<|plamo:") and token_str.endswith("|>"):
                    # PLaMo 2 special tokens are treated as control tokens
                    # even when their declared type does not say so.
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.NORMAL)

        # Pad up to the vocabulary size declared in the model hyper-parameters.
        vocab_size = self.hparams["vocab_size"]
        if vocab_size > len(tokens):
            pad_count = vocab_size - len(tokens)
            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
            for i in range(1, pad_count + 1):
                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
                scores.append(-1000.0)
                toktypes.append(gguf.TokenType.UNUSED)

        # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
        self.gguf_writer.add_tokenizer_model("plamo2")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        # First-occurrence index of every token (same result as list.index,
        # without rescanning the whole vocabulary for each lookup).
        token_to_id = {}
        for idx, tok in enumerate(tokens):
            token_to_id.setdefault(tok, idx)

        special_token_writers = (
            ("bos_token", self.gguf_writer.add_bos_token_id),
            ("eos_token", self.gguf_writer.add_eos_token_id),
            ("pad_token", self.gguf_writer.add_pad_token_id),
            ("sep_token", self.gguf_writer.add_sep_token_id),
            ("unk_token", self.gguf_writer.add_unk_token_id),
        )
        for key, add_token_id in special_token_writers:
            value = tokenizer_config.get(key)
            if isinstance(value, dict):
                # Some configs store special tokens as {"content": "...", ...}.
                value = value.get("content")
            if value is None:
                continue
            token_id = token_to_id.get(value.encode("utf-8"))
            if token_id is None:
                raise ValueError(
                    f"{key} {value!r} from {tokenizer_config_path.name} is not in the PLaMo 2 vocabulary"
                )
            add_token_id(token_id)

        # Add <|plamo:op|> as EOT to ensure appropriate end of generation.
        # Resolve it by name; fall back to the previously hard-coded id 4.
        eot_token_id = token_to_id.get(b"<|plamo:op|>")
        if eot_token_id is None:
            logger.warning("<|plamo:op|> not found in vocabulary, using token id 4 as EOT")
            eot_token_id = 4
        self.gguf_writer.add_eot_token_id(eot_token_id)

        self.gguf_writer.add_add_space_prefix(False)

    def set_gguf_parameters(self):
        """Write the PLaMo 2 hyper-parameters (attention, Mamba/SSM, MLP) to GGUF."""
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])

        # Which layers are Mamba layers
        # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer)
        # This logic matches modeling_plamo.py's is_mamba function
        mamba_step = hparams.get("mamba_step", 2)
        mamba_enabled = hparams.get("mamba_enabled", True)
        n_head_kv = hparams.get("num_key_value_heads", 4)

        # Per-layer KV head count: 0 marks a Mamba layer, anything else an
        # attention layer.
        kv_heads_per_layer = []
        if mamba_enabled:
            # With very few layers only the last one uses attention.
            only_last_is_attention = block_count <= (mamba_step // 2)
            for i in range(block_count):
                if only_last_is_attention:
                    is_mamba = i != block_count - 1
                else:
                    is_mamba = (i % mamba_step) != (mamba_step // 2)
                kv_heads_per_layer.append(0 if is_mamba else n_head_kv)

        if kv_heads_per_layer:
            self.gguf_writer.add_head_count_kv(kv_heads_per_layer)

        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
        self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))

        # Mamba parameters
        mamba_num_heads = hparams.get("mamba_num_heads", 64)
        self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
        self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
        self.gguf_writer.add_ssm_time_step_rank(mamba_num_heads)
        self.gguf_writer.add_ssm_inner_size(mamba_num_heads * hparams.get("hidden_size_per_head", 128))
        self.gguf_writer.add_ssm_group_count(0)

        # MLP feed forward parameters (for attention layers)
        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
        self.gguf_writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename / transform one checkpoint tensor for GGUF.

        Returns a single-element list ``[(gguf_name, tensor)]``. The input
        tensor is never modified in place.
        """
        del bid  # unused

        if name.endswith(".A_log"):
            data_torch = -torch.exp(data_torch)
        elif name.endswith(".conv1d.weight"):
            data_torch = torch.squeeze(data_torch)  # remove (, 1, )
            assert data_torch.ndim == 2
        else:
            for old_suffix, new_suffix in self._RENAMED_SUFFIXES:
                if name.endswith(old_suffix):
                    name = name[:-len(old_suffix)] + new_suffix
                    break
            else:
                # Not a rename: apply the norm-weight offset, if any.
                for suffix, offset in self._NORM_WEIGHT_OFFSETS:
                    if name.endswith(suffix):
                        data_torch = data_torch + offset
                        break

        new_name = self.map_tensor_name(name)

        return [(new_name, data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("CodeShellForCausalLM")
|
||||
class CodeShellModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.CODESHELL
|
||||
|
||||
@@ -2090,6 +2090,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
|
||||
+340
-1091
File diff suppressed because it is too large
Load Diff
@@ -2303,6 +2303,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_EXP:
|
||||
ggml_cuda_op_exp(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_ELU:
|
||||
ggml_cuda_op_elu(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -3116,6 +3119,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
default:
|
||||
return false;
|
||||
@@ -3222,7 +3226,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
||||
#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
|
||||
op->src[0]->type == GGML_TYPE_F32 &&
|
||||
op->src[1]->type == GGML_TYPE_I64;
|
||||
} break;
|
||||
|
||||
@@ -3,13 +3,21 @@
|
||||
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}
|
||||
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
|
||||
GGML_UNUSED(src_f);
|
||||
GGML_UNUSED(dst_f);
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
|
||||
*dst_h = __float2half(*src_f);
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
|
||||
*dst_b = *src_f;
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
|
||||
*dst_f = *src_f;
|
||||
@@ -48,6 +56,9 @@ static __global__ void k_set_rows(
|
||||
const src_t* src_elem = src0_row + i00;
|
||||
dst_t* dst_elem = dst_row_ptr + i00;
|
||||
set_rows_1(src_elem, dst_elem);
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
@@ -124,6 +135,16 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
nb1, nb2, nb3,
|
||||
stream
|
||||
);
|
||||
} else if (dst->type == GGML_TYPE_BF16) {
|
||||
set_rows_cuda(
|
||||
src0_d, src1_d, (nv_bfloat16*)dst->data,
|
||||
ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb01, nb02, nb03,
|
||||
nb10, nb11, nb12,
|
||||
nb1, nb2, nb3,
|
||||
stream
|
||||
);
|
||||
} else {
|
||||
GGML_ABORT("unsupported type");
|
||||
}
|
||||
|
||||
@@ -83,6 +83,10 @@ static __device__ __forceinline__ float op_log(float x) {
|
||||
return logf(x);
|
||||
}
|
||||
|
||||
// ELU activation: x for positive inputs, exp(x) - 1 otherwise.
static __device__ __forceinline__ float op_elu(float x) {
    if (x > 0.f) {
        return x;
    }
    // expm1f computes exp(x) - 1 in a single call.
    return expm1f(x);
}
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
@@ -196,6 +200,9 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_log>(ctx, dst);
|
||||
}
|
||||
|
||||
// Entry point for the ELU unary op: forwards to the shared unary-op
// launcher instantiated with op_elu.
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_elu>(ctx, dst);
}
|
||||
/* gated ops */
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
|
||||
@@ -59,6 +59,8 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -173,6 +173,12 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_SILU,
|
||||
GGML_METAL_KERNEL_TYPE_SILU_4,
|
||||
GGML_METAL_KERNEL_TYPE_ELU,
|
||||
GGML_METAL_KERNEL_TYPE_ABS,
|
||||
GGML_METAL_KERNEL_TYPE_SGN,
|
||||
GGML_METAL_KERNEL_TYPE_STEP,
|
||||
GGML_METAL_KERNEL_TYPE_HARDSWISH,
|
||||
GGML_METAL_KERNEL_TYPE_HARDSIGMOID,
|
||||
GGML_METAL_KERNEL_TYPE_EXP,
|
||||
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
|
||||
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
|
||||
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
|
||||
@@ -1155,6 +1161,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU, elu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ABS, abs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SGN, sgn, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_STEP, step, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSWISH, hardswish, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSIGMOID, hardsigmoid, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_EXP, exp, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, has_simdgroup_reduction);
|
||||
@@ -1688,6 +1700,12 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_NEG:
|
||||
case GGML_UNARY_OP_ABS:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
case GGML_UNARY_OP_STEP:
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
@@ -2439,6 +2457,78 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_ABS:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ABS].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_SGN:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SGN].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_STEP:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_STEP].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSWISH].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSIGMOID].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_EXP:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_EXP].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
|
||||
|
||||
@@ -1199,6 +1199,51 @@ kernel void kernel_neg(
|
||||
dst[tpig] = -src0[tpig];
|
||||
}
|
||||
|
||||
// Element-wise absolute value; each thread handles the element at its grid position.
kernel void kernel_abs(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    const float value = src0[tpig];
    dst[tpig] = fabs(value);
}
|
||||
|
||||
// Element-wise sign: 1 for positive, -1 for negative, 0 otherwise.
kernel void kernel_sgn(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    const float value = src0[tpig];

    float result = 0.0f;
    if (value > 0.0f) {
        result = 1.0f;
    } else if (value < 0.0f) {
        result = -1.0f;
    }

    dst[tpig] = result;
}
|
||||
|
||||
// Element-wise step function: 1 for strictly positive inputs, 0 otherwise.
kernel void kernel_step(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    const float value = src0[tpig];

    if (value > 0.0f) {
        dst[tpig] = 1.0f;
    } else {
        dst[tpig] = 0.0f;
    }
}
|
||||
|
||||
// Element-wise hard-swish: x * clamp((x + 3) / 6, 0, 1).
kernel void kernel_hardswish(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    const float value = src0[tpig];

    // Gate limited to the [0, 1] range.
    const float gate = fmin(1.0f, fmax(0.0f, (value + 3.0f) / 6.0f));

    dst[tpig] = value * gate;
}
|
||||
|
||||
// Element-wise hard-sigmoid: clamp((x + 3) / 6, 0, 1).
kernel void kernel_hardsigmoid(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    const float value = src0[tpig];

    const float scaled = (value + 3.0f) / 6.0f;

    dst[tpig] = fmin(1.0f, fmax(0.0f, scaled));
}
|
||||
|
||||
// Element-wise exponential; each thread handles the element at its grid position.
kernel void kernel_exp(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    const float value = src0[tpig];
    dst[tpig] = exp(value);
}
|
||||
|
||||
kernel void kernel_reglu(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
|
||||
@@ -2280,6 +2280,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
if (op->src[0]->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
+15
-27
@@ -32,39 +32,28 @@ public:
|
||||
else static_assert(0);
|
||||
}
|
||||
|
||||
// matrix A has m rows, k columns
|
||||
// matrix B has k rows, n columns
|
||||
// nra - number of elements to skip when moving into next row in A
|
||||
// nrb - number of elements to skip when moving into next row in B
|
||||
// nca - number of elements to skip when moving into next column in A
|
||||
// ncb - number of elements to skip when moving into next column in B
|
||||
// stride_a - number of elements to skip when moving to next A matrix
|
||||
// stride_b - number of elements to skip when moving to next B matrix
|
||||
// batches_a - number of A matrices
|
||||
// batches_b - number of B matrices
|
||||
static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
|
||||
const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a,
|
||||
const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b,
|
||||
const void * a, dt at, dnnl_dim_t stra0, dnnl_dim_t stra1, dnnl_dim_t stra2,
|
||||
const void * b, dt bt, dnnl_dim_t strb0, dnnl_dim_t strb1, dnnl_dim_t strb2,
|
||||
void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) {
|
||||
|
||||
auto stream = ctx.stream_dnnl(q);
|
||||
auto eng = ctx.engine_dnnl(q);
|
||||
|
||||
// { # strides, # rows, # columns }
|
||||
dnnl::memory::dims a_dims = { batches_a, m, k };
|
||||
dnnl::memory::dims b_dims = { batches_b, k, n };
|
||||
dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n };
|
||||
|
||||
// { # elements to skip to next stride, # elements to skip to next row, # elements to skip to next column }
|
||||
dnnl::memory::dims a_strides = { stride_a, nra, nca };
|
||||
dnnl::memory::dims b_strides = { stride_b, nrb, ncb };
|
||||
|
||||
dnnl::memory::dims a_dims = {batches_a, m, k };
|
||||
dnnl::memory::dims a_strides = {stra2, stra1, stra0};
|
||||
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides);
|
||||
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
|
||||
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::abc);
|
||||
|
||||
dnnl::memory::dims b_dims = {batches_b, k, n };
|
||||
dnnl::memory::dims b_strides = {strb2, strb0, strb1};
|
||||
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
|
||||
|
||||
dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n};
|
||||
dnnl::memory::dims c_strides = {m*n, 1, m };
|
||||
const auto c_md = dnnl::memory::desc(c_dims, ct, c_strides);
|
||||
dnnl::primitive_attr primitive_attr;
|
||||
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
|
||||
#endif
|
||||
@@ -76,24 +65,23 @@ public:
|
||||
|
||||
auto scratchpad_md = matmul_pd.scratchpad_desc();
|
||||
auto scratchpad_mem = ctx.get_scratchpad_mem(scratchpad_md, eng, q);
|
||||
|
||||
auto matmul_prim = dnnl::matmul(matmul_pd);
|
||||
|
||||
std::unordered_map<int, dnnl::memory> matmul_args;
|
||||
matmul_args.insert({ DNNL_ARG_SRC, a_mem });
|
||||
matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
|
||||
|
||||
matmul_args.insert({ DNNL_ARG_DST, c_mem });
|
||||
matmul_args.insert({ DNNL_ARG_SCRATCHPAD, scratchpad_mem });
|
||||
|
||||
matmul_prim.execute(stream, matmul_args);
|
||||
}
|
||||
|
||||
// matrices A and B are column major, both having k rows
|
||||
// matrix A has m column, matrix B has n columns
|
||||
// output: column major matrix C = A transposed * B
|
||||
static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
|
||||
const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
|
||||
|
||||
gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
|
||||
gemm(ctx, m, n, k, a, at, 1, k, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1546,7 +1546,7 @@ static void mul_mat_p021_f16_f32(
|
||||
|
||||
static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
|
||||
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
|
||||
const int row_stride_x, const int channel_stride_x,const int channel_stride_y, const int channel_x_divisor,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const sycl::half *x = (const sycl::half *)vx;
|
||||
@@ -1557,7 +1557,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
item_ct1.get_local_id(0);
|
||||
const int channel_x = channel / channel_x_divisor;
|
||||
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_dst = nrows_x;
|
||||
const int row_dst = row_x;
|
||||
|
||||
@@ -1576,7 +1575,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
const int row_y = col_x;
|
||||
|
||||
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
||||
const int iy = channel*nrows_y + row_y;
|
||||
const int iy = channel * channel_stride_y + row_y;
|
||||
|
||||
const float xi =
|
||||
sycl::vec<sycl::half, 1>(x[ix])
|
||||
@@ -1823,7 +1822,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
||||
static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
||||
const void *vx, const float *y, float *dst, const int ncols_x,
|
||||
const int nrows_x, const int row_stride_x, const int nchannels_x,
|
||||
const int nchannels_y, const int channel_stride_x, queue_ptr stream) {
|
||||
const int nchannels_y, const int channel_stride_x, const int channel_stride_y, queue_ptr stream) {
|
||||
|
||||
const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
|
||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||
@@ -1835,7 +1834,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
|
||||
row_stride_x, channel_stride_x,
|
||||
row_stride_x, channel_stride_x, channel_stride_y,
|
||||
nchannels_y / nchannels_x, item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -2124,8 +2123,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
if (!g_ggml_sycl_disable_dnn) {
|
||||
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
||||
DnnlGemmWrapper::row_gemm(ctx,row_diff, src1_ncols , ne10, src0_ptr,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
||||
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
||||
}
|
||||
else
|
||||
@@ -2171,8 +2170,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
if (!g_ggml_sycl_disable_dnn) {
|
||||
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
|
||||
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
|
||||
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
|
||||
DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
|
||||
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
||||
}
|
||||
else
|
||||
@@ -2776,6 +2775,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
queue_ptr main_stream = ctx.stream();
|
||||
@@ -2786,8 +2786,9 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
|
||||
|
||||
const int64_t row_stride_x = nb01 / sizeof(sycl::half);
|
||||
const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
|
||||
const int64_t channel_stride_y = nb11 / sizeof(float);
|
||||
|
||||
ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
||||
ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,channel_stride_y, main_stream);
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -2841,8 +2842,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
float * dst_ddf = static_cast<float *>(dst->data);
|
||||
|
||||
const sycl::half * src1_f16 = static_cast<const sycl::half *>(src1->data);
|
||||
const size_t type_size_src0 = ggml_type_size(src0->type);
|
||||
const size_t type_size_src1 = ggml_type_size(src1->type);
|
||||
GGML_ASSERT(nb10 == type_size_src1);
|
||||
|
||||
// SRC1 strides
|
||||
int64_t s11 = nb11 / type_size_src1;
|
||||
@@ -2854,11 +2855,40 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
|
||||
" : converting src1 to fp16");
|
||||
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
|
||||
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
|
||||
|
||||
// iterate tensor dims and find the slowest moving dim and stride
|
||||
int64_t last_dim=0;
|
||||
int64_t last_str=0;
|
||||
int64_t largest_str=0;
|
||||
for(int i = 0; i< 4; i++){
|
||||
// last stride is always the largest
|
||||
if(src1->nb[i] == largest_str){
|
||||
if(src1->ne[last_dim] == 1){
|
||||
last_str = i;
|
||||
last_dim = i;
|
||||
}
|
||||
}
|
||||
if(src1->nb[i] > largest_str){
|
||||
largest_str = src1->nb[i];
|
||||
last_str = i;
|
||||
last_dim = i;
|
||||
}
|
||||
|
||||
}
|
||||
#if GGML_SYCL_DNNL
|
||||
// oneDNN handles strided data and does not need overhead of get_to_fp16_nc_sycl
|
||||
const int64_t ne_src1 = src1->nb[last_str] * src1->ne[last_dim] / type_size_src1;
|
||||
src1_f16_alloc.alloc(ne_src1);
|
||||
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
||||
GGML_ASSERT(to_fp16_sycl != nullptr);
|
||||
to_fp16_sycl(src1_f16, src1_f16_alloc.get(), ne_src1, queue);
|
||||
# else
|
||||
const int64_t ne_src1 = ggml_nelements(src1);
|
||||
src1_f16_alloc.alloc(ne_src1);
|
||||
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
|
||||
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
|
||||
to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
|
||||
#endif
|
||||
|
||||
src1_f16 = src1_f16_alloc.get();
|
||||
s11 = ne10;
|
||||
@@ -2892,38 +2922,89 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
if (!g_ggml_sycl_disable_dnn) {
|
||||
auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
|
||||
(const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {
|
||||
int64_t str_a0 = nb00 / type_size_src0;
|
||||
int64_t str_a1 = nb01 / type_size_src0;
|
||||
int64_t str_a2 = nb02 / type_size_src0;
|
||||
|
||||
DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
|
||||
src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
|
||||
src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
|
||||
dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
|
||||
};
|
||||
int64_t str_b0 = nb10 / type_size_src1;
|
||||
int64_t str_b1 = nb11 / type_size_src1;
|
||||
int64_t str_b2 = nb12 / type_size_src1;
|
||||
|
||||
if (r2 == 1 && r3 == 1) {
|
||||
if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
||||
dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
|
||||
}
|
||||
else {
|
||||
for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
|
||||
const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
|
||||
const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
|
||||
float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
|
||||
dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
|
||||
auto launch_gemm_for_batches = [&ctx, queue](const sycl::half *src0,
|
||||
const sycl::half *src1, float *dst,
|
||||
int64_t a0, int64_t a1, int64_t batcha,
|
||||
int64_t b0, int64_t b1, int64_t batchb,
|
||||
int64_t sa0, int64_t sa1, int64_t sa2,
|
||||
int64_t sb0, int64_t sb1, int64_t sb2,
|
||||
int64_t sd2) {
|
||||
bool supported_broadcast = batchb == batcha ? true
|
||||
: batchb == 1 || batcha == 1 ? true
|
||||
: false;
|
||||
if (supported_broadcast) {
|
||||
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2, src1,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), sb0, sb1, sb2, dst,
|
||||
DnnlGemmWrapper::to_dt<float>(), queue, batcha, batchb);
|
||||
} else {
|
||||
// iterate over batches from smaller set of matrices (matrix 0)
|
||||
int64_t batches0 = batcha;
|
||||
int64_t batches1 = batchb;
|
||||
|
||||
if (batches0 > batches1) {
|
||||
int64_t num_mul_mats = batches1;
|
||||
int64_t sub_batch = batches0 / num_mul_mats;
|
||||
// src0 is batched and bigger, shift and multiply with src1
|
||||
for (int64_t i0 = 0; i0 < num_mul_mats; i0++) {
|
||||
const sycl::half *src0_shifted = src0 + (sa2 * i0 * sub_batch);
|
||||
const sycl::half *src1_shifted = src1 + (sb2 * i0);
|
||||
float *dst_shifted = dst + (sd2 * i0 * sub_batch);
|
||||
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
|
||||
src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
|
||||
sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
|
||||
queue, sub_batch, 1);
|
||||
}
|
||||
} else {
|
||||
int64_t num_mul_mats = batches0;
|
||||
int64_t sub_batch = batches1 / num_mul_mats;
|
||||
// src1 is batched and bigger, shift and multiply with src0
|
||||
for (int64_t i1 = 0; i1 < num_mul_mats; i1++) {
|
||||
const sycl::half *src0_shifted = src0 + (sa2 * i1);
|
||||
const sycl::half *src1_shifted = src1 + (sb2 * i1 * sub_batch);
|
||||
float *dst_shifted = dst + (sd2 * i1 * sub_batch);
|
||||
DnnlGemmWrapper::gemm(ctx, a1, b1, a0, src0_shifted,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), sa0, sa1, sa2,
|
||||
src1_shifted, DnnlGemmWrapper::to_dt<sycl::half>(), sb0,
|
||||
sb1, sb2, dst_shifted, DnnlGemmWrapper::to_dt<float>(),
|
||||
queue, 1, sub_batch);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
bool cont_batches_a = nb02 * ne02 == nb03;
|
||||
bool cont_batches_b = nb12 * ne12 == nb13;
|
||||
if (cont_batches_a && cont_batches_b) {
|
||||
int64_t batches0 = ne02 * ne03;
|
||||
int64_t batches1 = ne12 * ne13;
|
||||
launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
|
||||
ne10, ne11, batches1, str_a0, str_a1, str_a2, str_b0, str_b1,
|
||||
str_b2, nb2 / sizeof(float));
|
||||
} else {
|
||||
for (int64_t b_a = 0; b_a < ne03; b_a++) {
|
||||
const sycl::half *src0_f16_shifted
|
||||
= src0_f16 + (nb03 * b_a / type_size_src0);
|
||||
const sycl::half *src1_f16_shifted
|
||||
= src1_f16 + (nb13 * b_a / type_size_src1);
|
||||
float *dst_shifted = dst_ddf + (nb3 * b_a / sizeof(float));
|
||||
int64_t batches0 = ne02;
|
||||
int64_t batches1 = ne12;
|
||||
launch_gemm_for_batches(src0_f16_shifted, src1_f16_shifted, dst_shifted,
|
||||
ne00, ne01, batches0, ne10, ne11, batches1, str_a0, str_a1,
|
||||
str_a2, str_b0, str_b1, str_b2, nb2 / sizeof(float));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// iterate over batches from smaller set of matrices (matrix 0)
|
||||
for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
|
||||
for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
|
||||
const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
|
||||
const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
|
||||
float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
|
||||
dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@@ -3263,10 +3344,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
|
||||
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
||||
}
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV single-batch
|
||||
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2] * src1->ne[3] > 1) {
|
||||
// KQ + KQV multi-batch
|
||||
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
||||
} else if (use_dequantize_mul_mat_vec) {
|
||||
@@ -4303,6 +4384,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
|
||||
} break;
|
||||
case GGML_OP_CPY:
|
||||
|
||||
@@ -6,46 +6,49 @@ static constexpr bool is_arithmetic_v() {
|
||||
return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TIn, typename TOut>
|
||||
static inline std::enable_if_t<utils::is_arithmetic_v<TIn>() && utils::is_arithmetic_v<TOut>(), void>
|
||||
convert (const char* src, char* dst) {
|
||||
auto src_val = *reinterpret_cast<const TIn*>(src);
|
||||
auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut, sycl::rounding_mode::automatic>()[0];
|
||||
*reinterpret_cast<TOut*>(dst) = dst_val;;
|
||||
*reinterpret_cast<TOut*>(dst) = dst_val;
|
||||
}
|
||||
|
||||
template<typename TIn, typename TOut>
|
||||
static void k_set_rows(
|
||||
const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t ne11, const int64_t ne12,
|
||||
const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t nb10, const size_t nb11, const size_t nb12,
|
||||
const size_t nb1, const size_t nb2, const size_t nb3,
|
||||
const size_t src_type_size, const size_t dst_type_size,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
const int64_t total_elements,
|
||||
const sycl::nd_item<1> & item_ct1) {
|
||||
|
||||
const int i03 = item_ct1.get_group(0);
|
||||
const int i02 = item_ct1.get_group(1);
|
||||
const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); // Row index
|
||||
|
||||
if (i01 >= ne01) {
|
||||
const int64_t i = item_ct1.get_global_linear_id();
|
||||
if (i >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i12 = i03 % ne12;
|
||||
const int i11 = i02 % ne11;
|
||||
const int i10 = i01;
|
||||
const int64_t i03 = i / (ne00 * ne01 * ne02);
|
||||
const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
|
||||
const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
|
||||
const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
|
||||
|
||||
const int64_t i12 = i03 % ne12;
|
||||
const int64_t i11 = i02 % ne11;
|
||||
const int64_t i10 = i01;
|
||||
|
||||
const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
|
||||
|
||||
const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
|
||||
char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
|
||||
const char * src_elem = src0_row + i00 * src_type_size;
|
||||
char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
|
||||
char * dst_elem = dst_row_ptr + i00 * dst_type_size;
|
||||
|
||||
for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
|
||||
const char * src_elem = src0_row + col * src_type_size;
|
||||
char * dst_elem = dst_row_ptr + col * dst_type_size;
|
||||
convert<TIn, TOut>(src_elem, dst_elem);
|
||||
}
|
||||
convert<TIn, TOut>(src_elem, dst_elem);
|
||||
}
|
||||
|
||||
template<typename TIn, typename TOut>
|
||||
@@ -58,33 +61,30 @@ static void set_rows_sycl(
|
||||
const size_t src_type_size, const size_t dst_type_size,
|
||||
queue_ptr stream) {
|
||||
|
||||
constexpr int max_threads_per_row = 64; // KEEPING 64 for now
|
||||
const int threads_per_row = std::min((int)ne00, max_threads_per_row);
|
||||
const int64_t total_elements = ne00 * ne01 * ne02 * ne03;
|
||||
|
||||
constexpr int max_threads_per_block = 64;
|
||||
const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row);
|
||||
constexpr int block_size = 64;
|
||||
const int64_t grid_size = ceil_div(total_elements, block_size);
|
||||
|
||||
const sycl::range<3> block_size(1, rows_per_block, threads_per_row);
|
||||
const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block);
|
||||
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
sycl::nd_range<3>(grid_size * block_size, block_size),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_set_rows<TIn, TOut>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, ne01, ne11, ne12,
|
||||
nb01, nb02, nb03,
|
||||
nb10, nb11, nb12,
|
||||
nb1, nb2, nb3,
|
||||
src_type_size, dst_type_size,
|
||||
item_ct1
|
||||
);
|
||||
}
|
||||
);
|
||||
sycl_parallel_for(
|
||||
stream,
|
||||
sycl::nd_range<1>(grid_size * block_size, block_size),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
k_set_rows<TIn, TOut>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, ne01, ne02,
|
||||
ne11, ne12,
|
||||
nb01, nb02, nb03,
|
||||
nb10, nb11, nb12,
|
||||
nb1, nb2, nb3,
|
||||
src_type_size, dst_type_size,
|
||||
total_elements,
|
||||
item_ct1
|
||||
);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
@@ -122,7 +122,7 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
nb1, nb2, nb3,
|
||||
sizeof(float), sizeof(sycl::half),
|
||||
stream
|
||||
);
|
||||
);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported tensor type!");
|
||||
|
||||
@@ -2835,10 +2835,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
return s;
|
||||
};
|
||||
|
||||
bool rte = device->float_controls_rte_fp16;
|
||||
#define CREATE_BINARY(name, namemod, spec) \
|
||||
for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name ## namemod[s0][s1][d], \
|
||||
#name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d], name ## _data[s0][s1][d], \
|
||||
#name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d][rte], name ## _data[s0][s1][d][rte], \
|
||||
"main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1);
|
||||
|
||||
CREATE_BINARY(add, , {0})
|
||||
@@ -2890,8 +2891,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
#undef CREATE_UNARY
|
||||
|
||||
#define CREATE_GLU(name) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true);
|
||||
if (device->float_controls_rte_fp16) { \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16_rte", name ## _f16_rte_len, name ## _f16_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
} else { \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||
}
|
||||
|
||||
CREATE_GLU(geglu)
|
||||
CREATE_GLU(reglu)
|
||||
|
||||
@@ -1,10 +1,6 @@
|
||||
#version 450
|
||||
|
||||
#if RTE16
|
||||
#extension GL_EXT_spirv_intrinsics : enable
|
||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||
#endif // RTE16
|
||||
|
||||
#include "rte.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#if defined(SET_ROWS) && QUANT_K == 1
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
#include "rte.comp"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ne;
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#include "rte.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_spirv_intrinsics: enable
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
#if RTE16
|
||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||
#endif
|
||||
#include "rte.comp"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
|
||||
@@ -1,11 +1,8 @@
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_spirv_intrinsics: enable
|
||||
|
||||
#if RTE16
|
||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||
#endif
|
||||
#include "rte.comp"
|
||||
|
||||
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
|
||||
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
|
||||
#if RTE16
|
||||
#extension GL_EXT_spirv_intrinsics : enable
|
||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||
#endif // RTE16
|
||||
@@ -537,8 +537,10 @@ void process_shaders() {
|
||||
for (auto src0_f16 : {false, true}) {
|
||||
for (auto src1_f16 : {false, true}) {
|
||||
for (auto dst_f16 : {false, true}) {
|
||||
auto name = op + get_suffix(src0_f16, src1_f16, dst_f16);
|
||||
string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}});
|
||||
for (auto rte : {false, true}) {
|
||||
auto name = op + get_suffix(src0_f16, src1_f16, dst_f16) + (rte ? "_rte" : "");
|
||||
string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -592,16 +594,19 @@ void process_shaders() {
|
||||
string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("geglu_f16", "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("geglu_f32", "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("reglu_f16", "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("reglu_f32", "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("swiglu_f16", "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("swiglu_f32", "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("geglu_erf_f16", "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("geglu_erf_f32", "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("geglu_quick_f16","geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("geglu_quick_f32","geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
for (auto rte : {false, true}) {
|
||||
std::string suffix = rte ? "_rte" : "";
|
||||
string_to_spv("geglu_f16" + suffix, "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_f32" + suffix, "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("reglu_f16" + suffix, "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("reglu_f32" + suffix, "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("swiglu_f16" + suffix, "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("swiglu_f32" + suffix, "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_erf_f16" + suffix, "geglu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_erf_f32" + suffix, "geglu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_quick_f16" + suffix,"geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||
string_to_spv("geglu_quick_f32" + suffix,"geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}});
|
||||
}
|
||||
|
||||
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
@@ -709,11 +714,59 @@ void write_output_files() {
|
||||
std::remove(path.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
std::string suffixes[2] = {"_f32", "_f16"};
|
||||
for (const char *op : {"add", "sub", "mul", "div"}) {
|
||||
fprintf(hdr, "extern unsigned char *%s_data[2][2][2];\n", op);
|
||||
fprintf(hdr, "extern uint64_t %s_len[2][2][2];\n", op);
|
||||
fprintf(src, "unsigned char *%s_data[2][2][2] = {{{%s_f32_f32_f32_data, %s_f32_f32_f16_data}, {%s_f32_f16_f32_data, %s_f32_f16_f16_data}}, {{%s_f16_f32_f32_data, %s_f16_f32_f16_data}, {%s_f16_f16_f32_data, %s_f16_f16_f16_data}}};\n", op, op, op, op, op, op, op, op, op);
|
||||
fprintf(src, "uint64_t %s_len[2][2][2] = {{{%s_f32_f32_f32_len, %s_f32_f32_f16_len}, {%s_f32_f16_f32_len, %s_f32_f16_f16_len}}, {{%s_f16_f32_f32_len, %s_f16_f32_f16_len}, {%s_f16_f16_f32_len, %s_f16_f16_f16_len}}};\n", op, op, op, op, op, op, op, op, op);
|
||||
fprintf(hdr, "extern unsigned char *%s_data[2][2][2][2];\n", op);
|
||||
fprintf(hdr, "extern uint64_t %s_len[2][2][2][2];\n", op);
|
||||
std::string data = "unsigned char *" + std::string(op) + "_data[2][2][2][2] = ";
|
||||
std::string len = "uint64_t " + std::string(op) + "_len[2][2][2][2] = ";
|
||||
for (uint32_t t0 = 0; t0 < 2; ++t0) {
|
||||
if (t0 == 0) {
|
||||
data += "{";
|
||||
len += "{";
|
||||
}
|
||||
for (uint32_t t1 = 0; t1 < 2; ++t1) {
|
||||
if (t1 == 0) {
|
||||
data += "{";
|
||||
len += "{";
|
||||
}
|
||||
for (uint32_t t2 = 0; t2 < 2; ++t2) {
|
||||
if (t2 == 0) {
|
||||
data += "{";
|
||||
len += "{";
|
||||
}
|
||||
for (uint32_t rte = 0; rte < 2; ++rte) {
|
||||
if (rte == 0) {
|
||||
data += "{";
|
||||
len += "{";
|
||||
}
|
||||
data += op + suffixes[t0] + suffixes[t1] + suffixes[t2] + ((rte != 0) ? "_rte" : "");
|
||||
len += op + suffixes[t0] + suffixes[t1] + suffixes[t2] + ((rte != 0) ? "_rte" : "");
|
||||
data += "_data,";
|
||||
len += "_len,";
|
||||
if (rte == 1) {
|
||||
data += "}, ";
|
||||
len += "}, ";
|
||||
}
|
||||
}
|
||||
if (t2 == 1) {
|
||||
data += "}, ";
|
||||
len += "}, ";
|
||||
}
|
||||
}
|
||||
if (t1 == 1) {
|
||||
data += "}, ";
|
||||
len += "}, ";
|
||||
}
|
||||
}
|
||||
if (t0 == 1) {
|
||||
data += "};\n";
|
||||
len += "};\n";
|
||||
}
|
||||
}
|
||||
fprintf(src, data.c_str());
|
||||
fprintf(src, len.c_str());
|
||||
}
|
||||
fclose(hdr);
|
||||
fclose(src);
|
||||
|
||||
@@ -317,6 +317,7 @@ class MODEL_ARCH(IntEnum):
|
||||
PHI3 = auto()
|
||||
PHIMOE = auto()
|
||||
PLAMO = auto()
|
||||
PLAMO2 = auto()
|
||||
CODESHELL = auto()
|
||||
ORION = auto()
|
||||
INTERNLM2 = auto()
|
||||
@@ -631,6 +632,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.PHI3: "phi3",
|
||||
MODEL_ARCH.PHIMOE: "phimoe",
|
||||
MODEL_ARCH.PLAMO: "plamo",
|
||||
MODEL_ARCH.PLAMO2: "plamo2",
|
||||
MODEL_ARCH.CODESHELL: "codeshell",
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||
@@ -1369,6 +1371,36 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PLAMO2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
MODEL_TENSOR.SSM_IN,
|
||||
MODEL_TENSOR.SSM_CONV1D,
|
||||
MODEL_TENSOR.SSM_X,
|
||||
MODEL_TENSOR.SSM_DT,
|
||||
MODEL_TENSOR.SSM_A,
|
||||
MODEL_TENSOR.SSM_D,
|
||||
MODEL_TENSOR.SSM_OUT,
|
||||
MODEL_TENSOR.SSM_DT_NORM,
|
||||
MODEL_TENSOR.SSM_B_NORM,
|
||||
MODEL_TENSOR.SSM_C_NORM,
|
||||
],
|
||||
MODEL_ARCH.GPT2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
|
||||
@@ -13,7 +13,7 @@ class TensorNameMap:
|
||||
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
|
||||
"transformer.word_embeddings", # falcon
|
||||
"word_embeddings", # bloom
|
||||
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 granite-hybrid
|
||||
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
|
||||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert nomic-bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
@@ -63,7 +63,7 @@ class TensorNameMap:
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe plamo2
|
||||
"output", # llama-pth bloom internlm2
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
@@ -77,7 +77,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone
|
||||
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe
|
||||
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe plamo2
|
||||
"norm", # llama-pth
|
||||
"transformer.norm_f", # mpt dbrx
|
||||
"ln_f", # refact bloom qwen gpt2
|
||||
@@ -126,6 +126,7 @@ class TensorNameMap:
|
||||
"h.{bid}.ln_1", # gpt2
|
||||
"transformer.h.{bid}.ln", # phi2
|
||||
"model.layers.layers.{bid}.norm", # plamo
|
||||
"model.layers.layers.{bid}.pre_mixer_norm", # plamo2
|
||||
"model.layers.{bid}.attention_norm", # internlm2
|
||||
"model.layers.{bid}.norm", # mamba-qbert
|
||||
"backbone.layers.{bid}.norm", # mamba
|
||||
@@ -163,6 +164,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||
"encoder.layers.{bid}.mixer.Wqkv", # jina
|
||||
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
||||
"model.layers.layers.{bid}.mixer.qkv_proj", # plamo2
|
||||
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
|
||||
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||
"transformer_encoder.{bid}.qkv", # neobert
|
||||
@@ -233,6 +235,7 @@ class TensorNameMap:
|
||||
"h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||
"model.layers.layers.{bid}.mixer.o_proj", # plamo2
|
||||
"model.layers.{bid}.attention.wo", # internlm2
|
||||
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
||||
"encoder.layers.{bid}.mixer.out_proj", # jina
|
||||
@@ -255,8 +258,9 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_POST_NORM: (
|
||||
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
|
||||
"model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
|
||||
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
|
||||
"model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
|
||||
"model.layers.layers.{bid}.post_mixer_norm.weight", # plamo2
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
@@ -286,6 +290,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.pre_moe_layernorm", # mini-jamba
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama4
|
||||
"transformer_encoder.{bid}.ffn_norm", # neobert
|
||||
"model.layers.layers.{bid}.pre_mlp_norm", # plamo2
|
||||
),
|
||||
|
||||
# Post feed-forward norm
|
||||
@@ -298,6 +303,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.FFN_POST_NORM: (
|
||||
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
|
||||
"model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
|
||||
"model.layers.layers.{bid}.post_mlp_norm.weight", # plamo2
|
||||
"model.layers.{bid}.feed_forward.up_proj",
|
||||
),
|
||||
|
||||
@@ -342,6 +348,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
"model.layers.layers.{bid}.mlp.gate_up_proj", # plamo2
|
||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
|
||||
@@ -469,6 +476,7 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||
"model.layers.layers.{bid}.mixer.q", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
@@ -479,6 +487,7 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||
"model.layers.layers.{bid}.mixer.k", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
@@ -559,27 +568,31 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_IN: (
|
||||
"model.layers.{bid}.in_proj", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.in_proj", # mamba
|
||||
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.{bid}.in_proj", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.in_proj", # mamba
|
||||
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.layers.{bid}.mixer.in_proj", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_CONV1D: (
|
||||
"model.layers.{bid}.conv1d", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.conv1d", # mamba
|
||||
"model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.{bid}.conv1d", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.conv1d", # mamba
|
||||
"model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.layers.{bid}.mixer.conv1d", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_X: (
|
||||
"model.layers.{bid}.x_proj", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.x_proj", # mamba
|
||||
"model.layers.{bid}.mamba.x_proj", # jamba
|
||||
"model.layers.{bid}.x_proj", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.x_proj", # mamba
|
||||
"model.layers.{bid}.mamba.x_proj", # jamba
|
||||
"model.layers.layers.{bid}.mixer.bcdt_proj", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_DT: (
|
||||
"model.layers.{bid}.dt_proj", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.dt_proj", # mamba
|
||||
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.{bid}.dt_proj", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.dt_proj", # mamba
|
||||
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.layers.{bid}.mixer.dt_proj", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_DT_NORM: (
|
||||
@@ -587,25 +600,33 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_A: (
|
||||
"model.layers.{bid}.A_log", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.A_log", # mamba
|
||||
"model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.{bid}.A_log", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.A_log", # mamba
|
||||
"model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.layers.{bid}.mixer.A_log", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_B_NORM: (
|
||||
"model.layers.{bid}.mamba.b_layernorm", # jamba
|
||||
"model.layers.{bid}.mamba.B_layernorm", # mini-jamba
|
||||
"model.layers.{bid}.mamba.b_layernorm", # jamba
|
||||
"model.layers.{bid}.mamba.B_layernorm", # mini-jamba
|
||||
"model.layers.layers.{bid}.mixer.B_norm.weight", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_C_NORM: (
|
||||
"model.layers.{bid}.mamba.c_layernorm", # jamba
|
||||
"model.layers.{bid}.mamba.C_layernorm", # mini-jamba
|
||||
"model.layers.{bid}.mamba.c_layernorm", # jamba
|
||||
"model.layers.{bid}.mamba.C_layernorm", # mini-jamba
|
||||
"model.layers.layers.{bid}.mixer.C_norm.weight", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_D: (
|
||||
"model.layers.{bid}.D", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.D", # mamba
|
||||
"model.layers.{bid}.mamba.D", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.{bid}.D", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.D", # mamba
|
||||
"model.layers.{bid}.mamba.D", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.layers.{bid}.mixer.D", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_DT_NORM: (
|
||||
"model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_NORM: (
|
||||
@@ -614,9 +635,10 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_OUT: (
|
||||
"model.layers.{bid}.out_proj", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.out_proj", # mamba
|
||||
"model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.{bid}.out_proj", # mamba-hf
|
||||
"backbone.layers.{bid}.mixer.out_proj", # mamba
|
||||
"model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.layers.{bid}.mixer.out_proj", # plamo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_W0: (
|
||||
|
||||
+7
-6
@@ -71,12 +71,13 @@ extern "C" {
|
||||
typedef int32_t llama_seq_id;
|
||||
|
||||
enum llama_vocab_type {
|
||||
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
||||
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
||||
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
||||
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
||||
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
|
||||
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
||||
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
||||
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
||||
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
||||
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
|
||||
LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
|
||||
};
|
||||
|
||||
enum llama_rope_type {
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
{%- if not add_generation_prompt is defined -%}
|
||||
{%- set add_generation_prompt = true -%}
|
||||
{%- endif -%}
|
||||
{%- set ns = namespace(system_prompt='') -%}
|
||||
{%- for message in messages -%}
|
||||
{%- if message['role'] == 'system' -%}
|
||||
{%- set ns.system_prompt = message['content'] -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{{bos_token}}
|
||||
{%- if ns.system_prompt != '' -%}
|
||||
{{- 'System: ' + ns.system_prompt + '\n\n' -}}
|
||||
{%- endif -%}
|
||||
{%- for message in messages -%}
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{{- 'User: ' + message['content']|trim + '\n\n' -}}
|
||||
{%- endif -%}
|
||||
{%- if message['role'] == 'assistant' and message['content'] is not none -%}
|
||||
{%- set content = message['content'] -%}
|
||||
{%- if '</think>' in content -%}
|
||||
{%- set content = content.split('</think>')[-1] -%}
|
||||
{%- endif -%}
|
||||
{{- 'Assistant: ' + content|trim + '\n\n' -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- 'Assistant:' -}}
|
||||
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||
{{- ' <think>\n</think>' }}
|
||||
{%- endif %}
|
||||
{%- if enable_thinking is defined and enable_thinking is true %}
|
||||
{{- ' <think>' }}
|
||||
{%- endif %}
|
||||
{%- endif -%}
|
||||
@@ -3,6 +3,7 @@
|
||||
-r ../tools/server/tests/requirements.txt
|
||||
|
||||
-r ./requirements-compare-llama-bench.txt
|
||||
-r ./requirements-server-bench.txt
|
||||
-r ./requirements-pydantic.txt
|
||||
-r ./requirements-test-tokenizer-random.txt
|
||||
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
datasets~=3.2.0
|
||||
matplotlib~=3.10.0
|
||||
numpy~=1.26.4
|
||||
requests~=2.32.3
|
||||
tqdm~=4.67.1
|
||||
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
from time import sleep, time
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
import logging
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import requests
|
||||
from tqdm.contrib.concurrent import thread_map
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(message)s')
|
||||
logger = logging.getLogger("server-bench")
|
||||
|
||||
|
||||
def get_prompts(n_prompts: int) -> list[str]:
    """Load the benchmark prompts from the "question" column of the MMLU test split.

    Args:
        n_prompts: Maximum number of questions to return. A negative value
            returns every question in the split.

    Returns:
        The selected questions, in dataset order.
    """
    logger.info("Loading MMLU dataset...")
    questions = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]  # type: ignore
    return questions if n_prompts < 0 else questions[:n_prompts]
|
||||
|
||||
|
||||
def get_server(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int) -> dict:
    """Start a llama.cpp server subprocess and wait until its /health endpoint answers with HTTP 200.

    Args:
        path_server: Path to the server binary.
        path_model: Path to the model passed to the server via --model.
        path_log: File that receives the server's stdout/stderr. If None the output is discarded.
        port: Port the server listens on.
        n_gpu_layers: Value passed to --n-gpu-layers.
        parallel: Number of server slots (--parallel).
        ctx_size: Context size per slot; the server is started with parallel * ctx_size.

    Returns:
        A dict with the keys "process" (the Popen handle), "address" (base URL) and
        "fout" (the open log file, or subprocess.DEVNULL).

    Raises:
        RuntimeError: If the server exits during startup or cannot be reached after 10 connection attempts.
    """
    logger.info("Starting the llama.cpp server...")
    address = f"http://localhost:{port}"

    popen_args: list[str] = [
        path_server,
        "--flash-attn",
        "--n-gpu-layers", str(n_gpu_layers),
        "--parallel", str(parallel),
        "--ctx-size", str(parallel * ctx_size),
        "--model", path_model,
        "--port", str(port),
        "--swa-full",  # FIXME performance bad otherwise
        # "--attn-streams",
    ]
    # Write to the path the caller asked for instead of a hard-coded file name.
    fout = open(path_log, "w") if path_log is not None else subprocess.DEVNULL

    process = None
    try:
        process = subprocess.Popen(popen_args, stdout=fout, stderr=subprocess.STDOUT)

        n_failures: int = 0
        while True:
            try:
                sleep(1.0)
                exit_code = process.poll()
                if exit_code is not None:
                    raise RuntimeError(f"llama.cpp server for {path_model} exited unexpectedly with exit code {exit_code}")
                response = requests.get(f"{address}/health")
                if response.status_code == 200:
                    break
                # Any other status code: keep polling (only connection errors count as failures).
            except requests.ConnectionError:
                n_failures += 1
                if n_failures >= 10:
                    raise RuntimeError(f"llama.cpp server for {path_model} is not healthy after 10 seconds")
    except BaseException:
        # The caller never receives the handles on failure, so release them here
        # to avoid leaving an orphaned server process or an open log file behind.
        if process is not None and process.poll() is None:
            process.terminate()
            process.wait()
        if not isinstance(fout, int):
            fout.close()
        raise

    return {"process": process, "address": address, "fout": fout}
|
||||
|
||||
|
||||
def get_prompt_length(data: dict) -> int:
    """Return the number of tokens the server reports for a prompt after chat templating.

    The prompt is first rendered through the server's /apply-template endpoint and the
    rendered text is then sent to /tokenize.

    Args:
        data: Dict providing "session" (object with a requests-style post()),
            "server_address" (base URL) and "prompt" (the raw user message).

    Raises:
        RuntimeError: If either request returns a status code other than 200.
    """
    http = data["session"]
    base_url: str = data["server_address"]

    templated = http.post(
        f"{base_url}/apply-template",
        json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
    )
    if templated.status_code != 200:
        raise RuntimeError(f"Server returned status code {templated.status_code}: {templated.text}")
    rendered: str = json.loads(templated.text)["prompt"]

    tokenized = http.post(
        f"{base_url}/tokenize",
        json={"content": rendered, "add_special": True}
    )
    if tokenized.status_code != 200:
        raise RuntimeError(f"Server returned status code {tokenized.status_code}: {tokenized.text}")

    return len(json.loads(tokenized.text)["tokens"])
|
||||
|
||||
|
||||
def send_prompt(data: dict) -> tuple[float, list[float]]:
    """Send one prompt to the server's streaming /completion endpoint and record arrival times.

    The prompt is rendered through /apply-template first, then submitted with "stream": True.

    Args:
        data: Dict providing "session" (object with a requests-style post()),
            "server_address", "prompt", "seed" and "n_predict".

    Returns:
        A tuple (prompt_ms, token_arrival_times): prompt_ms is taken from the "timings" object
        of the last "data: " line; token_arrival_times holds the time() value at which every
        "data: " line except the last one was received.

    Raises:
        RuntimeError: If a request returns a status code other than 200, or the completion
            stream contains no "data: " line.
    """
    session = data["session"]
    server_address: str = data["server_address"]

    response = session.post(
        f"{server_address}/apply-template",
        json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
    )
    if response.status_code != 200:
        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
    prompt: str = json.loads(response.text)["prompt"]

    json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
    response = session.post(f"{server_address}/completion", json=json_data, stream=True)

    # Check the status before iterating: reading the stream consumes the body, after which
    # the response text needed for the error message can no longer be read.
    if response.status_code != 200:
        raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")

    last_valid_line: str = ""
    token_arrival_times: list[float] = []
    for line in response.iter_lines(decode_unicode=True):
        if not line.startswith("data: "):
            continue
        last_valid_line = line
        token_arrival_times.append(time())
    # The last "data: " line is the one the timings are read from below; it is not counted as a token.
    token_arrival_times = token_arrival_times[:-1]

    if not last_valid_line:
        raise RuntimeError("Server returned no data lines for the completion request")
    timings: dict = json.loads(last_valid_line[6:])["timings"]  # strip the "data: " prefix

    return (timings["prompt_ms"], token_arrival_times)
|
||||
|
||||
|
||||
def benchmark(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int, n_prompts: int, n_predict: int):
    """Run the server throughput benchmark, log summary statistics and save two plots.

    A server is started, the token length of every prompt is queried, and then all prompts are
    submitted concurrently with parallel + 1 worker threads. The server is shut down before the
    statistics are computed. The plots are written to "prompt_time.png" and "gen_rate.png" in
    the current working directory.

    Args:
        path_server, path_model, path_log, port, n_gpu_layers, parallel, ctx_size:
            Forwarded to get_server().
        n_prompts: Number of prompts to evaluate, forwarded to get_prompts().
        n_predict: Maximum number of tokens to predict per prompt.
    """
    num_workers: int = parallel + 1
    prompts: list[str] = get_prompts(n_prompts)

    server: Optional[dict] = None
    session = None
    try:
        server = get_server(path_server, path_model, path_log, port, n_gpu_layers, parallel, ctx_size)
        server_address: str = server["address"]

        adapter = requests.adapters.HTTPAdapter(pool_connections=num_workers, pool_maxsize=num_workers)  # type: ignore
        session = requests.Session()
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        data: list[dict] = []
        for i, p in enumerate(prompts):
            data.append({"session": session, "server_address": server_address, "prompt": p, "n_predict": n_predict, "seed": i})

        logger.info("Getting the prompt lengths...")
        prompt_n = [get_prompt_length(d) for d in data]

        logger.info("Starting the benchmark...\n")
        t0 = time()
        results: list[tuple[float, list[float]]] = thread_map(send_prompt, data, max_workers=num_workers, chunksize=1)
    finally:
        if server is not None:
            server["process"].terminate()
            server["process"].wait()
            # Close the server log file, if one was opened (DEVNULL is a plain int without close()).
            fout = server.get("fout")
            if hasattr(fout, "close"):
                fout.close()
        if session is not None:
            session.close()

    prompt_ms = []
    token_t = []
    depth_sum: int = 0
    for pn, (pms, tat) in zip(prompt_n, results):
        prompt_ms.append(pms)
        token_t += tat
        n_tokens: int = len(tat)
        # Sum of the context depths at which the tokens of this request were generated.
        depth_sum += n_tokens * pn
        depth_sum += n_tokens * (n_tokens + 1) // 2
    prompt_n = np.array(prompt_n, dtype=np.int64)
    prompt_ms = np.array(prompt_ms, dtype=np.float64)
    token_t = np.array(token_t, dtype=np.float64)

    # Make arrival times relative to the benchmark start.
    token_t -= t0
    token_t_last = np.max(token_t)

    # Use the number of requests actually processed: n_prompts may be negative ("all prompts")
    # or larger than the number of prompts available in the dataset.
    n_requests: int = len(results)

    logger.info("")
    logger.info(f"Benchmark duration:                {token_t_last:.2f} s")
    logger.info(f"Request throughput:                {n_requests / token_t_last:.2f} requests/s = {n_requests / (token_t_last/60):.2f} requests/min")
    logger.info(f"Total prompt length:               {np.sum(prompt_n)} tokens")
    logger.info(f"Average prompt length:             {np.mean(prompt_n):.2f} tokens")
    logger.info(f"Average prompt latency:            {np.mean(prompt_ms):.2f} ms")
    logger.info(f"Average prompt speed:              {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
    logger.info(f"Total generated tokens:            {token_t.shape[0]}")
    logger.info(f"Average generation depth:          {depth_sum / token_t.shape[0]:.2f} tokens")
    logger.info(f"Average total generation speed:    {token_t.shape[0] / token_t_last:.2f} tokens/s")
    logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")

    plt.figure()
    plt.scatter(prompt_n, prompt_ms, s=10.0, marker=".", alpha=0.25)
    plt.xlim(0, 1.05 * np.max(prompt_n))
    plt.ylim(0, 1.05 * np.max(prompt_ms))
    plt.title(path_model)
    plt.xlabel("Prompt length [tokens]")
    plt.ylabel("Time to first token [ms]")
    plt.savefig("prompt_time.png", dpi=240)

    # One histogram bin per second of benchmark time.
    bin_max = np.ceil(token_t_last) + 1
    plt.figure()
    plt.hist(token_t, np.arange(0, bin_max))
    plt.xlim(0, bin_max + 1)
    plt.title(path_model)
    plt.xlabel("Time [s]")
    plt.ylabel("Num. tokens generated per second")
    plt.savefig("gen_rate.png", dpi=240)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: every option maps 1:1 onto a keyword argument of benchmark().
    parser = argparse.ArgumentParser(
        description="Tool for benchmarking the throughput of the llama.cpp HTTP server. "
        "Results are printed to console and visualized as plots (saved to current working directory).")
    parser.add_argument("--path_server", type=str, default="llama-server", help="Path to the llama.cpp server binary")
    parser.add_argument("--path_model", type=str, required=True, help="Path to the model to use for the benchmark")
    # The help text previously duplicated the one of --path_model.
    parser.add_argument("--path_log", type=str, default=None, help="Path to a log file for the server output; if not set, the server output is discarded")
    parser.add_argument("--port", type=int, default=18725, help="Port to use for the server during the benchmark")
    parser.add_argument("--n_gpu_layers", type=int, default=999, help="Number of GPU layers for the server")
    parser.add_argument("--parallel", type=int, default=16, help="Number of slots for the server")
    parser.add_argument("--ctx_size", type=int, default=4096, help="Server context size per slot")
    parser.add_argument("--n_prompts", type=int, default=1000, help="Number of prompts to evaluate")
    parser.add_argument("--n_predict", type=int, default=2048, help="Max. number of tokens to predict per prompt")
    args = parser.parse_args()
    benchmark(**vars(args))
|
||||
@@ -34,6 +34,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_PHI3, "phi3" },
|
||||
{ LLM_ARCH_PHIMOE, "phimoe" },
|
||||
{ LLM_ARCH_PLAMO, "plamo" },
|
||||
{ LLM_ARCH_PLAMO2, "plamo2" },
|
||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
||||
@@ -784,6 +785,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PLAMO2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
|
||||
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
|
||||
{ LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
|
||||
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
|
||||
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
|
||||
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
|
||||
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
||||
{ LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
|
||||
{ LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
|
||||
{ LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_CODESHELL,
|
||||
{
|
||||
@@ -2094,6 +2125,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
|
||||
switch (arch) {
|
||||
case LLM_ARCH_JAMBA:
|
||||
case LLM_ARCH_FALCON_H1:
|
||||
case LLM_ARCH_PLAMO2:
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
case LLM_ARCH_LFM2:
|
||||
return true;
|
||||
|
||||
@@ -38,6 +38,7 @@ enum llm_arch {
|
||||
LLM_ARCH_PHI3,
|
||||
LLM_ARCH_PHIMOE,
|
||||
LLM_ARCH_PLAMO,
|
||||
LLM_ARCH_PLAMO2,
|
||||
LLM_ARCH_CODESHELL,
|
||||
LLM_ARCH_ORION,
|
||||
LLM_ARCH_INTERNLM2,
|
||||
|
||||
+1
-1
@@ -170,7 +170,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
||||
// EXAONE-3.0-7.8B-Instruct
|
||||
return LLM_CHAT_TEMPLATE_EXAONE_3;
|
||||
} else if (tmpl_contains("rwkv-world")) {
|
||||
} else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
|
||||
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
||||
} else if (tmpl_contains("<|start_of_role|>")) {
|
||||
return LLM_CHAT_TEMPLATE_GRANITE;
|
||||
|
||||
+13
-2
@@ -731,7 +731,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int32_t n_vocab = model.vocab.n_tokens();
|
||||
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
|
||||
@@ -791,10 +792,20 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
}
|
||||
}
|
||||
|
||||
auto * t_logits = res->get_logits();
|
||||
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
|
||||
|
||||
// extract logits
|
||||
if (logits && t_logits) {
|
||||
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
|
||||
GGML_ASSERT(backend_res != nullptr);
|
||||
GGML_ASSERT(logits != nullptr);
|
||||
|
||||
ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float));
|
||||
}
|
||||
|
||||
// extract embeddings
|
||||
if (t_embd) {
|
||||
if (embd && t_embd) {
|
||||
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
|
||||
GGML_ASSERT(backend_embd != nullptr);
|
||||
|
||||
|
||||
@@ -935,6 +935,33 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PLAMO2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
// Load Mamba SSM parameters
|
||||
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||||
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
||||
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
||||
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
||||
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
||||
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 16: type = LLM_TYPE_1B; break;
|
||||
case 32:
|
||||
if (hparams.n_embd == 2048) {
|
||||
type = LLM_TYPE_2B;
|
||||
} else if (hparams.n_embd == 4096) {
|
||||
type = LLM_TYPE_8B;
|
||||
}
|
||||
break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -2938,6 +2965,73 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PLAMO2:
|
||||
{
|
||||
const uint32_t d_conv = hparams.ssm_d_conv;
|
||||
const uint32_t d_state = hparams.ssm_d_state;
|
||||
const uint32_t num_heads = hparams.ssm_dt_rank;
|
||||
const uint32_t intermediate_size = hparams.ssm_d_inner;
|
||||
const uint32_t head_dim = intermediate_size / num_heads;
|
||||
const uint32_t qk_dim = head_dim;
|
||||
const uint32_t v_dim = head_dim;
|
||||
const int64_t num_attention_heads = hparams.n_head();
|
||||
const int64_t q_num_heads = num_attention_heads;
|
||||
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
bool is_mamba_layer = hparams.is_recurrent(i);
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (is_mamba_layer) {
|
||||
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
|
||||
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
|
||||
|
||||
layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
|
||||
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
|
||||
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
|
||||
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
|
||||
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
|
||||
|
||||
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
|
||||
|
||||
layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
|
||||
layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
|
||||
layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
|
||||
} else {
|
||||
const int64_t num_key_value_heads = hparams.n_head_kv(i);
|
||||
const int64_t k_num_heads = num_key_value_heads;
|
||||
const int64_t v_num_heads = num_key_value_heads;
|
||||
const int64_t q_proj_dim = q_num_heads * qk_dim;
|
||||
const int64_t k_proj_dim = k_num_heads * qk_dim;
|
||||
const int64_t v_proj_dim = v_num_heads * v_dim;
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
|
||||
}
|
||||
|
||||
// All layers have post-attention norm, FFN norm, and FFN tensors
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
|
||||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -5209,6 +5303,7 @@ void llama_model::print_info() const {
|
||||
arch == LLM_ARCH_MAMBA2 ||
|
||||
arch == LLM_ARCH_JAMBA ||
|
||||
arch == LLM_ARCH_FALCON_H1 ||
|
||||
arch == LLM_ARCH_PLAMO2 ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID) {
|
||||
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
||||
@@ -15476,6 +15571,320 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_build_plamo2 : public llm_graph_context_mamba {
|
||||
llm_build_plamo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
// {n_embd, n_tokens}
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
cb(inpL, "embedding_output", -1);
|
||||
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_hybrid = build_inp_mem_hybrid();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * residual = inpL;
|
||||
|
||||
// ggml_graph_add_node(gf, model.layers[il].attn_norm);
|
||||
// cb(model.layers[il].attn_norm, "attn_norm", il);
|
||||
|
||||
// pre_mixer_norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
|
||||
// check if this layer is Mamba or Attention
|
||||
bool is_mamba_layer = hparams.is_recurrent(il);
|
||||
|
||||
if (is_mamba_layer) {
|
||||
// PLaMo-2 Mamba layer
|
||||
cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il);
|
||||
} else {
|
||||
// PLaMo-2 Attention layer
|
||||
cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, gf, cur, model, il);
|
||||
}
|
||||
|
||||
// post_mixer_norm
|
||||
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
|
||||
// residual connection
|
||||
cur = ggml_add(ctx0, cur, residual);
|
||||
cb(cur, "attn_residual", il);
|
||||
residual = cur;
|
||||
|
||||
// pre-ffn norm
|
||||
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_pre_norm", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// post ffn norm
|
||||
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_post_norm", il);
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
||||
}
|
||||
|
||||
// residual connection
|
||||
cur = ggml_add(ctx0, cur, residual);
|
||||
cb(cur, "ffn_residual", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
// final norm
|
||||
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
// Explicitly mark as output tensor to ensure proper backend assignment
|
||||
ggml_set_output(cur);
|
||||
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
private:
|
||||
// Builds the attention mixer of one PLaMo-2 layer.
//
// `cur` is projected through the layer's fused QKV weight, the result is split into
// Q, K and V views, Q and K are RMS-normalized and rotated with RoPE, and the three
// tensors are passed to build_attn together with the layer's output projection.
// Returns the attention output tensor.
ggml_tensor * build_plamo2_attn_layer(
        llm_graph_input_attn_kv_unified * inp,
        ggml_tensor * inp_pos,
        ggml_cgraph * gf,
        ggml_tensor * cur,
        const llama_model & model,
        int il) {

    // self-attention
    {
        // PLaMo-2 uses combined QKV tensor
        ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
        cb(qkv, "qkv", il);

        // split QKV tensor into Q, K, V
        const int64_t n_embd_head_q = hparams.n_embd_head_k;
        const int64_t n_embd_head_k = hparams.n_embd_head_k;
        const int64_t n_embd_head_v = hparams.n_embd_head_v;
        const int32_t n_head_kv     = hparams.n_head_kv(il);

        // element offsets of the Q, K and V sections within one row of qkv
        const int64_t q_offset = 0;
        const int64_t k_offset = n_embd_head_q * n_head;
        const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;

        // Derive all byte strides and offsets from the tensor's own element size
        // instead of mixing sizeof(float) with ggml_element_size().
        const auto elem_size = ggml_element_size(qkv);

        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head,    n_tokens, n_embd_head_q * elem_size, qkv->nb[1], q_offset * elem_size);
        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * elem_size, qkv->nb[1], k_offset * elem_size);
        ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * elem_size));

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);

        Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
        cb(Qcur, "Qcur_normed", il);

        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
                );

        Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
        cb(Kcur, "Kcur_normed", il);

        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow
                );

        // attention scale passed as 1.0f
        cur = build_attn(inp, gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
    }

    cb(cur, "attn_out", il);

    return cur;
}
|
||||
|
||||
// Builds the graph nodes of one PLaMo-2 Mamba (SSM) layer for layer index `il`.
//
// inp    - recurrent-state graph input; its memory context supplies the
//          per-layer conv (r) and ssm (s) state tensors and the write head
// gf     - graph that the state write-back copies are expanded into
// cur    - layer input, reshaped below to {ne0, n_seq_tokens, n_seqs}
// model  - source of the layer weights (ssm_in, ssm_conv1d, ssm_x, ...)
// ubatch - micro-batch; must have equal-length sequences
// il     - layer index
//
// Returns the layer output reshaped to 2-D {ne0, n_seq_tokens * n_seqs}.
ggml_tensor * build_plamo2_mamba_layer(
        llm_graph_input_rs * inp,
               ggml_cgraph * gf,
               ggml_tensor * cur,
         const llama_model & model,
        const llama_ubatch & ubatch,
                       int   il) {
    const auto * mctx_cur = inp->mctx;
    const auto   kv_head  = mctx_cur->get_head();

    // dimensions taken from the hyper-parameters and the micro-batch
    const int64_t d_conv       = hparams.ssm_d_conv;
    const int64_t d_inner      = hparams.ssm_d_inner;
    const int64_t d_state      = hparams.ssm_d_state;
    const int64_t n_heads      = hparams.ssm_dt_rank;
    const int64_t head_dim     = d_inner / n_heads;
    const int64_t n_group      = hparams.ssm_n_group;
    const int64_t n_seqs       = ubatch.n_seqs;
    const int64_t n_seq_tokens = ubatch.n_seq_tokens;

    GGML_ASSERT(n_seqs != 0);
    GGML_ASSERT(ubatch.equal_seqs);
    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

    const auto & layer = model.layers[il];

    ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
    ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);

    // conv state for the sequences of this micro-batch
    ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
    conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);

    // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
    cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);

    // input projection producing z and x interleaved per head
    ggml_tensor * in_proj = build_lora_mm(layer.ssm_in, cur);
    cb(in_proj, "mamba_in_proj", il);
    in_proj = ggml_permute(ctx0, in_proj, 0, 2, 1, 3);
    in_proj = ggml_reshape_4d(ctx0, in_proj, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
    cb(in_proj, "mamba_in_proj_out", il);

    // x is the second half of every head (offset head_dim elements), z the first half
    ggml_tensor * x_split = ggml_view_4d(ctx0, in_proj, head_dim, n_heads, n_seq_tokens, n_seqs,
            in_proj->nb[1], in_proj->nb[2], in_proj->nb[3], head_dim*ggml_element_size(in_proj));
    x_split = ggml_cont(ctx0, x_split);
    x_split = ggml_reshape_3d(ctx0, x_split, head_dim * n_heads, n_seq_tokens, n_seqs);
    cb(x_split, "mamba_x_split", il);

    ggml_tensor * gate = ggml_view_4d(ctx0, in_proj, head_dim, n_heads, n_seq_tokens, n_seqs,
            in_proj->nb[1], in_proj->nb[2], in_proj->nb[3], 0);
    cb(gate, "mamba_z_split", il);

    // ---- conv1d -----------------------------------------------------------
    ggml_tensor * x_act = nullptr;
    {
        // flatten x to 2-D and prepend the cached conv state along dim 0
        // NOTE(review): x is 2-D here while conv is 3-D over n_seqs - confirm
        // that the concat is valid when n_seqs > 1.
        ggml_tensor * x_flat = ggml_view_2d(ctx0, x_split, d_inner, n_seq_tokens * n_seqs, d_inner * x_split->nb[0], 0);
        ggml_tensor * x_flat_t = ggml_transpose(ctx0, x_flat);
        ggml_tensor * conv_in  = ggml_concat(ctx0, conv, x_flat_t, 0);
        cb(conv_in, "mamba_conv1d_input", il);

        // the last (d_conv - 1) columns become the new cached conv state
        ggml_tensor * conv_tail = ggml_view_3d(ctx0, conv_in, d_conv - 1, d_inner, n_seqs,
                conv_in->nb[1], conv_in->nb[2], n_seq_tokens*(conv_in->nb[0]));

        ggml_tensor * conv_dst = ggml_view_1d(ctx0, conv_states_all,
                (d_conv - 1)*(d_inner)*(n_seqs),
                kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all));
        ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_tail, conv_dst));

        // 1D convolution followed by SiLU
        x_act = ggml_ssm_conv(ctx0, conv_in, layer.ssm_conv1d);
        cb(x_act, "mamba_conv1d", il);

        x_act = ggml_silu(ctx0, x_act);
        cb(x_act, "mamba_conv1d_silu", il);
    }

    // ---- SSM --------------------------------------------------------------
    {
        // projection whose rows are laid out as [B | C | dt]
        ggml_tensor * bcdt = build_lora_mm(layer.ssm_x, x_act);
        cb(bcdt, "mamba_bcdt_proj", il);

        const int64_t dt_dim   = std::max(64, int(hparams.n_embd / 16));
        const size_t  bcdt_esz = ggml_element_size(bcdt);

        ggml_tensor * B  = ggml_view_3d(ctx0, bcdt, d_state, n_seq_tokens, n_seqs, bcdt->nb[1], bcdt->nb[2], 0);
        ggml_tensor * C  = ggml_view_3d(ctx0, bcdt, d_state, n_seq_tokens, n_seqs, bcdt->nb[1], bcdt->nb[2], bcdt_esz*d_state);
        ggml_tensor * dt = ggml_view_3d(ctx0, bcdt, dt_dim,  n_seq_tokens, n_seqs, bcdt->nb[1], bcdt->nb[2], bcdt_esz*(2*d_state));
        cb(B,  "mamba_B_raw",  il);
        cb(C,  "mamba_C_raw",  il);
        cb(dt, "mamba_dt_raw", il);

        // RMS-normalise B, C and dt with their per-layer norm weights
        B  = build_norm(B,  layer.ssm_b_norm,  NULL, LLM_NORM_RMS, il);
        C  = build_norm(C,  layer.ssm_c_norm,  NULL, LLM_NORM_RMS, il);
        dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
        cb(B,  "mamba_B_normed",  il);
        cb(C,  "mamba_C_normed",  il);
        cb(dt, "mamba_dt_normed", il);

        // dt projection plus bias
        dt = build_lora_mm(layer.ssm_dt, dt);
        dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
        cb(dt, "mamba_dt_proj", il);

        ggml_tensor * A = ggml_reshape_2d(ctx0, layer.ssm_a, 1, n_heads);
        cb(A, "mamba_A", il);

        // byte strides of a contiguous {head_dim, n_heads, n_seq_tokens, n_seqs} layout
        const size_t x_esz   = ggml_element_size(x_act);
        const size_t nb_head = head_dim * x_esz;
        const size_t nb_tok  = head_dim * n_heads * x_esz;
        const size_t nb_seq  = head_dim * n_heads * n_seq_tokens * x_esz;

        ggml_tensor * x_heads = ggml_view_4d(ctx0, x_act, head_dim, n_heads, n_seq_tokens, n_seqs, nb_head, nb_tok, nb_seq, 0);
        B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
        C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);

        // Callback handed to build_rs: it receives the state tensor and the row
        // ids, so the scan reads the states directly instead of a copy of them.
        auto scan_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
            ggml_tensor * states_4d = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
            return ggml_ssm_scan(ctx, states_4d, x_heads, dt, A, B, C, ids);
        };

        ggml_tensor * scan_out = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, scan_rows);
        cb(scan_out, "mamba_ssm_scan", il);

        // the updated states sit in scan_out right after the y values; write them back
        ggml_tensor * state_src = ggml_view_1d(ctx0, scan_out, d_state*d_inner*n_seqs, x_heads->nb[3]*x_heads->ne[3]);
        ggml_tensor * state_dst = ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
                kv_head*d_state*d_inner*ggml_element_size(ssm_states_all));
        ggml_build_forward_expand(gf, ggml_cpy(ctx0, state_src, state_dst));

        ggml_tensor * y = ggml_view_4d(ctx0, scan_out, head_dim, n_heads, n_seq_tokens, n_seqs, nb_head, nb_tok, nb_seq, 0);
        cb(y, "mamba_y_view", il);

        // y += x * D, then gate with z
        ggml_tensor * D  = ggml_reshape_2d(ctx0, layer.ssm_d, 1, n_heads);
        ggml_tensor * xD = ggml_mul(ctx0, x_heads, D);
        y = ggml_add(ctx0, y, xD);
        cb(y, "mamba_y_add_d", il);

        y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, gate), y);
        cb(y, "mamba_y_swiglu_z", il);

        // merge the head dims again and apply the output projection
        y   = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
        cur = build_lora_mm(layer.ssm_out, y);
        cb(cur, "mamba_out_proj", il);
    }

    // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
    cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
    cb(cur, "mamba_out", il);

    return cur;
}
|
||||
};
|
||||
|
||||
struct llm_build_arcee : public llm_graph_context {
|
||||
llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@@ -16262,6 +16671,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_plamo>(*this, params, gf);
|
||||
} break;
|
||||
case LLM_ARCH_PLAMO2:
|
||||
{
|
||||
llm = std::make_unique<llm_build_plamo2>(*this, params, gf);
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
llm = std::make_unique<llm_build_gpt2>(*this, params, gf);
|
||||
@@ -16651,6 +17064,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_PHI3:
|
||||
case LLM_ARCH_PHIMOE:
|
||||
case LLM_ARCH_PLAMO:
|
||||
case LLM_ARCH_PLAMO2:
|
||||
case LLM_ARCH_GEMMA:
|
||||
case LLM_ARCH_GEMMA2:
|
||||
case LLM_ARCH_GEMMA3:
|
||||
|
||||
+1
-2
@@ -884,8 +884,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
|
||||
if (qtype != new_type) {
|
||||
LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
|
||||
new_type = qtype;
|
||||
break; // if two or more types are specified for the tensor, first match wins
|
||||
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+341
-8
@@ -11,6 +11,7 @@
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <cstdarg>
|
||||
#include <cstring>
|
||||
#include <forward_list>
|
||||
@@ -1196,6 +1197,284 @@ private:
|
||||
const llm_tokenizer_rwkv & tokenizer;
|
||||
};
|
||||
|
||||
struct llm_tokenizer_plamo2 : llm_tokenizer {
|
||||
llm_tokenizer_plamo2(const llama_vocab & vocab) {
|
||||
build(vocab);
|
||||
}
|
||||
|
||||
void build(const llama_vocab & vocab) {
|
||||
// Reset internal structures
|
||||
tokens_.clear();
|
||||
bytes_.assign(256, 0);
|
||||
to_suffix_id_.clear();
|
||||
table_.clear();
|
||||
|
||||
// Build token list and byte mapping
|
||||
std::unordered_map<std::string, float> suffix_to_score;
|
||||
std::unordered_map<std::string, llama_token> token_to_id;
|
||||
|
||||
for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
|
||||
const auto & entry = vocab.get_token_data(token_id);
|
||||
tokens_.push_back(entry.text);
|
||||
token_to_id[entry.text] = static_cast<llama_token>(token_id);
|
||||
|
||||
// Handle byte tokens
|
||||
if (vocab.is_byte(token_id)) {
|
||||
if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
|
||||
std::string hex_str = entry.text.substr(3, 2);
|
||||
int byte_val = std::stoi(hex_str, nullptr, 16);
|
||||
bytes_[byte_val] = static_cast<llama_token>(token_id);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add token and all its suffixes to suffix_to_score
|
||||
suffix_to_score[entry.text] = entry.score;
|
||||
|
||||
// Extract suffixes character by character (UTF-8 aware)
|
||||
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
|
||||
for (size_t i = 1; i < cpts.size(); ++i) {
|
||||
std::string suffix;
|
||||
for (size_t j = i; j < cpts.size(); ++j) {
|
||||
suffix += unicode_cpt_to_utf8(cpts[j]);
|
||||
}
|
||||
if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
|
||||
suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check that all byte tokens are set
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
if (bytes_[i] == 0) {
|
||||
throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
|
||||
}
|
||||
}
|
||||
|
||||
// Build suffix list in lexicographical order of reversed strings
|
||||
std::vector<std::string> suffixes;
|
||||
for (const auto & pair : suffix_to_score) {
|
||||
suffixes.push_back(pair.first);
|
||||
}
|
||||
suffixes.push_back(""); // Empty suffix
|
||||
|
||||
std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
|
||||
std::string rev_a(a.rbegin(), a.rend());
|
||||
std::string rev_b(b.rbegin(), b.rend());
|
||||
return rev_a < rev_b;
|
||||
});
|
||||
|
||||
// Build suffix_to_id and to_suffix_id_
|
||||
std::unordered_map<std::string, int32_t> suffix_to_id;
|
||||
int32_t num_pieces = 0;
|
||||
|
||||
for (const auto & suffix : suffixes) {
|
||||
suffix_to_id[suffix] = num_pieces;
|
||||
if (!suffix.empty()) {
|
||||
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
|
||||
|
||||
std::string remaining;
|
||||
for (size_t i = 1; i < cpts.size(); ++i) {
|
||||
remaining += unicode_cpt_to_utf8(cpts[i]);
|
||||
}
|
||||
|
||||
int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
|
||||
to_suffix_id_[piece_code] = num_pieces;
|
||||
|
||||
// Count number of pieces for this suffix
|
||||
int32_t pieces_for_suffix = 1; // sentinel row
|
||||
for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
|
||||
std::string piece;
|
||||
for (int32_t i = 0; i < piece_length; ++i) {
|
||||
piece += unicode_cpt_to_utf8(cpts[i]);
|
||||
}
|
||||
if (suffix_to_score.find(piece) != suffix_to_score.end()) {
|
||||
pieces_for_suffix++;
|
||||
}
|
||||
}
|
||||
num_pieces += pieces_for_suffix;
|
||||
} else {
|
||||
num_pieces++; // Empty suffix contributes one piece (sentinel row)
|
||||
}
|
||||
}
|
||||
|
||||
// Build flattened table
|
||||
table_.resize(num_pieces, std::vector<int32_t>(4, 0));
|
||||
int32_t table_idx = 0;
|
||||
|
||||
for (const auto & suffix : suffixes) {
|
||||
// Add all prefixes of the suffix to the table (in decreasing order of length)
|
||||
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
|
||||
for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
|
||||
std::string piece;
|
||||
for (int32_t i = 0; i < piece_length; ++i) {
|
||||
piece += unicode_cpt_to_utf8(cpts[i]);
|
||||
}
|
||||
|
||||
auto score_it = suffix_to_score.find(piece);
|
||||
if (score_it == suffix_to_score.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
|
||||
auto token_it = token_to_id.find(piece);
|
||||
table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
|
||||
|
||||
float score = score_it->second;
|
||||
table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
|
||||
static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
|
||||
table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
|
||||
|
||||
table_idx++;
|
||||
}
|
||||
|
||||
// Add sentinel row
|
||||
table_[table_idx][TABLE_PIECE_LENGTH] = 1;
|
||||
table_[table_idx][TABLE_TOKEN_ID] = -1;
|
||||
table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
|
||||
table_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<llama_token> encode(const std::string & text) const {
|
||||
std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
|
||||
// Skip the first code point if it is a BOM (Byte Order Mark)
|
||||
if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
|
||||
unicode_data.erase(unicode_data.begin());
|
||||
}
|
||||
|
||||
if (unicode_data.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
const size_t data_len = unicode_data.size();
|
||||
|
||||
// Initialize scores array (dynamic programming)
|
||||
std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
|
||||
scores[data_len] = 0;
|
||||
|
||||
// Path array to track best tokenization
|
||||
std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
|
||||
|
||||
int32_t suffix_id = 0;
|
||||
|
||||
// Process from end to beginning
|
||||
for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
|
||||
uint32_t c = unicode_data[i];
|
||||
|
||||
// Find next suffix ID
|
||||
for (size_t p = suffix_id; p < table_.size(); ++p) {
|
||||
int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
|
||||
auto it = to_suffix_id_.find(piece_code);
|
||||
suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
|
||||
|
||||
if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Update best path
|
||||
for (size_t p = suffix_id; p < table_.size(); ++p) {
|
||||
int32_t score = table_[p][TABLE_SCORE];
|
||||
if (score > INVALID_SCORE) {
|
||||
int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
|
||||
int64_t s = scores[i + piece_length] - score;
|
||||
|
||||
if (s < scores[i]) {
|
||||
scores[i] = s;
|
||||
path[i][PATH_TOKEN_LENGTH] = piece_length;
|
||||
path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
|
||||
path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
|
||||
|
||||
if (score == UNKNOWN_SCORE) {
|
||||
// Add UTF-8 byte count
|
||||
path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (score == UNKNOWN_SCORE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Decode the best path
|
||||
std::vector<llama_token> token_ids;
|
||||
token_ids.reserve(path[0][PATH_NUM_TOKENS]);
|
||||
|
||||
int pos = 0;
|
||||
while (pos < static_cast<int>(data_len)) {
|
||||
if (path[pos][PATH_TOKEN_ID] >= 0) {
|
||||
token_ids.push_back(path[pos][PATH_TOKEN_ID]);
|
||||
} else {
|
||||
// Fall back to byte tokens
|
||||
uint32_t c = unicode_data[pos];
|
||||
int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
|
||||
|
||||
for (int i = 0; i < s; ++i) {
|
||||
uint8_t b;
|
||||
if (s == 1) {
|
||||
b = c;
|
||||
} else {
|
||||
if (i == 0) {
|
||||
b = (0xF00 >> s) & 0xFF;
|
||||
} else {
|
||||
b = 0x80;
|
||||
}
|
||||
}
|
||||
token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
|
||||
}
|
||||
}
|
||||
|
||||
assert(path[pos][PATH_TOKEN_LENGTH] > 0);
|
||||
pos += path[pos][PATH_TOKEN_LENGTH];
|
||||
}
|
||||
|
||||
return token_ids;
|
||||
}
|
||||
private:
|
||||
// Constants for table structure
|
||||
static constexpr int32_t TABLE_PIECE_LENGTH = 0;
|
||||
static constexpr int32_t TABLE_TOKEN_ID = 1;
|
||||
static constexpr int32_t TABLE_SCORE = 2;
|
||||
static constexpr int32_t TABLE_PIECE_ID = 3;
|
||||
|
||||
// Constants for path array
|
||||
static constexpr int32_t PATH_TOKEN_LENGTH = 0;
|
||||
static constexpr int32_t PATH_TOKEN_ID = 1;
|
||||
static constexpr int32_t PATH_NUM_TOKENS = 2;
|
||||
|
||||
// Score constants
|
||||
static constexpr int32_t INVALID_SCORE = -20000000;
|
||||
static constexpr int32_t UNKNOWN_SCORE = -10000000;
|
||||
|
||||
// List of tokens in the vocabulary
|
||||
std::vector<std::string> tokens_;
|
||||
|
||||
// Mapping from byte code point to token ID (for byte fallback)
|
||||
std::vector<llama_token> bytes_;
|
||||
|
||||
// Mapping from piece code to suffix ID
|
||||
std::unordered_map<int64_t, int32_t> to_suffix_id_;
|
||||
|
||||
// Flattened table representing the Trie structure
|
||||
// Each row contains: [piece_length, token_id, score, piece_id]
|
||||
std::vector<std::vector<int32_t>> table_;
|
||||
};
|
||||
|
||||
struct llm_tokenizer_plamo2_session {
|
||||
llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||
std::vector<llama_token> tokens = tokenizer.encode(text);
|
||||
output.insert(output.end(), tokens.begin(), tokens.end());
|
||||
}
|
||||
|
||||
private:
|
||||
const llm_tokenizer_plamo2 & tokenizer;
|
||||
};
|
||||
|
||||
//
|
||||
// impl
|
||||
//
|
||||
@@ -1499,6 +1778,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
special_unk_id = LLAMA_TOKEN_NULL;
|
||||
special_sep_id = LLAMA_TOKEN_NULL;
|
||||
special_pad_id = LLAMA_TOKEN_NULL;
|
||||
} else if (tokenizer_model == "plamo2") {
|
||||
type = LLAMA_VOCAB_TYPE_PLAMO2;
|
||||
|
||||
// PLaMo-2 default special tokens (these will be overridden by model config)
|
||||
special_bos_id = 1; // <|plamo:bos|>
|
||||
special_eos_id = 2; // <|plamo:eos|>
|
||||
special_unk_id = 0; // <|plamo:unk|>
|
||||
special_sep_id = LLAMA_TOKEN_NULL;
|
||||
special_pad_id = 3; // <|plamo:pad|>
|
||||
special_mask_id = LLAMA_TOKEN_NULL;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
||||
}
|
||||
@@ -2145,13 +2434,14 @@ enum llama_vocab_type llama_vocab::impl::get_type() const {
|
||||
|
||||
std::string llama_vocab::impl::type_name() const{
|
||||
switch (type) {
|
||||
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
|
||||
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
|
||||
default: return "unknown";
|
||||
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
|
||||
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
|
||||
case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2234,6 +2524,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
|
||||
case LLAMA_VOCAB_TYPE_RWKV:
|
||||
tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
|
||||
break;
|
||||
case LLAMA_VOCAB_TYPE_PLAMO2:
|
||||
tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("unsupported vocab type");
|
||||
}
|
||||
@@ -2566,6 +2859,23 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
|
||||
#endif
|
||||
|
||||
session.tokenize(text, output);
|
||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||
output.push_back(fragment.token);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_PLAMO2:
|
||||
{
|
||||
llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
|
||||
#endif
|
||||
@@ -2664,6 +2974,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
|
||||
memcpy(buf, result.data(), result.size());
|
||||
return (int)result.size();
|
||||
}
|
||||
case LLAMA_VOCAB_TYPE_PLAMO2: {
|
||||
// PLaMo-2 uses similar token handling as BPE/SPM
|
||||
if (vocab.is_byte(token)) {
|
||||
// Handle byte tokens like <0xXX>
|
||||
if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
|
||||
int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
|
||||
if (length < 1) {
|
||||
return -1;
|
||||
}
|
||||
buf[0] = static_cast<char>(hex_val);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Normal token - just copy the text
|
||||
std::string result = token_text;
|
||||
return _try_copy(result.data(), result.size());
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -2908,6 +3236,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
|
||||
case LLAMA_VOCAB_TYPE_BPE: {
|
||||
return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
|
||||
}
|
||||
case LLAMA_VOCAB_TYPE_PLAMO2: {
|
||||
// PLaMo-2 uses byte tokens in format <0xXX>
|
||||
char hex_str[8];
|
||||
snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
|
||||
return pimpl->token_to_id.at(hex_str);
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -3385,4 +3719,3 @@ int32_t llama_detokenize(
|
||||
bool unparse_special) {
|
||||
return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
|
||||
// increase max payload length to allow use of larger context size
|
||||
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
||||
// increase backlog size to avoid connection resets for >> 1 slots
|
||||
#define CPPHTTPLIB_LISTEN_BACKLOG 512
|
||||
// disable Nagle's algorithm
|
||||
#define CPPHTTPLIB_TCP_NODELAY true
|
||||
#include <cpp-httplib/httplib.h>
|
||||
|
||||
Reference in New Issue
Block a user