mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 01:57:43 +02:00
Compare commits
32 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ed54e32558 | |||
| a972faebed | |||
| 550cf726e1 | |||
| c252ce67c4 | |||
| 70cd37dbbe | |||
| acc1b008cf | |||
| 7057faf64b | |||
| fe1c92cd7b | |||
| e68aa10d8f | |||
| 0a16bf52e6 | |||
| 88021565f0 | |||
| 56920f5665 | |||
| b0d52998b9 | |||
| f28d4f4ac9 | |||
| 9fcb29f22f | |||
| 5ef22d281d | |||
| 233d773d02 | |||
| a885dcff11 | |||
| 663027fd54 | |||
| cf0e3ba150 | |||
| d413dca003 | |||
| 85ca66a746 | |||
| 3976dfbe00 | |||
| d36e61c580 | |||
| c97b5e5854 | |||
| 267e99867f | |||
| 3b15924d71 | |||
| 79bc429262 | |||
| c4df49a42d | |||
| 3c3635d2f2 | |||
| 61bdfd5298 | |||
| 01806e7771 |
@@ -16,6 +16,9 @@
|
||||
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
|
||||
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
|
||||
- Let authors, who are also collaborators, merge their own PRs
|
||||
- When merging a PR by a contributor, make sure you have a good understanding of the changes
|
||||
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
|
||||
|
||||
# Coding guidelines
|
||||
|
||||
|
||||
+139
@@ -631,6 +631,7 @@ const char * common_chat_format_name(common_chat_format format) {
|
||||
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
||||
case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
|
||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
||||
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
||||
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
|
||||
@@ -698,11 +699,13 @@ static void parse_json_tool_calls(
|
||||
size_t from = std::string::npos;
|
||||
auto first = true;
|
||||
while (true) {
|
||||
auto start_pos = builder.pos();
|
||||
auto res = function_regex_start_only && first
|
||||
? builder.try_consume_regex(*function_regex_start_only)
|
||||
: function_regex
|
||||
? builder.try_find_regex(*function_regex, from)
|
||||
: std::nullopt;
|
||||
|
||||
if (res) {
|
||||
std::string name;
|
||||
if (get_function_name) {
|
||||
@@ -737,6 +740,8 @@ static void parse_json_tool_calls(
|
||||
return;
|
||||
}
|
||||
throw common_chat_msg_partial_exception("incomplete tool call");
|
||||
} else {
|
||||
builder.move_to(start_pos);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -1388,6 +1393,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
// Pass thinking context for DeepSeek V3.1 template
|
||||
json additional_context = {
|
||||
{"thinking", inputs.enable_thinking},
|
||||
};
|
||||
|
||||
auto prompt = apply(tmpl, inputs,
|
||||
/* messages_override= */ inputs.messages,
|
||||
/* tools_override= */ std::nullopt,
|
||||
additional_context);
|
||||
data.prompt = prompt;
|
||||
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
|
||||
if (string_ends_with(data.prompt, "<think>")) {
|
||||
if (!inputs.enable_thinking) {
|
||||
data.prompt += "</think>";
|
||||
} else {
|
||||
data.thinking_forced_open = true;
|
||||
}
|
||||
}
|
||||
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
tool_rules.push_back(builder.add_rule(name + "-call",
|
||||
"( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
|
||||
"\" " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"<|tool▁call▁end|>\""));
|
||||
});
|
||||
// Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
|
||||
// so we accept common variants (then it's all constrained)
|
||||
builder.add_rule("root",
|
||||
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
|
||||
"( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
|
||||
"(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
|
||||
"\"<|tool▁calls▁end|>\""
|
||||
" space");
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
|
||||
// If thinking_forced_open, then we capture the </think> tag in the grammar,
|
||||
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
|
||||
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
|
||||
"(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
|
||||
});
|
||||
data.preserved_tokens = {
|
||||
"<think>",
|
||||
"</think>",
|
||||
"<|tool▁calls▁begin|>",
|
||||
"<|tool▁call▁begin|>",
|
||||
"<|tool▁sep|>",
|
||||
"<|tool▁call▁end|>",
|
||||
"<|tool▁calls▁end|>",
|
||||
};
|
||||
});
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
|
||||
builder.try_parse_reasoning("<think>", "</think>");
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
@@ -1409,6 +1479,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
|
||||
tool_calls_end);
|
||||
}
|
||||
|
||||
static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
|
||||
static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
|
||||
|
||||
static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
|
||||
static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
|
||||
static const common_regex tool_calls_end("<|tool▁calls▁end|>");
|
||||
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
LOG_DBG("%s: not parse_tool_calls\n", __func__);
|
||||
builder.add_content(builder.consume_rest());
|
||||
return;
|
||||
}
|
||||
|
||||
LOG_DBG("%s: parse_tool_calls\n", __func__);
|
||||
|
||||
parse_json_tool_calls(
|
||||
builder,
|
||||
/* block_open= */ tool_calls_begin,
|
||||
/* function_regex_start_only= */ std::nullopt,
|
||||
function_regex,
|
||||
close_regex,
|
||||
tool_calls_end);
|
||||
}
|
||||
|
||||
static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
|
||||
// DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
|
||||
// First try to parse using the standard reasoning parsing method
|
||||
LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
|
||||
|
||||
auto start_pos = builder.pos();
|
||||
auto found_end_think = builder.try_find_literal("</think>");
|
||||
builder.move_to(start_pos);
|
||||
|
||||
if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
|
||||
LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
|
||||
common_chat_parse_deepseek_v3_1_content(builder);
|
||||
} else if (builder.try_parse_reasoning("<think>", "</think>")) {
|
||||
// If reasoning was parsed successfully, the remaining content is regular content
|
||||
LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
|
||||
// </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
|
||||
common_chat_parse_deepseek_v3_1_content(builder);
|
||||
} else {
|
||||
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
|
||||
LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
|
||||
common_chat_parse_deepseek_v3_1_content(builder);
|
||||
return;
|
||||
}
|
||||
// If no reasoning tags found, check if we should treat everything as reasoning
|
||||
if (builder.syntax().thinking_forced_open) {
|
||||
// If thinking is forced open but no tags found, treat everything as reasoning
|
||||
LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
|
||||
builder.add_reasoning_content(builder.consume_rest());
|
||||
} else {
|
||||
LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
|
||||
// <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
|
||||
common_chat_parse_deepseek_v3_1_content(builder);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
auto prompt = apply(tmpl, inputs);
|
||||
@@ -2365,6 +2495,12 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
}
|
||||
}
|
||||
|
||||
// DeepSeek V3.1: detect based on specific patterns in the template
|
||||
if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
|
||||
params.json_schema.is_null()) {
|
||||
return common_chat_params_init_deepseek_v3_1(tmpl, params);
|
||||
}
|
||||
|
||||
// DeepSeek R1: use handler in all cases except json schema (thinking / tools).
|
||||
if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
|
||||
return common_chat_params_init_deepseek_r1(tmpl, params);
|
||||
@@ -2537,6 +2673,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
||||
case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
|
||||
common_chat_parse_deepseek_r1(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
|
||||
common_chat_parse_deepseek_v3_1(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
|
||||
common_chat_parse_functionary_v3_2(builder);
|
||||
break;
|
||||
|
||||
@@ -107,6 +107,7 @@ enum common_chat_format {
|
||||
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
||||
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
||||
COMMON_CHAT_FORMAT_GRANITE,
|
||||
|
||||
@@ -843,9 +843,10 @@ public:
|
||||
_build_object_rule(
|
||||
properties, required, name,
|
||||
schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
|
||||
} else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
|
||||
} else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
|
||||
std::unordered_set<std::string> required;
|
||||
std::vector<std::pair<std::string, json>> properties;
|
||||
std::map<std::string, size_t> enum_values;
|
||||
std::string hybrid_name = name;
|
||||
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
|
||||
if (comp_schema.contains("$ref")) {
|
||||
@@ -857,6 +858,14 @@ public:
|
||||
required.insert(prop.key());
|
||||
}
|
||||
}
|
||||
} else if (comp_schema.contains("enum")) {
|
||||
for (const auto & v : comp_schema["enum"]) {
|
||||
const auto rule = _generate_constant_rule(v);
|
||||
if (enum_values.find(rule) == enum_values.end()) {
|
||||
enum_values[rule] = 0;
|
||||
}
|
||||
enum_values[rule] += 1;
|
||||
}
|
||||
} else {
|
||||
// todo warning
|
||||
}
|
||||
@@ -870,6 +879,17 @@ public:
|
||||
add_component(t, true);
|
||||
}
|
||||
}
|
||||
if (!enum_values.empty()) {
|
||||
std::vector<std::string> enum_intersection;
|
||||
for (const auto & p : enum_values) {
|
||||
if (p.second == schema["allOf"].size()) {
|
||||
enum_intersection.push_back(p.first);
|
||||
}
|
||||
}
|
||||
if (!enum_intersection.empty()) {
|
||||
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
|
||||
}
|
||||
}
|
||||
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
|
||||
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
|
||||
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
|
||||
|
||||
@@ -5128,6 +5128,20 @@ class EmbeddingGemma(Gemma3Model):
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# Override the sliding window size as it gets adjusted by the Gemma3TextConfig
|
||||
# constructor. We want to use the value from the original model's config.json.
|
||||
# ref: https://github.com/huggingface/transformers/pull/40700
|
||||
with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
orig_sliding_window = config.get("sliding_window")
|
||||
if orig_sliding_window is None:
|
||||
raise ValueError("sliding_window not found in model config - this is required for the model")
|
||||
|
||||
logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
|
||||
f"instead of {self.hparams['sliding_window']}")
|
||||
self.gguf_writer.add_sliding_window(orig_sliding_window)
|
||||
|
||||
self._try_set_pooling_type()
|
||||
|
||||
|
||||
|
||||
@@ -28,6 +28,15 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||
return str;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
float v;
|
||||
@@ -43,6 +52,8 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t *
|
||||
v = (float) *(int16_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I8) {
|
||||
v = (float) *(int8_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_BF16) {
|
||||
v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
@@ -586,9 +586,10 @@ class SchemaConverter:
|
||||
properties = list(schema.get('properties', {}).items())
|
||||
return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))
|
||||
|
||||
elif schema_type in (None, 'object') and 'allOf' in schema:
|
||||
elif schema_type in (None, 'object', 'string') and 'allOf' in schema:
|
||||
required = set()
|
||||
properties = []
|
||||
enum_sets = []
|
||||
hybrid_name = name
|
||||
def add_component(comp_schema, is_required):
|
||||
if (ref := comp_schema.get('$ref')) is not None:
|
||||
@@ -600,6 +601,9 @@ class SchemaConverter:
|
||||
if is_required:
|
||||
required.add(prop_name)
|
||||
|
||||
if 'enum' in comp_schema:
|
||||
enum_sets.append(set(comp_schema['enum']))
|
||||
|
||||
for t in schema['allOf']:
|
||||
if 'anyOf' in t:
|
||||
for tt in t['anyOf']:
|
||||
@@ -607,6 +611,15 @@ class SchemaConverter:
|
||||
else:
|
||||
add_component(t, is_required=True)
|
||||
|
||||
if enum_sets:
|
||||
enum_intersection = enum_sets[0]
|
||||
for s in enum_sets[1:]:
|
||||
enum_intersection &= s
|
||||
|
||||
if enum_intersection:
|
||||
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
|
||||
return self._add_rule(rule_name, rule)
|
||||
|
||||
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
|
||||
|
||||
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch~=2.6.0
|
||||
torchvision~=0.21.0
|
||||
transformers~=4.55.0
|
||||
huggingface-hub~=0.34.0
|
||||
torch
|
||||
torchvision
|
||||
transformers
|
||||
huggingface-hub
|
||||
accelerate
|
||||
|
||||
@@ -9,15 +9,134 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
### If you want to dump RoPE activations, apply this monkey patch to the model
|
||||
### class from Transformers that you are running (replace apertus.modeling_apertus
|
||||
### with the proper package and class for your model
|
||||
### === START ROPE DEBUG ===
|
||||
# from transformers.models.apertus.modeling_apertus import apply_rotary_pos_emb
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||
# orig_rope = apply_rotary_pos_emb
|
||||
# torch.set_printoptions(threshold=float('inf'))
|
||||
# torch.set_printoptions(precision=6, sci_mode=False)
|
||||
|
||||
# def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
||||
# # log inputs
|
||||
# summarize(q, "RoPE.q_in")
|
||||
# summarize(k, "RoPE.k_in")
|
||||
|
||||
# # call original
|
||||
# q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)
|
||||
|
||||
# # log outputs
|
||||
# summarize(q_out, "RoPE.q_out")
|
||||
# summarize(k_out, "RoPE.k_out")
|
||||
|
||||
# return q_out, k_out
|
||||
|
||||
# # Patch it
|
||||
# import transformers.models.apertus.modeling_apertus as apertus_mod # noqa: E402
|
||||
# apertus_mod.apply_rotary_pos_emb = debug_rope
|
||||
### == END ROPE DEBUG ===
|
||||
|
||||
|
||||
def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
|
||||
"""
|
||||
Print a tensor in llama.cpp debug style.
|
||||
|
||||
Supports:
|
||||
- 2D tensors (seq, hidden)
|
||||
- 3D tensors (batch, seq, hidden)
|
||||
- 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
|
||||
|
||||
Shows first and last max_vals of each vector per sequence position.
|
||||
"""
|
||||
t = tensor.detach().to(torch.float32).cpu()
|
||||
|
||||
# Determine dimensions
|
||||
if t.ndim == 3:
|
||||
_, s, _ = t.shape
|
||||
elif t.ndim == 2:
|
||||
_, s = 1, t.shape[0]
|
||||
t = t.unsqueeze(0)
|
||||
elif t.ndim == 4:
|
||||
_, s, _, _ = t.shape
|
||||
else:
|
||||
print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
|
||||
return
|
||||
|
||||
ten_shape = t.shape
|
||||
|
||||
print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}")
|
||||
print(" [")
|
||||
print(" [")
|
||||
|
||||
# Determine indices for first and last sequences
|
||||
first_indices = list(range(min(s, max_seq)))
|
||||
last_indices = list(range(max(0, s - max_seq), s))
|
||||
|
||||
# Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
|
||||
has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)
|
||||
|
||||
# Combine indices
|
||||
if has_overlap:
|
||||
# If there's overlap, just use the combined unique indices
|
||||
indices = sorted(list(set(first_indices + last_indices)))
|
||||
separator_index = None
|
||||
else:
|
||||
# If no overlap, we'll add a separator between first and last sequences
|
||||
indices = first_indices + last_indices
|
||||
separator_index = len(first_indices)
|
||||
|
||||
for i, si in enumerate(indices):
|
||||
# Add separator if needed
|
||||
if separator_index is not None and i == separator_index:
|
||||
print(" ...")
|
||||
|
||||
# Extract appropriate slice
|
||||
vec = t[0, si]
|
||||
if vec.ndim == 2: # 4D case: flatten heads × dim_per_head
|
||||
flat = vec.flatten().tolist()
|
||||
else: # 2D or 3D case
|
||||
flat = vec.tolist()
|
||||
|
||||
# First and last slices
|
||||
first = flat[:max_vals]
|
||||
last = flat[-max_vals:] if len(flat) >= max_vals else flat
|
||||
first_str = ", ".join(f"{v:12.4f}" for v in first)
|
||||
last_str = ", ".join(f"{v:12.4f}" for v in last)
|
||||
|
||||
print(f" [{first_str}, ..., {last_str}]")
|
||||
|
||||
print(" ],")
|
||||
print(" ]")
|
||||
print(f" sum = {t.sum().item():.6f}\n")
|
||||
|
||||
|
||||
def debug_hook(name):
|
||||
def fn(_m, input, output):
|
||||
if isinstance(input, torch.Tensor):
|
||||
summarize(input, name + "_in")
|
||||
elif isinstance(input, (tuple, list)) and isinstance(input[0], torch.Tensor):
|
||||
summarize(input[0], name + "_in")
|
||||
if isinstance(output, torch.Tensor):
|
||||
summarize(output, name + "_out")
|
||||
elif isinstance(output, (tuple, list)) and isinstance(output[0], torch.Tensor):
|
||||
summarize(output[0], name + "_out")
|
||||
|
||||
return fn
|
||||
|
||||
|
||||
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Process model with specified path")
|
||||
parser.add_argument("--model-path", "-m", help="Path to the model")
|
||||
args = parser.parse_args()
|
||||
|
||||
model_path = os.environ.get('MODEL_PATH', args.model_path)
|
||||
model_path = os.environ.get("MODEL_PATH", args.model_path)
|
||||
if model_path is None:
|
||||
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
|
||||
parser.error(
|
||||
"Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
|
||||
)
|
||||
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
|
||||
@@ -34,18 +153,30 @@ config = AutoConfig.from_pretrained(model_path)
|
||||
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
unreleased_module_path = (
|
||||
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
)
|
||||
class_name = f"{unreleased_model_name}ForCausalLM"
|
||||
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
|
||||
model_class = getattr(
|
||||
importlib.import_module(unreleased_module_path), class_name
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
model_path
|
||||
) # Note: from_pretrained, not fromPretrained
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path, device_map="auto", offload_folder="offload"
|
||||
)
|
||||
|
||||
for name, module in model.named_modules():
|
||||
if len(list(module.children())) == 0: # only leaf modules
|
||||
module.register_forward_hook(debug_hook(name))
|
||||
|
||||
model_name = os.path.basename(model_path)
|
||||
# Printing the Model class to allow for easier debugging. This can be useful
|
||||
|
||||
@@ -134,6 +134,7 @@ extern "C" {
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
|
||||
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
|
||||
|
||||
+6
-1
@@ -1404,6 +1404,7 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// note: casting from f32 to i32 will discard the fractional part
|
||||
GGML_API struct ggml_tensor * ggml_cast(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -1528,7 +1529,11 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// supports 3D: a->ne[2] == b->ne[1]
|
||||
// supports 4D a:
|
||||
// a [n_embd, ne1, ne2, ne3]
|
||||
// b I32 [n_rows, ne2, ne3, 1]
|
||||
//
|
||||
// return [n_embd, n_rows, ne2, ne3]
|
||||
GGML_API struct ggml_tensor * ggml_get_rows(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // data
|
||||
|
||||
@@ -114,6 +114,9 @@ extern "C" {
|
||||
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
||||
// wait for an event on on a different stream
|
||||
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
|
||||
// (optional) sort/optimize the nodes in the graph
|
||||
void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
};
|
||||
|
||||
struct ggml_backend {
|
||||
|
||||
@@ -463,6 +463,13 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
|
||||
backend->iface.event_wait(backend, event);
|
||||
}
|
||||
|
||||
static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
GGML_ASSERT(backend);
|
||||
if (backend->iface.optimize_graph != NULL) {
|
||||
backend->iface.optimize_graph(backend, cgraph);
|
||||
}
|
||||
}
|
||||
|
||||
// Backend device
|
||||
|
||||
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
|
||||
@@ -1298,6 +1305,10 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
||||
struct ggml_backend_sched_split * split = &sched->splits[i];
|
||||
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
||||
|
||||
// Optimize this split of the graph. This needs to happen before we make graph_copy,
|
||||
// so they are in sync.
|
||||
ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
|
||||
|
||||
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
||||
for (int j = 0; j < split->n_inputs; j++) {
|
||||
assert(graph_copy->size > (graph_copy->n_nodes + 1));
|
||||
|
||||
@@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
|
||||
/* .graph_compute = */ ggml_backend_blas_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_blas_guid(void) {
|
||||
|
||||
@@ -2092,16 +2092,17 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE,
|
||||
cann_ctx_src->stream()));
|
||||
|
||||
// record event on src stream after the copy
|
||||
if (!cann_ctx_src->copy_event) {
|
||||
ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
|
||||
}
|
||||
ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
|
||||
// TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
|
||||
// if (!cann_ctx_src->copy_event) {
|
||||
// ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
|
||||
// }
|
||||
// ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
|
||||
|
||||
// wait on dst stream for the copy to complete
|
||||
ggml_cann_set_device(cann_ctx_dst->device);
|
||||
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
|
||||
// // wait on dst stream for the copy to complete
|
||||
// ggml_cann_set_device(cann_ctx_dst->device);
|
||||
// ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
|
||||
} else {
|
||||
// src and dst are on the same backend
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
||||
@@ -2689,6 +2690,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
|
||||
/* .graph_compute = */ ggml_backend_cann_graph_compute,
|
||||
/* .event_record = */ ggml_backend_cann_event_record,
|
||||
/* .event_wait = */ ggml_backend_cann_event_wait,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
#if defined(__VXE__) || defined(__VXE2__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
__vector float srcv [8];
|
||||
__vector float asrcv[8];
|
||||
__vector float amaxv[8];
|
||||
float32x4_t srcv [8];
|
||||
float32x4_t asrcv[8];
|
||||
float32x4_t amaxv[8];
|
||||
|
||||
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
|
||||
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
|
||||
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
||||
|
||||
for (int j = 0; j < 8; j++) {
|
||||
const __vector float v = vec_mul(srcv[j], vec_splats(id));
|
||||
const __vector int32_t vi = vec_signed(v);
|
||||
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
|
||||
const int32x4_t vi = vec_signed(v);
|
||||
|
||||
y[i].qs[4*j + 0] = vec_extract(vi, 0);
|
||||
y[i].qs[4*j + 1] = vec_extract(vi, 1);
|
||||
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
#if defined(__VXE__) || defined(__VXE2__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
__vector float srcv [8];
|
||||
__vector float asrcv[8];
|
||||
__vector float amaxv[8];
|
||||
float32x4_t srcv [8];
|
||||
float32x4_t asrcv[8];
|
||||
float32x4_t amaxv[8];
|
||||
|
||||
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
|
||||
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
|
||||
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
||||
|
||||
__vector int32_t acc = vec_splats(0);
|
||||
int32x4_t acc = vec_splats(0);
|
||||
|
||||
for (int j = 0; j < 8; j++) {
|
||||
const __vector float v = vec_mul(srcv[j], vec_splats(id));
|
||||
const __vector int32_t vi = vec_signed(v);
|
||||
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
|
||||
const int32x4_t vi = vec_signed(v);
|
||||
|
||||
y[i].qs[4*j + 0] = vec_extract(vi, 0);
|
||||
y[i].qs[4*j + 1] = vec_extract(vi, 1);
|
||||
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
float sumf = 0;
|
||||
|
||||
#if defined(__VXE__) || defined(__VXE2__)
|
||||
__vector float acc = vec_splats(0.0f);
|
||||
float32x4_t acc = vec_splats(0.0f);
|
||||
|
||||
const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
|
||||
const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
|
||||
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
|
||||
const int8x16_t v_s = vec_splats( (const int8_t)0x08);
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
|
||||
const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
|
||||
const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
|
||||
const uint8x16_t v_x = vec_xl(0, x[ib].qs);
|
||||
const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
|
||||
const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
|
||||
|
||||
const __vector int8_t v_xls = vec_sub(v_xl, v_s);
|
||||
const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
|
||||
const int8x16_t v_xls = vec_sub(v_xl, v_s);
|
||||
const int8x16_t v_xhs = vec_sub(v_xh, v_s);
|
||||
|
||||
const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
|
||||
const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
|
||||
const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
|
||||
const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
|
||||
|
||||
const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
|
||||
const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
|
||||
const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
|
||||
const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
|
||||
const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
|
||||
const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
|
||||
const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
|
||||
const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
|
||||
|
||||
__vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
|
||||
int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
|
||||
|
||||
const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
|
||||
const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
|
||||
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
|
||||
acc = vec_madd(v_xy, v_d, acc);
|
||||
}
|
||||
|
||||
sumf = acc[0] + acc[1] + acc[2] + acc[3];
|
||||
|
||||
sumf = vec_hsum_f32x4(acc);
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
acc = vec_madd(v_xy, v_d, acc);
|
||||
}
|
||||
|
||||
sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
|
||||
|
||||
sumf = vec_hsum_f32x4(acc) + summs;
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
|
||||
}
|
||||
|
||||
sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
|
||||
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
|
||||
|
||||
#pragma GCC unroll 4
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
|
||||
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
|
||||
|
||||
sumf += vec_hsum(v_acc);
|
||||
sumf += vec_hsum_f32x4(v_acc);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
|
||||
}
|
||||
|
||||
sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
|
||||
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
|
||||
|
||||
#pragma GCC unroll 4
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
|
||||
const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
|
||||
|
||||
sumf += vec_hsum(v_acc) + summs;
|
||||
sumf += vec_hsum_f32x4(v_acc) + summs;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
float sumf = 0;
|
||||
|
||||
#if defined(__VXE__) || defined(__VXE2__)
|
||||
__vector float acc = vec_splats(0.0f);
|
||||
float32x4_t acc = vec_splats(0.0f);
|
||||
|
||||
#pragma GCC unroll 8
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
acc = vec_madd(v_xy, v_d, acc);
|
||||
}
|
||||
|
||||
sumf = acc[0] + acc[1] + acc[2] + acc[3];
|
||||
sumf = vec_hsum_f32x4(acc);
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
|
||||
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
|
||||
|
||||
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
|
||||
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
|
||||
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
|
||||
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
|
||||
isum += vec_hsum_i32x4(isum0) * scale[0];
|
||||
isum += vec_hsum_i32x4(isum1) * scale[1];
|
||||
isum += vec_hsum_i32x4(isum2) * scale[2];
|
||||
isum += vec_hsum_i32x4(isum3) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
|
||||
|
||||
const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
|
||||
sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
|
||||
sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
|
||||
|
||||
v_y[0] = vec_xl(0 , y0);
|
||||
v_y[1] = vec_xl(16, y0);
|
||||
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
|
||||
|
||||
const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
|
||||
sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
|
||||
sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
|
||||
}
|
||||
|
||||
sumf += d * (sumi1 + sumi2);
|
||||
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
|
||||
const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
|
||||
const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
|
||||
const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
|
||||
const int32_t mins = vec_hsum_i32x4(v_mins);
|
||||
|
||||
const uint8_t * scales = (const uint8_t *)utmp;
|
||||
const uint8_t * GGML_RESTRICT x0l = x[i].qs;
|
||||
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
|
||||
int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
|
||||
|
||||
sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
|
||||
sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
|
||||
sumi += vec_hsum_i32x4(sumi0) * *scales++;
|
||||
sumi += vec_hsum_i32x4(sumi1) * *scales++;
|
||||
}
|
||||
|
||||
sumf += d * sumi - dmin * mins;
|
||||
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
|
||||
const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
|
||||
|
||||
const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
|
||||
const int32_t mins = vec_hsum_i32x4(v_mins);
|
||||
|
||||
int32_t isum = 0;
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
|
||||
int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
|
||||
|
||||
isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
|
||||
(summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
|
||||
(summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
|
||||
(summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
|
||||
isum += vec_hsum_i32x4(summs0) * scale[0] +
|
||||
vec_hsum_i32x4(summs1) * scale[1] +
|
||||
vec_hsum_i32x4(summs2) * scale[2] +
|
||||
vec_hsum_i32x4(summs3) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
|
||||
summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
|
||||
|
||||
isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
|
||||
(summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
|
||||
(summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
|
||||
(summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
|
||||
isum += vec_hsum_i32x4(summs0) * scale[0] +
|
||||
vec_hsum_i32x4(summs1) * scale[1] +
|
||||
vec_hsum_i32x4(summs2) * scale[2] +
|
||||
vec_hsum_i32x4(summs3) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
}
|
||||
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
|
||||
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
|
||||
|
||||
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
|
||||
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
|
||||
h >>= 4;
|
||||
|
||||
sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
|
||||
sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
|
||||
sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
|
||||
sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
|
||||
}
|
||||
|
||||
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|
||||
|
||||
@@ -483,11 +483,16 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
|
||||
/**
|
||||
* @see https://github.com/ggml-org/llama.cpp/pull/14037
|
||||
*/
|
||||
inline static float vec_hsum(float32x4_t v) {
|
||||
inline static float vec_hsum_f32x4(float32x4_t v) {
|
||||
float32x4_t v_temp = v + vec_reve(v);
|
||||
return v_temp[0] + v_temp[1];
|
||||
}
|
||||
|
||||
inline static int32_t vec_hsum_i32x4(int32x4_t v) {
|
||||
int32x4_t v_temp = v + vec_reve(v);
|
||||
return v_temp[0] + v_temp[1];
|
||||
}
|
||||
|
||||
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
|
||||
return acc + (vec_unpackh(p) + vec_unpackl(p));
|
||||
|
||||
@@ -373,6 +373,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_I32] = {
|
||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
|
||||
},
|
||||
};
|
||||
|
||||
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
|
||||
@@ -2696,7 +2699,10 @@ struct ggml_cplan ggml_graph_plan(
|
||||
if (ggml_is_quantized(node->type) ||
|
||||
// F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
|
||||
(node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
|
||||
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
|
||||
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
|
||||
// conversion between F32 and I32
|
||||
(node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
|
||||
(node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
|
||||
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
||||
}
|
||||
} break;
|
||||
@@ -3258,6 +3264,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
for (; i < n; ++i) {
|
||||
y[i] = x[i];
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
|
||||
int64_t i = 0;
|
||||
#if defined(__AVX2__)
|
||||
|
||||
@@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
||||
@@ -348,8 +349,10 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
long pages = sysconf(_SC_PHYS_PAGES);
|
||||
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||
*total = pages * page_size;
|
||||
|
||||
// "free" system memory is ill-defined, for practical purposes assume that all of it is free:
|
||||
*free = *total;
|
||||
#endif
|
||||
#endif // _WIN32
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
@@ -154,7 +154,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
return compute_forward_q4_0(params, dst);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
return compute_forward_kv_cache(params, dst);
|
||||
return compute_forward_fp16(params, dst);
|
||||
}
|
||||
} else if (dst->op == GGML_OP_GET_ROWS) {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
@@ -164,7 +164,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool compute_forward_kv_cache(ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
@@ -534,13 +534,8 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
}
|
||||
else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
|
||||
op->src[0]->op == GGML_OP_VIEW &&
|
||||
(op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_SOFT_MAX) &&
|
||||
op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[0] != 2) ||
|
||||
(op->src[1]->nb[0] != 4) ||
|
||||
(op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
else if (ggml_kleidiai_select_kernels(ctx.features, op) && op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
(op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -776,6 +776,24 @@ static void ggml_compute_forward_dup_f32(
|
||||
id += ne00 * (ne01 - ir1);
|
||||
}
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_I32) {
|
||||
size_t id = 0;
|
||||
int32_t * dst_ptr = (int32_t *) dst->data;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
for (int i02 = 0; i02 < ne02; i02++) {
|
||||
id += ne00 * ir0;
|
||||
for (int i01 = ir0; i01 < ir1; i01++) {
|
||||
for (int i00 = 0; i00 < ne00; i00++) {
|
||||
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
dst_ptr[id] = *src0_ptr;
|
||||
id++;
|
||||
}
|
||||
}
|
||||
id += ne00 * (ne01 - ir1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("fatal error"); // TODO: implement
|
||||
}
|
||||
@@ -947,6 +965,144 @@ static void ggml_compute_forward_dup_f32(
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_I32) {
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
i10 += ne00 * ir0;
|
||||
while (i10 >= ne0) {
|
||||
i10 -= ne0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
||||
|
||||
*(int32_t *) dst_ptr = *(const float *) src0_ptr;
|
||||
|
||||
if (++i10 == ne0) {
|
||||
i10 = 0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
i10 += ne00 * (ne01 - ir1);
|
||||
while (i10 >= ne0) {
|
||||
i10 -= ne0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("fatal error"); // TODO: implement
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_dup_i32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
const int ith = params->ith; // thread index
|
||||
const int nth = params->nth; // number of threads
|
||||
|
||||
// parallelize by rows
|
||||
const int nr = ne01;
|
||||
// number of rows per thread
|
||||
const int dr = (nr + nth - 1) / nth;
|
||||
// row range for this thread
|
||||
const int ir0 = dr * ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
// dst counters
|
||||
|
||||
int64_t i10 = 0;
|
||||
int64_t i11 = 0;
|
||||
int64_t i12 = 0;
|
||||
int64_t i13 = 0;
|
||||
|
||||
// TODO: not optimal, but works
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
i10 += ne00 * ir0;
|
||||
while (i10 >= ne0) {
|
||||
i10 -= ne0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
||||
|
||||
*(float *) dst_ptr = *(const int32_t *) src0_ptr;
|
||||
|
||||
if (++i10 == ne0) {
|
||||
i10 = 0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
i10 += ne00 * (ne01 - ir1);
|
||||
while (i10 >= ne0) {
|
||||
i10 -= ne0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("fatal error"); // TODO: implement
|
||||
}
|
||||
@@ -1177,6 +1333,10 @@ void ggml_compute_forward_dup(
|
||||
{
|
||||
ggml_compute_forward_dup_f32(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_I32:
|
||||
{
|
||||
ggml_compute_forward_dup_i32(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
|
||||
|
||||
@@ -44,6 +44,8 @@ if (CUDAToolkit_FOUND)
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/mmq*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/mmf*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
|
||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||
file(GLOB SRCS "template-instances/fattn-vec*.cu")
|
||||
|
||||
@@ -38,6 +38,8 @@ template<typename dst_t, typename src_t>
|
||||
return __float2bfloat16(float(x));
|
||||
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
|
||||
return __bfloat162float(x);
|
||||
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
|
||||
return int32_t(x);
|
||||
} else {
|
||||
return float(x);
|
||||
}
|
||||
|
||||
@@ -374,6 +374,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
|
||||
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
|
||||
@@ -1,371 +0,0 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile-f16.cuh"
|
||||
|
||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_tile_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
const half slopeh = __float2half(slopef);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
|
||||
__shared__ half KQ[ncols*FATTN_KQ_STRIDE_TILE_F16];
|
||||
half2 * KQ2 = (half2 *) KQ;
|
||||
|
||||
__shared__ half2 KV_tmp[FATTN_KQ_STRIDE_TILE_F16][D/2 + 1]; // Pad D to avoid memory bank conflicts.
|
||||
|
||||
half kqmax[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -HALF_MAX_HALF;
|
||||
}
|
||||
half2 kqsum[ncols/nwarps] = {{0.0f, 0.0f}};
|
||||
|
||||
half2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
|
||||
|
||||
// Convert Q to half2 and store in registers:
|
||||
__shared__ half2 Q_h2[ncols][D/2];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||
Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
half kqmax_new[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
KV_tmp[i_KQ][k_KQ] = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
half2 sum2[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE][ncols/nwarps] = {{{0.0f, 0.0f}}};
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ = 0; k_KQ < D/2; ++k_KQ) {
|
||||
half2 K_k[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE];
|
||||
half2 Q_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
Q_k[j_KQ_0/nwarps] = Q_h2[j_KQ][k_KQ];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE]*Q_k[j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
half sum;
|
||||
if (use_logit_softcap) {
|
||||
const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
sum = logit_softcap * tanhf(tmp.x + tmp.y);
|
||||
} else {
|
||||
sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
}
|
||||
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
|
||||
|
||||
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F16 + i_KQ] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
|
||||
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]));
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F16/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const half2 diff = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] - __half2half2(kqmax[j0/nwarps]);
|
||||
const half2 val = h2exp(diff);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + val;
|
||||
KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] = val;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += nwarps) {
|
||||
const int k = k0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
KV_tmp[k][i] = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += 2) {
|
||||
half2 V_k[(D/2)/WARP_SIZE][2];
|
||||
half2 KQ_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
V_k[i0/WARP_SIZE][0] = KV_tmp[k0 + 0][i];
|
||||
V_k[i0/WARP_SIZE][1] = KV_tmp[k0 + 1][i];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
KQ_k[j0/nwarps] = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + k0/2];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][0]* __low2half2(KQ_k[j0/nwarps]);
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][1]*__high2half2(KQ_k[j0/nwarps]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
//Attention sink: adjust running max and sum once per head
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const half sink = __float2half(sinksf[head]);
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
half kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new_j));
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
|
||||
const half val = hexp(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps].x = __hadd(__low2half(kqsum[j0/nwarps]), val);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
|
||||
kqsum_j = warp_reduce_sum((float)kqsum_j);
|
||||
|
||||
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) {
|
||||
const int i0 = i00 + threadIdx.x;
|
||||
|
||||
half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE];
|
||||
if (gridDim.y == 1) {
|
||||
dst_val /= __half2half2(kqsum_j);
|
||||
}
|
||||
dst2[j_dst_unrolled*(D/2) + i0] = __half22float2(dst_val);
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && threadIdx.x == 0) {
|
||||
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int cols_per_block, bool use_logit_softcap>
|
||||
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (Q->ne[1] <= 16) {
|
||||
constexpr int cols_per_block = 16;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 32;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -1,379 +0,0 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile-f32.cuh"
|
||||
|
||||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_tile_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
|
||||
__shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32];
|
||||
|
||||
__shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts.
|
||||
float2 * KV_tmp2 = (float2 *) KV_tmp;
|
||||
|
||||
float kqmax[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
|
||||
}
|
||||
float kqsum[ncols/nwarps] = {0.0f};
|
||||
|
||||
float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
|
||||
|
||||
// Convert Q to half2 and store in registers:
|
||||
__shared__ float Q_f[ncols][D];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
|
||||
float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
|
||||
Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
|
||||
Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
float kqmax_new[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
|
||||
const half2 tmp = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
|
||||
KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp);
|
||||
KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}};
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ = 0; k_KQ < D; ++k_KQ) {
|
||||
float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE];
|
||||
float Q_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
}
|
||||
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
|
||||
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
float kqsum_add = 0.0f;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps];
|
||||
const float val = expf(diff);
|
||||
kqsum_add += val;
|
||||
KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val;
|
||||
}
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) {
|
||||
const int k = k0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
KV_tmp2[k*(D/2) + i].x = __low2float(tmp);
|
||||
KV_tmp2[k*(D/2) + i].y = __high2float(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) {
|
||||
float2 V_k[(D/2)/WARP_SIZE];
|
||||
float KQ_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps];
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
//Attention sink: adjust running max and sum once per head
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const float sink = sinksf[head];
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
|
||||
const float val = expf(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps] += val;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
||||
kqsum_j = warp_reduce_sum(kqsum_j);
|
||||
|
||||
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) {
|
||||
const int i0 = i00 + threadIdx.x;
|
||||
|
||||
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE];
|
||||
if (gridDim.y == 1) {
|
||||
dst_val.x /= kqsum_j;
|
||||
dst_val.y /= kqsum_j;
|
||||
}
|
||||
dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && threadIdx.x == 0) {
|
||||
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
template <int cols_per_block, bool use_logit_softcap>
|
||||
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (Q->ne[1] <= 16) {
|
||||
constexpr int cols_per_block = 16;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 32;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,596 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile.cuh"
|
||||
|
||||
#define FATTN_TILE_NTHREADS 256
|
||||
|
||||
static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int cc, const int warp_size) {
|
||||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 32 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : warp_size;
|
||||
case 256:
|
||||
return 64;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (fast_fp16_available(cc)) {
|
||||
switch (D) {
|
||||
case 64:
|
||||
case 128:
|
||||
return 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
case 256:
|
||||
return 32;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) {
|
||||
#ifdef GGML_USE_HIP
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 32 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : warp_size;
|
||||
case 256:
|
||||
return 64;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
switch (D) {
|
||||
case 64:
|
||||
case 128:
|
||||
return 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
case 256:
|
||||
return 32;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
#endif // GGML_USE_HIP
|
||||
GGML_UNUSED_VARS(ncols, warp_size);
|
||||
}
|
||||
|
||||
static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols, int warp_size) {
|
||||
#ifdef GGML_USE_HIP
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 2*warp_size : 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 2*warp_size;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
case 256:
|
||||
return ncols <= 16 ? 64 : 128;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
return 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
#endif // GGML_USE_HIP
|
||||
GGML_UNUSED_VARS(ncols, warp_size);
|
||||
}
|
||||
|
||||
template<int D, int ncols, bool use_logit_softcap> // D == head size
|
||||
#ifdef GGML_USE_HIP
|
||||
__launch_bounds__(FATTN_TILE_NTHREADS, 1)
|
||||
#else
|
||||
__launch_bounds__(FATTN_TILE_NTHREADS, 2)
|
||||
#endif // GGML_USE_HIP
|
||||
static __global__ void flash_attn_tile(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int warp_size = 32;
|
||||
constexpr int nwarps = FATTN_TILE_NTHREADS / warp_size;
|
||||
constexpr int kq_stride = fattn_tile_get_kq_stride_device(D, ncols, warp_size);
|
||||
static_assert(kq_stride % warp_size == 0, "kq_stride not divisable by warp_size.");
|
||||
constexpr int kq_nbatch = fattn_tile_get_kq_nbatch_device(D, ncols, warp_size);
|
||||
static_assert(kq_nbatch % (2*warp_size) == 0, "bad kq_nbatch");
|
||||
|
||||
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
|
||||
__shared__ float KQ[ncols][kq_stride];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
__shared__ half2 Q_tmp[ncols][D/2];
|
||||
__shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + 1)]; // Padded to avoid memory bank conflicts.
|
||||
half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
#else
|
||||
__shared__ float Q_tmp[ncols][D];
|
||||
__shared__ float KV_tmp_f[kq_stride * (kq_nbatch + 1)]; // Padded to avoid memory bank conflicts.
|
||||
float2 * KV_tmp_f2 = (float2 *) KV_tmp_f;
|
||||
float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
|
||||
float kqmax[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
|
||||
}
|
||||
float kqsum[ncols/nwarps] = {0.0f};
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0 + threadIdx.x] : make_float2(0.0f, 0.0f);
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
Q_tmp[j][i0 + threadIdx.x] = make_half2(tmp.x * scale, tmp.y * scale);
|
||||
#else
|
||||
Q_tmp[j][2*i0 + threadIdx.x] = tmp.x * scale;
|
||||
Q_tmp[j][2*i0 + warp_size + threadIdx.x] = tmp.y * scale;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*kq_stride; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*kq_stride) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
float kqmax_new[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
}
|
||||
|
||||
float sum[kq_stride/warp_size][ncols/nwarps] = {{0.0f}};
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += kq_nbatch) {
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) {
|
||||
const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
KV_tmp_h2[i_KQ*(kq_nbatch/2 + 1) + k_KQ_1 + threadIdx.x] = tmp_h2;
|
||||
#else
|
||||
const float2 tmp_f2 = __half22float2(tmp_h2);
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + 1) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x;
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + 1) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; ++k_KQ_1) {
|
||||
half2 K_k[kq_stride/warp_size];
|
||||
half2 Q_k[ncols/nwarps];
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; ++k_KQ_1) {
|
||||
float K_k[kq_stride/warp_size];
|
||||
float Q_k[ncols/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
K_k[i_KQ_0/warp_size] = KV_tmp_h2[i_KQ*(kq_nbatch/2 + 1) + k_KQ_1];
|
||||
#else
|
||||
K_k[i_KQ_0/warp_size] = KV_tmp_f [i_KQ*(kq_nbatch + 1) + k_KQ_1];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
Q_k[j_KQ_0/nwarps] = Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1];
|
||||
#else
|
||||
Q_k[j_KQ_0/nwarps] = Q_tmp[j_KQ][k_KQ_0 + k_KQ_1];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const float2 tmp = __half22float2(K_k[i_KQ_0/warp_size] * Q_k[j_KQ_0/nwarps]);
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += tmp.x + tmp.y;
|
||||
#else
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += K_k[i_KQ_0/warp_size] * Q_k[j_KQ_0/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (k_KQ_0 + kq_nbatch < D) {
|
||||
__syncthreads(); // Sync not needed on last iteration.
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
|
||||
}
|
||||
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
|
||||
|
||||
KQ[j_KQ][i_KQ] = sum[i_KQ_0/warp_size][j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max<warp_size>(kqmax_new[j0/nwarps]);
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
float kqsum_add = 0.0f;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float diff = KQ[j][i] - kqmax[j0/nwarps];
|
||||
const float val = expf(diff);
|
||||
kqsum_add += val;
|
||||
KQ[j][i] = val;
|
||||
}
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D;
|
||||
static_assert(kq_stride % V_cols_per_iter == 0, "bad V_cols_per_iter");
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < kq_stride; k0 += V_cols_per_iter) {
|
||||
#pragma unroll
|
||||
for (int k1 = 0; k1 < V_cols_per_iter; k1 += nwarps) {
|
||||
const int k_tile = k1 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
KV_tmp_h2[k_tile*(D/2) + i] = tmp;
|
||||
#else
|
||||
KV_tmp_f2[k_tile*(D/2) + i] = __half22float2(tmp);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k1 = 0; k1 < V_cols_per_iter; ++k1) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half2 V_k[(D/2)/warp_size];
|
||||
half2 KQ_k[ncols/nwarps];
|
||||
#else
|
||||
float2 V_k[(D/2)/warp_size];
|
||||
float KQ_k[ncols/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
V_k[i0/warp_size] = KV_tmp_h2[k1*(D/2) + i];
|
||||
#else
|
||||
V_k[i0/warp_size] = KV_tmp_f2[k1*(D/2) + i];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const float tmp = KQ[j][k0 + k1];
|
||||
KQ_k[j0/nwarps] = make_half2(tmp, tmp);
|
||||
#else
|
||||
KQ_k[j0/nwarps] = KQ[j][k0 + k1];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
VKQ[j0/nwarps][i0/warp_size] += V_k[i0/warp_size] *KQ_k[j0/nwarps];
|
||||
#else
|
||||
VKQ[j0/nwarps][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0/nwarps];
|
||||
VKQ[j0/nwarps][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Attention sink: adjust running max and sum once per head
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const float sink = sinksf[head];
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max<warp_size>(kqmax_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
|
||||
const float val = expf(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps] += val;
|
||||
}
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
||||
kqsum_j = warp_reduce_sum<warp_size>(kqsum_j);
|
||||
|
||||
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D/2; i00 += warp_size) {
|
||||
const int i0 = i00 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
float2 dst_val = __half22float2(VKQ[j_VKQ_0/nwarps][i0/warp_size]);
|
||||
#else
|
||||
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/warp_size];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
if (gridDim.y == 1) {
|
||||
dst_val.x /= kqsum_j;
|
||||
dst_val.y /= kqsum_j;
|
||||
}
|
||||
dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && threadIdx.x == 0) {
|
||||
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
template <int D, bool use_logit_softcap>
|
||||
static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int warp_size = 32;
|
||||
const int nwarps = FATTN_TILE_NTHREADS / warp_size;
|
||||
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
|
||||
if (Q->ne[1] > 16) {
|
||||
constexpr int cols_per_block = 32;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
|
||||
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size);
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 16;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
|
||||
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size);
|
||||
}
|
||||
|
||||
template <bool use_logit_softcap>
|
||||
static void launch_fattn_tile_switch_head_size(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
launch_fattn_tile_switch_ncols< 64, use_logit_softcap>(ctx, dst);
|
||||
} break;
|
||||
case 128: {
|
||||
launch_fattn_tile_switch_ncols<128, use_logit_softcap>(ctx, dst);
|
||||
} break;
|
||||
case 256: {
|
||||
launch_fattn_tile_switch_ncols<256, use_logit_softcap>(ctx, dst);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("Unsupported head size");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_switch_head_size<use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_switch_head_size<use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -1,8 +1,7 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-mma-f16.cuh"
|
||||
#include "fattn-tile-f16.cuh"
|
||||
#include "fattn-tile-f32.cuh"
|
||||
#include "fattn-tile.cuh"
|
||||
#include "fattn-vec-f16.cuh"
|
||||
#include "fattn-vec-f32.cuh"
|
||||
#include "fattn-wmma-f16.cuh"
|
||||
@@ -271,8 +270,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
|
||||
// Best FlashAttention kernel for a specific GPU:
|
||||
enum best_fattn_kernel {
|
||||
BEST_FATTN_KERNEL_NONE = 0,
|
||||
BEST_FATTN_KERNEL_TILE_F32 = 200,
|
||||
BEST_FATTN_KERNEL_TILE_F16 = 210,
|
||||
BEST_FATTN_KERNEL_TILE = 200,
|
||||
BEST_FATTN_KERNEL_VEC_F32 = 100,
|
||||
BEST_FATTN_KERNEL_VEC_F16 = 110,
|
||||
BEST_FATTN_KERNEL_WMMA_F16 = 300,
|
||||
@@ -411,10 +409,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
|
||||
// If there is no suitable kernel for tensor cores or small batch sizes, use the generic kernel for large batch sizes:
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
return BEST_FATTN_KERNEL_TILE_F16;
|
||||
}
|
||||
return BEST_FATTN_KERNEL_TILE_F32;
|
||||
return BEST_FATTN_KERNEL_TILE;
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -422,11 +417,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) {
|
||||
case BEST_FATTN_KERNEL_NONE:
|
||||
GGML_ABORT("fatal error");
|
||||
case BEST_FATTN_KERNEL_TILE_F32:
|
||||
ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
|
||||
break;
|
||||
case BEST_FATTN_KERNEL_TILE_F16:
|
||||
ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
|
||||
case BEST_FATTN_KERNEL_TILE:
|
||||
ggml_cuda_flash_attn_ext_tile(ctx, dst);
|
||||
break;
|
||||
case BEST_FATTN_KERNEL_VEC_F32:
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
|
||||
@@ -2,39 +2,39 @@
|
||||
#include "dequantize.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
#define MAX_GRIDDIM_Y 65535
|
||||
|
||||
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void k_get_rows(
|
||||
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
|
||||
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
|
||||
/*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
|
||||
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
|
||||
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = blockIdx.z / ne12;
|
||||
const int i12 = blockIdx.z % ne12;
|
||||
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
|
||||
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = z / ne12; // TODO fastdiv
|
||||
const int i12 = z % ne12;
|
||||
|
||||
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
||||
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
||||
|
||||
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
||||
const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03;
|
||||
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
||||
const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03;
|
||||
|
||||
const int ib = i00/qk; // block index
|
||||
const int iqs = (i00%qk)/qr; // quant index
|
||||
const int iybs = i00 - i00%qk; // dst block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
const int ib = i00/qk; // block index
|
||||
const int iqs = (i00%qk)/qr; // quant index
|
||||
const int iybs = i00 - i00%qk; // dst block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
// dequantize
|
||||
float2 v;
|
||||
dequantize_kernel(src0_row, ib, iqs, v);
|
||||
// dequantize
|
||||
float2 v;
|
||||
dequantize_kernel(src0_row, ib, iqs, v);
|
||||
|
||||
dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);
|
||||
dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
|
||||
dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);
|
||||
dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,27 +42,29 @@ template<typename src0_t, typename dst_t>
|
||||
static __global__ void k_get_rows_float(
|
||||
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
|
||||
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
|
||||
/*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
|
||||
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
|
||||
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = blockIdx.z / ne12;
|
||||
const int i12 = blockIdx.z % ne12;
|
||||
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
|
||||
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = z / ne12; // TODO fastdiv
|
||||
const int i12 = z % ne12;
|
||||
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
||||
|
||||
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
||||
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
|
||||
|
||||
dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
|
||||
}
|
||||
|
||||
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
||||
|
||||
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
||||
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
|
||||
|
||||
dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,7 +100,7 @@ static void get_rows_cuda_q(
|
||||
cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
||||
const dim3 block_nums(ne10, MIN(block_num_y, MAX_GRIDDIM_Y), ne11*ne12);
|
||||
const dim3 block_nums(ne10, MIN(block_num_y, UINT16_MAX), MIN(ne11*ne12, UINT16_MAX));
|
||||
|
||||
// strides in elements
|
||||
// const size_t s0 = nb0 / sizeof(dst_t);
|
||||
@@ -116,7 +118,7 @@ static void get_rows_cuda_q(
|
||||
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10, ne11,*/ ne12, /*ne13,*/
|
||||
/*ne10,*/ ne11, ne12, /*ne13,*/
|
||||
/* s0,*/ s1, s2, s3,
|
||||
/* nb00,*/ nb01, nb02, nb03,
|
||||
s10, s11, s12/*, s13*/);
|
||||
@@ -131,7 +133,7 @@ static void get_rows_cuda_float(
|
||||
cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
|
||||
const dim3 block_nums(ne10, MIN(block_num_y, MAX_GRIDDIM_Y), ne11*ne12);
|
||||
const dim3 block_nums(ne10, MIN(block_num_y, UINT16_MAX), MIN(ne11*ne12, UINT16_MAX));
|
||||
|
||||
// strides in elements
|
||||
// const size_t s0 = nb0 / sizeof(dst_t);
|
||||
@@ -147,7 +149,7 @@ static void get_rows_cuda_float(
|
||||
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10, ne11,*/ ne12, /*ne13,*/
|
||||
/*ne10,*/ ne11, ne12, /*ne13,*/
|
||||
/* s0,*/ s1, s2, s3,
|
||||
/* nb00,*/ nb01, nb02, nb03,
|
||||
s10, s11, s12/*, s13*/);
|
||||
|
||||
@@ -2109,6 +2109,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2])) {
|
||||
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
@@ -3135,6 +3140,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cuda_guid() {
|
||||
@@ -3461,6 +3467,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
|
||||
return true;
|
||||
}
|
||||
@@ -3574,9 +3586,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_PAD:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#pragma once
|
||||
// This file contains primitives that expose the tensor core PTX instructions for CUDA code.
|
||||
// The primitives can be used in a similar way as the nvcuda::wmma interface but with a well-defined memory layout.
|
||||
// The documentation for the PTX instructions can be found under:
|
||||
|
||||
+34
-348
@@ -1,343 +1,12 @@
|
||||
#include "ggml.h"
|
||||
#include "common.cuh"
|
||||
#include "mma.cuh"
|
||||
#include "mmf.cuh"
|
||||
|
||||
using namespace ggml_cuda_mma;
|
||||
|
||||
#define MMF_ROWS_PER_BLOCK 32
|
||||
|
||||
template <typename T, int rows_per_block, int cols_per_block, int nwarps>
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
|
||||
static __global__ void mul_mat_f(
|
||||
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
|
||||
const int ncols, const int nchannels_y, const int stride_row, const int stride_col_y, const int stride_col_dst,
|
||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile< 8, 8, T> tile_B;
|
||||
typedef tile<16, 8, float> tile_C;
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
constexpr int tile_k_padded = warp_size + 4;
|
||||
constexpr int ntA = rows_per_block / tile_A::I;
|
||||
constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I;
|
||||
|
||||
const int row0 = blockIdx.x * rows_per_block;
|
||||
const int channel_dst = blockIdx.y;
|
||||
const int channel_x = channel_dst / channel_ratio;
|
||||
const int channel_y = channel_dst;
|
||||
const int sample_dst = blockIdx.z;
|
||||
const int sample_x = sample_dst / sample_ratio;
|
||||
const int sample_y = sample_dst;
|
||||
|
||||
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ;
|
||||
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
|
||||
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
|
||||
|
||||
const float2 * y2 = (const float2 *) y;
|
||||
|
||||
extern __shared__ char data_mmv[];
|
||||
|
||||
tile_C C[ntA][ntB];
|
||||
|
||||
T * tile_xy = (T *) data_mmv + threadIdx.y*(tile_A::I * tile_k_padded);
|
||||
|
||||
for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) {
|
||||
tile_A A[ntA][warp_size / tile_A::J];
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < tile_A::I; ++i) {
|
||||
tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
|
||||
load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int itB = 0; itB < ntB; ++itB) {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < tile_B::I; ++j0) {
|
||||
const int j = j0 + itB*tile_B::I;
|
||||
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < tile_B::I; ++j0) {
|
||||
const int j = j0 + itB*tile_B::I;
|
||||
|
||||
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
|
||||
tile_B B;
|
||||
load_ldmatrix(B, tile_xy + k0, tile_k_padded);
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
mma(C[itA][itB], A[itA][k0/tile_B::J], B);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float * buf_iw = (float *) data_mmv;
|
||||
constexpr int kiw = nwarps*rows_per_block + 4;
|
||||
|
||||
if (nwarps > 1) {
|
||||
__syncthreads();
|
||||
}
|
||||
#pragma unroll
|
||||
for (int itB = 0; itB < ntB; ++itB) {
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l);
|
||||
const int j = itB*tile_C::J + tile_C::get_j(l);
|
||||
buf_iw[j*kiw + i] = C[itA][itB].x[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nwarps > 1) {
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j0 + nwarps > cols_per_block && j >= cols_per_block) {
|
||||
return;
|
||||
}
|
||||
|
||||
float sum = 0.0f;
|
||||
static_assert(rows_per_block == warp_size, "need loop/check");
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
sum += buf_iw[j*kiw + i];
|
||||
}
|
||||
dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(x, y, ids, dst,
|
||||
ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
template <typename T, int cols_per_block>
|
||||
static void mul_mat_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile< 8, 8, T> tile_B;
|
||||
|
||||
GGML_ASSERT(!ids && "mul_mat_id not implemented");
|
||||
|
||||
GGML_ASSERT(ncols_x % 2 == 0);
|
||||
GGML_ASSERT(stride_row % 2 == 0);
|
||||
GGML_ASSERT(stride_col_y % 2 == 0);
|
||||
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
|
||||
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
|
||||
const int64_t channel_ratio = nchannels_dst / nchannels_x;
|
||||
const int64_t sample_ratio = nsamples_dst / nsamples_x;
|
||||
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
|
||||
int64_t nwarps_best = 1;
|
||||
int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2);
|
||||
int64_t max_block_size = 256;
|
||||
for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) {
|
||||
const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2);
|
||||
if (niter < niter_best) {
|
||||
niter_best = niter;
|
||||
nwarps_best = nwarps;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
|
||||
const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4;
|
||||
const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
|
||||
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
|
||||
const dim3 block_nums(nrows_x/rows_per_block, nchannels_dst, nsamples_dst);
|
||||
const dim3 block_dims(warp_size, nwarps_best, 1);
|
||||
switch (nwarps_best) {
|
||||
case 1: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 1><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 2><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 3><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 4><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 5><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 6><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 7><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f<T, rows_per_block, cols_per_block, 8><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void mul_mat_f_switch_cols_per_block(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
switch (ncols_dst) {
|
||||
case 1: {
|
||||
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 9: {
|
||||
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 10: {
|
||||
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 11: {
|
||||
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 12: {
|
||||
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 13: {
|
||||
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 14: {
|
||||
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 15: {
|
||||
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 16: {
|
||||
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||
GGML_ASSERT( src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const size_t ts_src0 = ggml_type_size(src0->type);
|
||||
@@ -365,55 +34,72 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
|
||||
const int64_t s13 = src1->nb[3] / ts_src1;
|
||||
const int64_t s3 = dst->nb[3] / ts_dst;
|
||||
|
||||
const int64_t ids_s0 = ids ? ids->nb[0] / ggml_type_size(ids->type) : 0;
|
||||
const int64_t ids_s1 = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;
|
||||
|
||||
// For MUL_MAT_ID the memory layout is different than for MUL_MAT:
|
||||
const int64_t ncols_dst = ids ? ne2 : ne1;
|
||||
const int64_t nchannels_y = ids ? ne11 : ne12;
|
||||
const int64_t nchannels_dst = ids ? ne1 : ne2;
|
||||
const int64_t stride_channel_dst = ids ? s1 : s2;
|
||||
const int64_t stride_channel_y = ids ? s11 : s12;
|
||||
const int64_t nchannels_dst = ids ? ne1 : ne2;
|
||||
|
||||
GGML_ASSERT(!ids || ncols_dst == 1);
|
||||
const int64_t stride_col_dst = ids ? s2 : s1;
|
||||
const int64_t stride_col_y = ids ? s12 : s11;
|
||||
const int64_t stride_channel_dst = ids ? s1 : s2;
|
||||
|
||||
int64_t stride_channel_y = ids ? s11 : s12;
|
||||
int64_t nchannels_y = ids ? ne11 : ne12;
|
||||
|
||||
//mul_mat_id: handle broadcast
|
||||
if (ids && nchannels_y == 1) {
|
||||
stride_channel_y = 0;
|
||||
nchannels_y = ids->ne[0];
|
||||
}
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
constexpr int vals_per_T = 1;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half2 * src0_d = (const half2 *) src0->data;
|
||||
constexpr int vals_per_T = 2;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data;
|
||||
constexpr int vals_per_T = 2;
|
||||
mul_mat_f_switch_cols_per_block(
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
|
||||
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, int64_t ne11) {
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols) {
|
||||
|
||||
if (ggml_is_quantized(type)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
|
||||
return false;
|
||||
}
|
||||
if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
|
||||
return false;
|
||||
}
|
||||
if (ne11 > 16) {
|
||||
if (src1_ncols > 16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return ampere_mma_available(cc);
|
||||
|
||||
+469
-1
@@ -1,5 +1,473 @@
|
||||
#pragma once
|
||||
|
||||
#include "mma.cuh"
|
||||
#include "common.cuh"
|
||||
|
||||
using namespace ggml_cuda_mma;
|
||||
|
||||
#define MMF_ROWS_PER_BLOCK 32
|
||||
|
||||
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
|
||||
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, int64_t ne11);
|
||||
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols);
|
||||
|
||||
template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
|
||||
static __global__ void mul_mat_f(
|
||||
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
|
||||
const int ncols, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst,
|
||||
const int stride_col_id, const int stride_row_id,
|
||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile< 8, 8, T> tile_B;
|
||||
typedef tile<16, 8, float> tile_C;
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
constexpr int tile_k_padded = warp_size + 4;
|
||||
constexpr int ntA = rows_per_block / tile_A::I;
|
||||
constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I;
|
||||
|
||||
const int row0 = blockIdx.x * rows_per_block;
|
||||
|
||||
const int expert_idx = has_ids ? blockIdx.y : 0;
|
||||
const int channel_dst = has_ids ? 0 : blockIdx.y;
|
||||
|
||||
const int channel_x = has_ids ? expert_idx : (channel_dst / channel_ratio);
|
||||
const int channel_y = channel_dst;
|
||||
const int sample_dst = blockIdx.z;
|
||||
const int sample_x = sample_dst / sample_ratio;
|
||||
const int sample_y = sample_dst;
|
||||
|
||||
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ;
|
||||
y += int64_t(sample_y) *stride_sample_y + (has_ids ? 0 : channel_y *stride_channel_y);
|
||||
dst += int64_t(sample_dst)*stride_sample_dst + (has_ids ? 0 : channel_dst*stride_channel_dst);
|
||||
|
||||
const float2 * y2 = (const float2 *) y;
|
||||
|
||||
extern __shared__ char data_mmv[];
|
||||
|
||||
char * shmem_base = data_mmv;
|
||||
int * slot_map = (int *) shmem_base;
|
||||
char * compute_base = has_ids ? (shmem_base + GGML_PAD(cols_per_block, 16) * sizeof(int)) : shmem_base;
|
||||
|
||||
tile_C C[ntA][ntB];
|
||||
|
||||
T * tile_xy = (T *) compute_base + threadIdx.y*(tile_A::I * tile_k_padded);
|
||||
|
||||
if constexpr (has_ids) {
|
||||
__shared__ int has_any;
|
||||
if (threadIdx.y == 0) {
|
||||
int local_has_any = 0;
|
||||
for (int j = threadIdx.x; j < cols_per_block; j += warp_size) {
|
||||
int slot = -1;
|
||||
for (int k = 0; k < nchannels_dst; ++k) {
|
||||
const int idv = ids[j*stride_row_id + k*stride_col_id];
|
||||
if (idv == expert_idx) {
|
||||
slot = k;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j < cols_per_block) {
|
||||
local_has_any |= (slot >= 0);
|
||||
slot_map[j] = slot;
|
||||
}
|
||||
}
|
||||
has_any = warp_reduce_any(local_has_any);
|
||||
}
|
||||
__syncthreads();
|
||||
if (has_any == 0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) {
|
||||
tile_A A[ntA][warp_size / tile_A::J];
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < tile_A::I; ++i) {
|
||||
tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
|
||||
load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int itB = 0; itB < ntB; ++itB) {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < tile_B::I; ++j0) {
|
||||
const int j = j0 + itB*tile_B::I;
|
||||
|
||||
if constexpr (!has_ids) {
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
|
||||
} else {
|
||||
float val = 0.0f;
|
||||
if (j < cols_per_block) {
|
||||
const int slot = slot_map[j];
|
||||
if (slot >= 0) {
|
||||
val = y[slot*stride_channel_y + j*stride_col_y + col];
|
||||
}
|
||||
}
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = val;
|
||||
}
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < tile_B::I; ++j0) {
|
||||
const int j = j0 + itB*tile_B::I;
|
||||
|
||||
if constexpr (!has_ids) {
|
||||
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
||||
} else {
|
||||
float2 tmp = make_float2(0.0f, 0.0f);
|
||||
if (j < cols_per_block) {
|
||||
const int slot = slot_map[j];
|
||||
if (slot >= 0) {
|
||||
const float2 * y2_slot = (const float2 *)(y + slot*stride_channel_y);
|
||||
tmp = y2_slot[j*stride_col_y + col];
|
||||
}
|
||||
}
|
||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
||||
}
|
||||
}
|
||||
} else {
|
||||
static_assert(std::is_same_v<T, void>, "unsupported type");
|
||||
}
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
|
||||
tile_B B;
|
||||
load_ldmatrix(B, tile_xy + k0, tile_k_padded);
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
mma(C[itA][itB], A[itA][k0/tile_B::J], B);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float * buf_iw = (float *) compute_base;
|
||||
constexpr int kiw = nwarps*rows_per_block + 4;
|
||||
|
||||
if (nwarps > 1) {
|
||||
__syncthreads();
|
||||
}
|
||||
#pragma unroll
|
||||
for (int itB = 0; itB < ntB; ++itB) {
|
||||
#pragma unroll
|
||||
for (int itA = 0; itA < ntA; ++itA) {
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l);
|
||||
const int j = itB*tile_C::J + tile_C::get_j(l);
|
||||
buf_iw[j*kiw + i] = C[itA][itB].x[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nwarps > 1) {
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
if (j0 + nwarps > cols_per_block && j >= cols_per_block) {
|
||||
return;
|
||||
}
|
||||
|
||||
float sum = 0.0f;
|
||||
static_assert(rows_per_block == warp_size, "need loop/check");
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
sum += buf_iw[j*kiw + i];
|
||||
}
|
||||
|
||||
if constexpr (!has_ids) {
|
||||
dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
} else {
|
||||
const int slot = (j < cols_per_block) ? slot_map[j] : -1;
|
||||
if (slot >= 0) {
|
||||
dst[slot*stride_channel_dst + j*stride_col_dst + row0 + threadIdx.x] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(x, y, ids, dst,
|
||||
ncols, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
template<typename T, int cols_per_block, int nwarps>
|
||||
static inline void mul_mat_f_switch_ids(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nchannels_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t stride_col_id, const int64_t stride_row_id,
|
||||
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
|
||||
const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) {
|
||||
if (ids) {
|
||||
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} else {
|
||||
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, false><<<block_nums, block_dims, nbytes_shared_total, stream>>>
|
||||
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int cols_per_block>
|
||||
void mul_mat_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t stride_col_id, const int64_t stride_row_id,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
typedef tile<16, 8, T> tile_A;
|
||||
typedef tile< 8, 8, T> tile_B;
|
||||
|
||||
GGML_ASSERT(ncols_x % 2 == 0);
|
||||
GGML_ASSERT(stride_row % 2 == 0);
|
||||
GGML_ASSERT(stride_col_y % 2 == 0);
|
||||
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
|
||||
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
|
||||
const int64_t channel_ratio = nchannels_dst / nchannels_x;
|
||||
const int64_t sample_ratio = nsamples_dst / nsamples_x;
|
||||
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
|
||||
int64_t nwarps_best = 1;
|
||||
int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2);
|
||||
int64_t max_block_size = 256;
|
||||
for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) {
|
||||
const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2);
|
||||
if (niter < niter_best) {
|
||||
niter_best = niter;
|
||||
nwarps_best = nwarps;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
|
||||
const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4;
|
||||
const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
|
||||
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
|
||||
const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;
|
||||
const int nbytes_shared_total = nbytes_shared + nbytes_slotmap;
|
||||
const int64_t grid_y = ids ? nchannels_x : nchannels_dst; // per expert when ids present
|
||||
|
||||
const dim3 block_nums(nrows_x/rows_per_block, grid_y, nsamples_dst);
|
||||
const dim3 block_dims(warp_size, nwarps_best, 1);
|
||||
|
||||
switch (nwarps_best) {
|
||||
case 1: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 1>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 2>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 3>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 4>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 5>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 6>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 7>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f_switch_ids<T, cols_per_block, 8>(
|
||||
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
} break;
|
||||
}
|
||||
|
||||
GGML_UNUSED_VARS(nchannels_y);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void mul_mat_f_switch_cols_per_block(
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t stride_col_id, const int stride_row_id,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
switch (ncols_dst) {
|
||||
case 1: {
|
||||
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 2: {
|
||||
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 3: {
|
||||
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 4: {
|
||||
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 5: {
|
||||
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 6: {
|
||||
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 7: {
|
||||
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 8: {
|
||||
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 9: {
|
||||
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 10: {
|
||||
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 11: {
|
||||
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 12: {
|
||||
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 13: {
|
||||
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 14: {
|
||||
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 15: {
|
||||
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
case 16: {
|
||||
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
|
||||
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
#define DECL_MMF_CASE_HELPER(T, ncols_dst) \
|
||||
template void mul_mat_f_cuda<T, ncols_dst>( \
|
||||
const T * x, const float * y, const int32_t * ids, float * dst, \
|
||||
const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
|
||||
const int64_t stride_col_id, const int64_t stride_row_id, \
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, \
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,\
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, \
|
||||
cudaStream_t stream);
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
#define DECL_MMF_CASE_EXTERN(ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(float, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(half2, ncols_dst) \
|
||||
extern DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst)
|
||||
|
||||
#define DECL_MMF_CASE(ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(float, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(half2, ncols_dst) \
|
||||
DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst)
|
||||
|
||||
DECL_MMF_CASE_EXTERN(1);
|
||||
DECL_MMF_CASE_EXTERN(2);
|
||||
DECL_MMF_CASE_EXTERN(3);
|
||||
DECL_MMF_CASE_EXTERN(4);
|
||||
DECL_MMF_CASE_EXTERN(5);
|
||||
DECL_MMF_CASE_EXTERN(6);
|
||||
DECL_MMF_CASE_EXTERN(7);
|
||||
DECL_MMF_CASE_EXTERN(8);
|
||||
DECL_MMF_CASE_EXTERN(9);
|
||||
DECL_MMF_CASE_EXTERN(10);
|
||||
DECL_MMF_CASE_EXTERN(11);
|
||||
DECL_MMF_CASE_EXTERN(12);
|
||||
DECL_MMF_CASE_EXTERN(13);
|
||||
DECL_MMF_CASE_EXTERN(14);
|
||||
DECL_MMF_CASE_EXTERN(15);
|
||||
DECL_MMF_CASE_EXTERN(16);
|
||||
#else
|
||||
#define DECL_MMF_CASE(ncols_dst)
|
||||
#endif
|
||||
|
||||
@@ -24,7 +24,7 @@ TYPES_MMQ = [
|
||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
|
||||
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
|
||||
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
|
||||
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS", "GGML_TYPE_MXFP4"
|
||||
]
|
||||
|
||||
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
@@ -34,6 +34,13 @@ SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do
|
||||
DECL_MMQ_CASE({type});
|
||||
"""
|
||||
|
||||
SOURCE_MMF = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE({type});
|
||||
"""
|
||||
|
||||
|
||||
def get_short_name(long_quant_name):
|
||||
return long_quant_name.replace("GGML_TYPE_", "").lower()
|
||||
@@ -76,3 +83,7 @@ for ncols in [8, 16, 32, 64]:
|
||||
for type in TYPES_MMQ:
|
||||
with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
|
||||
f.write(SOURCE_MMQ.format(type=type))
|
||||
|
||||
for type in range(1, 17):
|
||||
with open(f"mmf-instance-ncols_{type}.cu", "w") as f:
|
||||
f.write(SOURCE_MMF.format(type=type))
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(1);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(10);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(11);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(12);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(13);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(14);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(15);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(16);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(2);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(3);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(4);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(5);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(6);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(7);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(8);
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmf.cuh"
|
||||
|
||||
DECL_MMF_CASE(9);
|
||||
@@ -20,8 +20,8 @@
|
||||
#define N_R0_Q5_1 4
|
||||
#define N_SG_Q5_1 2
|
||||
|
||||
#define N_R0_Q8_0 4
|
||||
#define N_SG_Q8_0 2
|
||||
#define N_R0_Q8_0 2
|
||||
#define N_SG_Q8_0 4
|
||||
|
||||
#define N_R0_MXFP4 2
|
||||
#define N_SG_MXFP4 2
|
||||
@@ -68,6 +68,11 @@
|
||||
#define N_R0_IQ4_XS 2
|
||||
#define N_SG_IQ4_XS 2
|
||||
|
||||
// function constants offsets
|
||||
#define FC_FLASH_ATTN_EXT 100
|
||||
#define FC_FLASH_ATTN_EXT_VEC 200
|
||||
#define FC_FLASH_ATTN_EXT_VEC_REDUCE 300
|
||||
|
||||
// kernel argument structs
|
||||
//
|
||||
// - element counters (e.g. ne00) typically use int32_t to reduce register usage
|
||||
@@ -236,9 +241,11 @@ typedef struct {
|
||||
int32_t ne11;
|
||||
int32_t ne_12_2; // assume K and V are same shape
|
||||
int32_t ne_12_3;
|
||||
int32_t ns10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
int32_t ns20;
|
||||
uint64_t nb21;
|
||||
uint64_t nb22;
|
||||
uint64_t nb23;
|
||||
@@ -258,10 +265,43 @@ typedef struct {
|
||||
float logit_softcap;
|
||||
} ggml_metal_kargs_flash_attn_ext;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne01;
|
||||
int32_t ne02;
|
||||
int32_t ne03;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int32_t ne11;
|
||||
int32_t ne_12_2; // assume K and V are same shape
|
||||
int32_t ne_12_3;
|
||||
int32_t ns10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
int32_t ns20;
|
||||
uint64_t nb21;
|
||||
uint64_t nb22;
|
||||
uint64_t nb23;
|
||||
int32_t ne32;
|
||||
int32_t ne33;
|
||||
uint64_t nb31;
|
||||
uint64_t nb32;
|
||||
uint64_t nb33;
|
||||
int32_t ne1;
|
||||
int32_t ne2;
|
||||
int32_t ne3;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
int32_t n_head_log2;
|
||||
float logit_softcap;
|
||||
} ggml_metal_kargs_flash_attn_ext_vec;
|
||||
|
||||
typedef struct {
|
||||
int32_t nrows;
|
||||
int32_t ne20;
|
||||
} ggml_metal_kargs_flash_attn_ext_reduce;
|
||||
} ggml_metal_kargs_flash_attn_ext_vec_reduce;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
|
||||
+428
-665
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -2838,6 +2838,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
|
||||
/* .graph_compute = */ ggml_backend_opencl_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_opencl_init(void) {
|
||||
|
||||
@@ -795,6 +795,7 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
||||
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||
|
||||
@@ -4063,6 +4063,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
||||
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
||||
/* .event_record = */ ggml_backend_sycl_event_record,
|
||||
/* .event_wait = */ ggml_backend_sycl_event_wait,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_sycl_guid() {
|
||||
|
||||
@@ -506,8 +506,8 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_pad_f32;
|
||||
vk_pipeline pipeline_roll_f32;
|
||||
vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
|
||||
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_set_rows[GGML_TYPE_COUNT];
|
||||
@@ -554,6 +554,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_argmax_f32;
|
||||
vk_pipeline pipeline_count_equal_i32;
|
||||
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
||||
vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_conv_transpose_1d_f32;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
@@ -582,6 +583,7 @@ struct vk_device_struct {
|
||||
bool disable_fusion;
|
||||
bool disable_host_visible_vidmem;
|
||||
bool allow_sysmem_fallback;
|
||||
bool disable_optimize_graph;
|
||||
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
std::unique_ptr<vk_memory_logger> memory_logger;
|
||||
@@ -803,6 +805,57 @@ static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_ten
|
||||
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
|
||||
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
|
||||
|
||||
return p; // offsets are initialized later in ggml_vk_op
|
||||
}
|
||||
|
||||
struct vk_op_pad_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
||||
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
|
||||
uint32_t misalign_offsets;
|
||||
|
||||
uint32_t lp0; uint32_t rp0;
|
||||
uint32_t lp1; uint32_t rp1;
|
||||
uint32_t lp2; uint32_t rp2;
|
||||
uint32_t lp3; uint32_t rp3;
|
||||
};
|
||||
|
||||
static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst) {
|
||||
int64_t ne = ggml_nelements(dst);
|
||||
GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
|
||||
|
||||
vk_op_pad_push_constants p{};
|
||||
p.ne = (uint32_t)ne;
|
||||
|
||||
size_t src0_tsize = ggml_type_size(src0->type);
|
||||
p.ne00 = (uint32_t)src0->ne[0];
|
||||
p.ne01 = (uint32_t)src0->ne[1];
|
||||
p.ne02 = (uint32_t)src0->ne[2];
|
||||
p.ne03 = (uint32_t)src0->ne[3];
|
||||
p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
|
||||
p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
|
||||
p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
|
||||
p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
|
||||
|
||||
size_t dst_tsize = ggml_type_size(dst->type);
|
||||
p.ne10 = (uint32_t)dst->ne[0];
|
||||
p.ne11 = (uint32_t)dst->ne[1];
|
||||
p.ne12 = (uint32_t)dst->ne[2];
|
||||
p.ne13 = (uint32_t)dst->ne[3];
|
||||
p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
|
||||
p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
|
||||
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
|
||||
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
|
||||
|
||||
p.lp0 = dst->op_params[0];
|
||||
p.rp0 = dst->op_params[1];
|
||||
p.lp1 = dst->op_params[2];
|
||||
p.rp1 = dst->op_params[3];
|
||||
p.lp2 = dst->op_params[4];
|
||||
p.rp2 = dst->op_params[5];
|
||||
p.lp3 = dst->op_params[6];
|
||||
p.rp3 = dst->op_params[7];
|
||||
|
||||
return p; // fastdiv values and offsets are initialized later in ggml_vk_op
|
||||
}
|
||||
|
||||
@@ -931,6 +984,37 @@ struct vk_op_im2col_push_constants {
|
||||
int32_t d0; int32_t d1;
|
||||
};
|
||||
|
||||
struct vk_op_im2col_3d_push_constants {
|
||||
uint32_t nb10;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t s0;
|
||||
uint32_t s1;
|
||||
uint32_t s2;
|
||||
uint32_t p0;
|
||||
uint32_t p1;
|
||||
uint32_t p2;
|
||||
uint32_t d0;
|
||||
uint32_t d1;
|
||||
uint32_t d2;
|
||||
uint32_t IW;
|
||||
uint32_t IH;
|
||||
uint32_t ID;
|
||||
uint32_t IC;
|
||||
uint32_t KW;
|
||||
uint32_t OH;
|
||||
uint32_t KD_KH_KW;
|
||||
uint32_t KH_KW;
|
||||
uint32_t IC_KD_KH_KW;
|
||||
uint32_t N_OD_OH;
|
||||
uint32_t OD_OH;
|
||||
uint32_t OD_OH_OW_IC_KD_KH_KW;
|
||||
uint32_t OH_OW_IC_KD_KH_KW;
|
||||
uint32_t OW_IC_KD_KH_KW;
|
||||
uint32_t misalign_offsets;
|
||||
};
|
||||
|
||||
struct vk_op_timestep_embedding_push_constants {
|
||||
uint32_t nb1;
|
||||
uint32_t dim;
|
||||
@@ -3143,12 +3227,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f32, "cpy_f16_f32", cpy_f16_f32_len, cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_bf16,"cpy_f32_bf16",cpy_f32_bf16_len,cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_i32_f32, "cpy_i32_f32", cpy_i32_f32_len, cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_i32, "cpy_f32_i32", cpy_f32_i32_len, cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f32, "contig_cpy_f16_f32", contig_cpy_f16_f32_len, contig_cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
if (device->float_controls_rte_fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
|
||||
@@ -3250,7 +3338,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -3329,10 +3417,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32, "im2col_3d_f32", im2col_3d_f32_len, im2col_3d_f32_data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true);
|
||||
if (device->float_controls_rte_fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16_rte_len, im2col_3d_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16_len, im2col_3d_f32_f16_data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
@@ -3502,6 +3593,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
const char* GGML_VK_ALLOW_SYSMEM_FALLBACK = getenv("GGML_VK_ALLOW_SYSMEM_FALLBACK");
|
||||
device->allow_sysmem_fallback = GGML_VK_ALLOW_SYSMEM_FALLBACK != nullptr;
|
||||
|
||||
const char* GGML_VK_DISABLE_OPTIMIZE_GRAPH = getenv("GGML_VK_DISABLE_OPTIMIZE_GRAPH");
|
||||
device->disable_optimize_graph = GGML_VK_DISABLE_OPTIMIZE_GRAPH != nullptr;
|
||||
|
||||
bool fp16_storage = false;
|
||||
bool fp16_compute = false;
|
||||
bool maintenance4_support = false;
|
||||
@@ -3642,6 +3736,12 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
||||
device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
|
||||
#ifdef __APPLE__
|
||||
// Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846)
|
||||
if (device->vendor_id == VK_VENDOR_ID_AMD) {
|
||||
device->subgroup_arithmetic = false;
|
||||
}
|
||||
#endif
|
||||
device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle);
|
||||
device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
@@ -5607,6 +5707,20 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_cpy_f32_bf16;
|
||||
}
|
||||
}
|
||||
if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_I32) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_f32_i32;
|
||||
} else {
|
||||
return ctx->device->pipeline_cpy_f32_i32;
|
||||
}
|
||||
}
|
||||
if (src->type == GGML_TYPE_I32 && to == GGML_TYPE_F32) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_i32_f32;
|
||||
} else {
|
||||
return ctx->device->pipeline_cpy_i32_f32;
|
||||
}
|
||||
}
|
||||
if (src->type == GGML_TYPE_F32) {
|
||||
switch (to) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
@@ -7666,6 +7780,14 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_im2col_f32_f16;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_IM2COL_3D:
|
||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_im2col_3d_f32;
|
||||
}
|
||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_im2col_3d_f32_f16;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_timestep_embedding_f32;
|
||||
@@ -7781,6 +7903,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_3D:
|
||||
case GGML_OP_SET_ROWS:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
@@ -7829,6 +7952,26 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
||||
GGML_UNUSED(src2);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | d_offset;
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
||||
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
p.misalign_offsets = (a_offset << 16) | d_offset;
|
||||
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src2);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
||||
@@ -8069,6 +8212,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
|
||||
elements = { OW * KW * KH, OH, batch * IC };
|
||||
} break;
|
||||
case GGML_OP_IM2COL_3D:
|
||||
{
|
||||
const uint32_t IC = ((const uint32_t *)(dst->op_params))[9];
|
||||
|
||||
const uint32_t N = ne13 / IC;
|
||||
|
||||
const uint32_t KD = ne02;
|
||||
const uint32_t KH = ne01;
|
||||
const uint32_t KW = ne00;
|
||||
|
||||
const uint32_t OD = ned3 / N;
|
||||
const uint32_t OH = ned2;
|
||||
const uint32_t OW = ned1;
|
||||
|
||||
const uint32_t IC_KD_KH_KW = IC*KD*KH*KW;
|
||||
const uint32_t N_OD_OH = N*OD*OH;
|
||||
|
||||
elements = { IC_KD_KH_KW, OW, N_OD_OH };
|
||||
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
||||
} break;
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
{
|
||||
const uint32_t dim = dst->op_params[0];
|
||||
@@ -8225,7 +8388,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
}
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
||||
} else if (op == GGML_OP_IM2COL) {
|
||||
} else if (op == GGML_OP_IM2COL || op == GGML_OP_IM2COL_3D) {
|
||||
// im2col uses only src1 and dst buffers
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
||||
} else if (op == GGML_OP_COUNT_EQUAL) {
|
||||
@@ -8771,7 +8934,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, con
|
||||
}
|
||||
|
||||
static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
||||
vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst);
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
|
||||
}
|
||||
|
||||
@@ -9086,6 +9249,66 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
|
||||
const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
|
||||
const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
|
||||
const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
|
||||
const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
|
||||
const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
|
||||
const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
|
||||
const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
|
||||
const int32_t IC = ((const int32_t *)(dst->op_params))[9];
|
||||
|
||||
const int64_t N = ne13 / IC;
|
||||
const int64_t ID = ne12;
|
||||
const int64_t IH = ne11;
|
||||
const int64_t IW = ne10;
|
||||
|
||||
const int64_t KD = ne02;
|
||||
const int64_t KH = ne01;
|
||||
const int64_t KW = ne00;
|
||||
|
||||
const int64_t OD = ne3 / N;
|
||||
const int64_t OH = ne2;
|
||||
const int64_t OW = ne1;
|
||||
|
||||
vk_op_im2col_3d_push_constants pc {};
|
||||
|
||||
pc.nb10 = nb10 / ggml_type_size(src1->type);
|
||||
pc.nb11 = nb11 / ggml_type_size(src1->type);
|
||||
pc.nb12 = nb12 / ggml_type_size(src1->type);
|
||||
pc.nb13 = nb13 / ggml_type_size(src1->type);
|
||||
pc.s0 = s0;
|
||||
pc.s1 = s1;
|
||||
pc.s2 = s2;
|
||||
pc.p0 = p0;
|
||||
pc.p1 = p1;
|
||||
pc.p2 = p2;
|
||||
pc.d0 = d0;
|
||||
pc.d1 = d1;
|
||||
pc.d2 = d2;
|
||||
pc.IW = IW;
|
||||
pc.IH = IH;
|
||||
pc.ID = ID;
|
||||
pc.IC = IC;
|
||||
pc.KW = KW;
|
||||
pc.OH = OH;
|
||||
pc.KD_KH_KW = KD*KH*KW;
|
||||
pc.KH_KW = KH*KW;
|
||||
pc.IC_KD_KH_KW = IC*KD*KH*KW;
|
||||
pc.N_OD_OH = N*OD*OH;
|
||||
pc.OD_OH = OD*OH;
|
||||
pc.OD_OH_OW_IC_KD_KH_KW = OD*OH*OW*IC*KD*KH*KW;
|
||||
pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW;
|
||||
pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW;
|
||||
|
||||
ggml_vk_op_f32<vk_op_im2col_3d_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc), dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
const uint32_t dim = dst->op_params[0];
|
||||
const uint32_t max_period = dst->op_params[1];
|
||||
@@ -10291,6 +10514,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_3D:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
@@ -10361,6 +10585,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_3D:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
@@ -10656,6 +10881,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_IM2COL:
|
||||
ggml_vk_im2col(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_IM2COL_3D:
|
||||
ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
|
||||
@@ -10807,6 +11036,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_3D:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
@@ -11633,6 +11863,131 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
// Sort the graph for improved parallelism.
|
||||
static void ggml_vk_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * graph)
|
||||
{
|
||||
VK_LOG_DEBUG("ggml_vk_optimize_graph(" << graph->n_nodes << " nodes)");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
if (ctx->device->disable_optimize_graph) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto const &is_empty = [](ggml_tensor * node) -> bool {
|
||||
return node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
|
||||
};
|
||||
|
||||
auto const &is_src_of = [](const ggml_tensor *dst, const ggml_tensor *src) -> bool {
|
||||
for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
|
||||
if (dst->src[s] == src) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// implicit dependency if they view the same tensor
|
||||
const ggml_tensor *dst2 = dst->view_src ? dst->view_src : dst;
|
||||
const ggml_tensor *src2 = src->view_src ? src->view_src : src;
|
||||
if (dst2 == src2) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// This function tries to reorder the graph to allow nodes to run in parallel.
|
||||
// This helps with small batches, but for large batches its a slowdown, probably
|
||||
// due to cache contention. So only reorder if the majority of nodes have few rows.
|
||||
int num_small_nodes = 0;
|
||||
int num_counted_nodes = 0;
|
||||
for (int i = 0; i < graph->n_nodes; ++i) {
|
||||
if (!is_empty(graph->nodes[i]) &&
|
||||
graph->nodes[i]->op != GGML_OP_SET_ROWS) {
|
||||
if (ggml_nrows(graph->nodes[i]) <= 8) {
|
||||
num_small_nodes++;
|
||||
}
|
||||
num_counted_nodes++;
|
||||
}
|
||||
}
|
||||
if (num_small_nodes < num_counted_nodes / 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<ggml_tensor *> new_order;
|
||||
std::vector<bool> used(graph->n_nodes, false);
|
||||
int first_unused = 0;
|
||||
while (first_unused < graph->n_nodes) {
|
||||
std::vector<int> current_set;
|
||||
|
||||
// First, grab the next unused node.
|
||||
current_set.push_back(first_unused);
|
||||
|
||||
// Loop through the next N nodes. Grab any that don't depend on other nodes that
|
||||
// haven't already been run. Nodes that have already been run have used[i] set
|
||||
// to true. Allow nodes that depend on the previous node if it's a fusion pattern
|
||||
// that we support (e.g. RMS_NORM + MUL).
|
||||
// This first pass only grabs "real" (non-view nodes). Second pass grabs view nodes.
|
||||
// The goal is to not interleave real and view nodes in a way that breaks fusion.
|
||||
const int NUM_TO_CHECK = 20;
|
||||
for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) {
|
||||
if (used[j]) {
|
||||
continue;
|
||||
}
|
||||
if (is_empty(graph->nodes[j])) {
|
||||
continue;
|
||||
}
|
||||
bool ok = true;
|
||||
for (int c = first_unused; c < j; ++c) {
|
||||
if (!used[c] &&
|
||||
is_src_of(graph->nodes[j], graph->nodes[c]) &&
|
||||
!(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL)) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ok) {
|
||||
current_set.push_back(j);
|
||||
}
|
||||
}
|
||||
// Second pass grabs view nodes.
|
||||
// Skip this if it would break a fusion optimization (don't split up add->rms_norm or add->add).
|
||||
if (graph->nodes[current_set.back()]->op != GGML_OP_ADD) {
|
||||
for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) {
|
||||
if (used[j]) {
|
||||
continue;
|
||||
}
|
||||
if (!is_empty(graph->nodes[j])) {
|
||||
continue;
|
||||
}
|
||||
bool ok = true;
|
||||
for (int c = first_unused; c < j; ++c) {
|
||||
bool c_in_current_set = std::find(current_set.begin(), current_set.end(), c) != current_set.end();
|
||||
// skip views whose srcs haven't been processed.
|
||||
if (!used[c] &&
|
||||
is_src_of(graph->nodes[j], graph->nodes[c]) &&
|
||||
!c_in_current_set) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ok) {
|
||||
current_set.push_back(j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Push the current set into new_order
|
||||
for (auto c : current_set) {
|
||||
new_order.push_back(graph->nodes[c]);
|
||||
used[c] = true;
|
||||
}
|
||||
while (first_unused < graph->n_nodes && used[first_unused]) {
|
||||
first_unused++;
|
||||
}
|
||||
}
|
||||
// Replace the graph with the new order.
|
||||
for (int i = 0; i < graph->n_nodes; ++i) {
|
||||
graph->nodes[i] = new_order[i];
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: enable async and synchronize
|
||||
static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_name,
|
||||
@@ -11648,6 +12003,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ ggml_vk_optimize_graph,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_vk_guid() {
|
||||
@@ -12022,6 +12378,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return true;
|
||||
}
|
||||
|
||||
if (
|
||||
src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32 ||
|
||||
src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// We can handle copying from a type to the same type if it's
|
||||
// contiguous (memcpy). We use f16 or f32 shaders to do the copy,
|
||||
// so the type/block size must be a multiple of 4.
|
||||
@@ -12076,10 +12439,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_SCALE:
|
||||
return true;
|
||||
case GGML_OP_PAD:
|
||||
return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
|
||||
(ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
|
||||
case GGML_OP_ROLL:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
@@ -12092,6 +12452,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_3D:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_POOL_2D:
|
||||
@@ -12520,7 +12881,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
|
||||
} else if (tensor->op == GGML_OP_PAD) {
|
||||
tensor_clone = ggml_pad(ggml_ctx, src_clone[0], tensor->ne[0] - src_clone[0]->ne[0], tensor->ne[1] - src_clone[0]->ne[1], tensor->ne[2] - src_clone[0]->ne[2], tensor->ne[3] - src_clone[0]->ne[3]);
|
||||
tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3],
|
||||
tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]);
|
||||
} else if (tensor->op == GGML_OP_REPEAT) {
|
||||
tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor);
|
||||
} else if (tensor->op == GGML_OP_REPEAT_BACK) {
|
||||
@@ -12666,6 +13028,19 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
|
||||
const bool is_2D = tensor->op_params[6] == 1;
|
||||
tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
|
||||
} else if (tensor->op == GGML_OP_IM2COL_3D) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t s1 = tensor->op_params[2];
|
||||
const int32_t p0 = tensor->op_params[3];
|
||||
const int32_t p1 = tensor->op_params[4];
|
||||
const int32_t p1 = tensor->op_params[5];
|
||||
const int32_t d0 = tensor->op_params[6];
|
||||
const int32_t d1 = tensor->op_params[7];
|
||||
const int32_t d1 = tensor->op_params[8];
|
||||
const int32_t IC = tensor->op_params[9];
|
||||
|
||||
tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
|
||||
} else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
|
||||
const int32_t dim = tensor->op_params[0];
|
||||
const int32_t max_period = tensor->op_params[1];
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
|
||||
#include "rte.comp"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint32_t nb10;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t s0;
|
||||
uint32_t s1;
|
||||
uint32_t s2;
|
||||
uint32_t p0;
|
||||
uint32_t p1;
|
||||
uint32_t p2;
|
||||
uint32_t d0;
|
||||
uint32_t d1;
|
||||
uint32_t d2;
|
||||
uint32_t IW;
|
||||
uint32_t IH;
|
||||
uint32_t ID;
|
||||
uint32_t IC;
|
||||
uint32_t KW;
|
||||
uint32_t OH;
|
||||
uint32_t KD_KH_KW;
|
||||
uint32_t KH_KW;
|
||||
uint32_t IC_KD_KH_KW;
|
||||
uint32_t N_OD_OH;
|
||||
uint32_t OD_OH;
|
||||
uint32_t OD_OH_OW_IC_KD_KH_KW;
|
||||
uint32_t OH_OW_IC_KD_KH_KW;
|
||||
uint32_t OW_IC_KD_KH_KW;
|
||||
uint32_t misalign_offsets;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
|
||||
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint32_t i = gl_GlobalInvocationID.x;
|
||||
|
||||
uint32_t nb10 = p.nb10;
|
||||
uint32_t nb11 = p.nb11;
|
||||
uint32_t nb12 = p.nb12;
|
||||
uint32_t nb13 = p.nb13;
|
||||
uint32_t s0 = p.s0;
|
||||
uint32_t s1 = p.s1;
|
||||
uint32_t s2 = p.s2;
|
||||
uint32_t p0 = p.p0;
|
||||
uint32_t p1 = p.p1;
|
||||
uint32_t p2 = p.p2;
|
||||
uint32_t d0 = p.d0;
|
||||
uint32_t d1 = p.d1;
|
||||
uint32_t d2 = p.d2;
|
||||
uint32_t IW = p.IW;
|
||||
uint32_t IH = p.IH;
|
||||
uint32_t ID = p.ID;
|
||||
uint32_t IC = p.IC;
|
||||
uint32_t KW = p.KW;
|
||||
uint32_t OH = p.OH;
|
||||
uint32_t KD_KH_KW = p.KD_KH_KW;
|
||||
uint32_t KH_KW = p.KH_KW;
|
||||
uint32_t IC_KD_KH_KW = p.IC_KD_KH_KW;
|
||||
uint32_t N_OD_OH = p.N_OD_OH;
|
||||
uint32_t OD_OH = p.OD_OH;
|
||||
uint32_t OD_OH_OW_IC_KD_KH_KW = p.OD_OH_OW_IC_KD_KH_KW;
|
||||
uint32_t OH_OW_IC_KD_KH_KW = p.OH_OW_IC_KD_KH_KW;
|
||||
uint32_t OW_IC_KD_KH_KW = p.OW_IC_KD_KH_KW;
|
||||
|
||||
if (i >= IC_KD_KH_KW) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t iic = i / KD_KH_KW;
|
||||
const uint32_t ikd = (i - iic * KD_KH_KW) / KH_KW;
|
||||
const uint32_t ikh = (i - iic * KD_KH_KW - ikd * KH_KW) / KW;
|
||||
const uint32_t ikw = i % KW;
|
||||
|
||||
const uint32_t iow = gl_GlobalInvocationID.y;
|
||||
for (uint32_t iz = gl_GlobalInvocationID.z; iz < N_OD_OH; iz += gl_NumWorkGroups.z) {
|
||||
const uint32_t in_ = iz / OD_OH;
|
||||
const uint32_t iod = (iz - in_*OD_OH) / OH;
|
||||
const uint32_t ioh = iz % OH;
|
||||
|
||||
const uint32_t iiw = iow * s0 + ikw * d0 - p0;
|
||||
const uint32_t iih = ioh * s1 + ikh * d1 - p1;
|
||||
const uint32_t iid = iod * s2 + ikd * d2 - p2;
|
||||
|
||||
const uint32_t offset_dst = in_*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW + iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
|
||||
|
||||
if (iih >= IH || iiw >= IW || iid >= ID) {
|
||||
data_d[offset_dst + get_doffset()] = D_TYPE(0.0f);
|
||||
} else {
|
||||
const uint32_t offset_src = (in_*IC + iic)*nb13 + iid*nb12 + iih*nb11 + iiw*nb10;
|
||||
data_d[offset_dst + get_doffset()] = D_TYPE(data_a[offset_src + get_aoffset()]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -315,21 +315,23 @@ void main() {
|
||||
#if LOAD_VEC_A == 8
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx][0].x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx][0].y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx][0].z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx][0].w);
|
||||
buf_a[buf_idx + 4] = FLOAT_TYPE(data_a[idx][1].x);
|
||||
buf_a[buf_idx + 5] = FLOAT_TYPE(data_a[idx][1].y);
|
||||
buf_a[buf_idx + 6] = FLOAT_TYPE(data_a[idx][1].z);
|
||||
buf_a[buf_idx + 7] = FLOAT_TYPE(data_a[idx][1].w);
|
||||
A_TYPE32 aa = A_TYPE32(data_a[idx]);
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(aa[0].x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(aa[0].y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(aa[0].z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(aa[0].w);
|
||||
buf_a[buf_idx + 4] = FLOAT_TYPE(aa[1].x);
|
||||
buf_a[buf_idx + 5] = FLOAT_TYPE(aa[1].y);
|
||||
buf_a[buf_idx + 6] = FLOAT_TYPE(aa[1].z);
|
||||
buf_a[buf_idx + 7] = FLOAT_TYPE(aa[1].w);
|
||||
#elif LOAD_VEC_A == 4
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx].x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx].y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx].z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx].w);
|
||||
A_TYPE32 aa = A_TYPE32(data_a[idx]);
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(aa.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(aa.y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(aa.z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(aa.w);
|
||||
#else
|
||||
if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
|
||||
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
|
||||
@@ -808,14 +810,19 @@ void main() {
|
||||
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#endif
|
||||
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
||||
buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx][0].x);
|
||||
buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx][0].y);
|
||||
buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx][0].z);
|
||||
buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx][0].w);
|
||||
buf_b[buf_idx + 4] = FLOAT_TYPE(data_b[idx][1].x);
|
||||
buf_b[buf_idx + 5] = FLOAT_TYPE(data_b[idx][1].y);
|
||||
buf_b[buf_idx + 6] = FLOAT_TYPE(data_b[idx][1].z);
|
||||
buf_b[buf_idx + 7] = FLOAT_TYPE(data_b[idx][1].w);
|
||||
#if defined(DATA_B_BF16)
|
||||
B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]);
|
||||
#else
|
||||
B_TYPE32 bb = B_TYPE32(data_b[idx]);
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = FLOAT_TYPE(bb[0].x);
|
||||
buf_b[buf_idx + 1] = FLOAT_TYPE(bb[0].y);
|
||||
buf_b[buf_idx + 2] = FLOAT_TYPE(bb[0].z);
|
||||
buf_b[buf_idx + 3] = FLOAT_TYPE(bb[0].w);
|
||||
buf_b[buf_idx + 4] = FLOAT_TYPE(bb[1].x);
|
||||
buf_b[buf_idx + 5] = FLOAT_TYPE(bb[1].y);
|
||||
buf_b[buf_idx + 6] = FLOAT_TYPE(bb[1].z);
|
||||
buf_b[buf_idx + 7] = FLOAT_TYPE(bb[1].w);
|
||||
#elif LOAD_VEC_B == 4
|
||||
#ifdef MUL_MAT_ID
|
||||
const u16vec2 row_idx = row_ids[loadc_b + l];
|
||||
@@ -824,10 +831,15 @@ void main() {
|
||||
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#endif
|
||||
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
||||
buf_b[buf_idx + 0] = TO_FLOAT_TYPE(data_b[idx].x);
|
||||
buf_b[buf_idx + 1] = TO_FLOAT_TYPE(data_b[idx].y);
|
||||
buf_b[buf_idx + 2] = TO_FLOAT_TYPE(data_b[idx].z);
|
||||
buf_b[buf_idx + 3] = TO_FLOAT_TYPE(data_b[idx].w);
|
||||
#if defined(DATA_B_BF16)
|
||||
B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]);
|
||||
#else
|
||||
B_TYPE32 bb = B_TYPE32(data_b[idx]);
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = FLOAT_TYPE(bb.x);
|
||||
buf_b[buf_idx + 1] = FLOAT_TYPE(bb.y);
|
||||
buf_b[buf_idx + 2] = FLOAT_TYPE(bb.z);
|
||||
buf_b[buf_idx + 3] = FLOAT_TYPE(bb.w);
|
||||
#elif !MUL_MAT_ID
|
||||
if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) {
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]);
|
||||
|
||||
@@ -1,7 +1,25 @@
|
||||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ne;
|
||||
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||
uint misalign_offsets;
|
||||
|
||||
uint lp0; uint rp0;
|
||||
uint lp1; uint rp1;
|
||||
uint lp2; uint rp2;
|
||||
uint lp3; uint rp3;
|
||||
} p;
|
||||
|
||||
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
@@ -19,10 +37,13 @@ void main() {
|
||||
const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
|
||||
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
|
||||
|
||||
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
|
||||
const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00;
|
||||
const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
|
||||
|
||||
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
||||
const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 &&
|
||||
i1 >= p.lp1 && i1 < p.ne11 - p.rp1 &&
|
||||
i2 >= p.lp2 && i2 < p.ne12 - p.rp2 &&
|
||||
i3 >= p.lp3 && i3 < p.ne13 - p.rp3;
|
||||
|
||||
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
|
||||
}
|
||||
|
||||
@@ -13,10 +13,13 @@
|
||||
|
||||
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||
#define A_TYPE float
|
||||
#define A_TYPE32 float
|
||||
#elif LOAD_VEC_A == 4
|
||||
#define A_TYPE vec4
|
||||
#define A_TYPE32 vec4
|
||||
#elif LOAD_VEC_A == 8
|
||||
#define A_TYPE mat2x4
|
||||
#define A_TYPE32 mat2x4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -26,10 +29,13 @@
|
||||
|
||||
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||
#define A_TYPE float16_t
|
||||
#define A_TYPE32 float
|
||||
#elif LOAD_VEC_A == 4
|
||||
#define A_TYPE f16vec4
|
||||
#define A_TYPE32 vec4
|
||||
#elif LOAD_VEC_A == 8
|
||||
#define A_TYPE f16mat2x4
|
||||
#define A_TYPE32 mat2x4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -1424,6 +1430,11 @@ float bf16_to_fp32(uint32_t u)
|
||||
return uintBitsToFloat(u << 16);
|
||||
}
|
||||
|
||||
vec4 bf16_to_fp32(uvec4 u)
|
||||
{
|
||||
return vec4(bf16_to_fp32(u.x), bf16_to_fp32(u.y), bf16_to_fp32(u.z), bf16_to_fp32(u.w));
|
||||
}
|
||||
|
||||
float e8m0_to_fp32(uint8_t x) {
|
||||
uint32_t bits;
|
||||
|
||||
|
||||
@@ -364,11 +364,11 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
};
|
||||
|
||||
// Shaders with f16 B_TYPE
|
||||
string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
// bf16
|
||||
{
|
||||
@@ -384,8 +384,8 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
if (!(coopmat || coopmat2))
|
||||
#endif
|
||||
{
|
||||
string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_bf16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"B_TYPE32", "vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_bf16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -408,13 +408,13 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
|
||||
// don't generate f32 variants for coopmat2
|
||||
if (!coopmat2) {
|
||||
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
|
||||
if (tname != "f16" && tname != "f32") {
|
||||
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
@@ -560,10 +560,14 @@ void process_shaders() {
|
||||
string_to_spv("cpy_f16_f32", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("cpy_f32_bf16","copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
|
||||
string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("contig_cpy_f32_i32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
|
||||
string_to_spv("contig_cpy_i32_f32", "contig_copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("contig_cpy_f16_f32", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("contig_cpy_f32_bf16","contig_copy.comp",{{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
|
||||
string_to_spv("cpy_f32_i32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
|
||||
string_to_spv("cpy_i32_f32", "copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});
|
||||
|
||||
for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
|
||||
string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
@@ -713,6 +717,10 @@ void process_shaders() {
|
||||
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
||||
string_to_spv("im2col_f32_f16_rte", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}));
|
||||
|
||||
string_to_spv("im2col_3d_f32", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("im2col_3d_f32_f16", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
||||
string_to_spv("im2col_3d_f32_f16_rte", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}));
|
||||
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
@@ -665,6 +665,7 @@ static ggml_backend_i ggml_backend_webgpu_i = {
|
||||
/* .graph_compute = */ ggml_backend_webgpu_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
/* End GGML Backend Interface */
|
||||
@@ -1154,17 +1155,15 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
||||
webgpu_context ctx = reg_ctx->webgpu_ctx;
|
||||
|
||||
wgpu::RequestAdapterOptions options = {};
|
||||
auto callback =
|
||||
[](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message, void * userdata) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
*static_cast<wgpu::Adapter *>(userdata) = std::move(adapter);
|
||||
};
|
||||
void * userdata = &ctx->adapter;
|
||||
ctx->instance.WaitAny(
|
||||
ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous, callback, userdata), UINT64_MAX);
|
||||
ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
ctx->adapter = std::move(adapter);
|
||||
}), UINT64_MAX);
|
||||
GGML_ASSERT(ctx->adapter != nullptr);
|
||||
|
||||
ctx->adapter.GetLimits(&ctx->limits);
|
||||
|
||||
@@ -586,6 +586,7 @@ static ggml_backend_i ggml_backend_zdnn_i = {
|
||||
/* .graph_compute = */ ggml_backend_zdnn_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .optimize_graph = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_zdnn_guid(void) {
|
||||
|
||||
@@ -3623,6 +3623,7 @@ struct ggml_tensor * ggml_get_rows(
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b) {
|
||||
GGML_ASSERT(a->ne[2] == b->ne[1]);
|
||||
GGML_ASSERT(a->ne[3] == b->ne[2]);
|
||||
GGML_ASSERT(b->ne[3] == 1);
|
||||
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
||||
|
||||
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 16 KiB |
@@ -0,0 +1,87 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<svg
|
||||
id="Layer_1"
|
||||
version="1.1"
|
||||
viewBox="0 0 250 250"
|
||||
sodipodi:docname="llama-icon.svg"
|
||||
width="250"
|
||||
height="250"
|
||||
inkscape:version="1.4.2 (ebf0e940d0, 2025-05-08)"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:svg="http://www.w3.org/2000/svg">
|
||||
<sodipodi:namedview
|
||||
id="namedview7"
|
||||
pagecolor="#505050"
|
||||
bordercolor="#ffffff"
|
||||
borderopacity="1"
|
||||
inkscape:showpageshadow="0"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pagecheckerboard="1"
|
||||
inkscape:deskcolor="#505050"
|
||||
inkscape:zoom="2.48"
|
||||
inkscape:cx="146.57258"
|
||||
inkscape:cy="189.91936"
|
||||
inkscape:window-width="3440"
|
||||
inkscape:window-height="1440"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="0"
|
||||
inkscape:window-maximized="1"
|
||||
inkscape:current-layer="g7" />
|
||||
<!-- Generator: Adobe Illustrator 29.3.1, SVG Export Plug-In . SVG Version: 2.1.0 Build 151) -->
|
||||
<defs
|
||||
id="defs1">
|
||||
<style
|
||||
id="style1">
|
||||
.st0 {
|
||||
fill: #ff8236;
|
||||
}
|
||||
|
||||
.st1 {
|
||||
fill: #fff;
|
||||
}
|
||||
|
||||
.st2 {
|
||||
fill: #1b1f20;
|
||||
}
|
||||
</style>
|
||||
</defs>
|
||||
<rect
|
||||
class="st2"
|
||||
width="250"
|
||||
height="250"
|
||||
rx="8.6857386"
|
||||
ry="8.7008333"
|
||||
id="rect1"
|
||||
x="0"
|
||||
y="0"
|
||||
style="stroke-width:0.266071" />
|
||||
<g
|
||||
id="g7">
|
||||
<g
|
||||
id="g6"
|
||||
transform="translate(-995.51066,-129.70875)">
|
||||
<path
|
||||
class="st0"
|
||||
d="m 1163.3,226.8 -13.5,24 c -17.8,-13.7 -44.2,-15.7 -62,-1 -28.7,23.7 -26.7,78.5 18,78.8 12.5,0 23.1,-5.9 34.5,-9.8 l 6,23.9 c -10.1,4.7 -20.4,9.5 -31.5,11 -101.2,13.8 -95.4,-132.3 -3.9,-139.9 19.2,-1.6 36.1,3.4 52.5,13 z"
|
||||
id="path4" />
|
||||
<path
|
||||
class="st0"
|
||||
d="m 1093.4,203.8 c -15.4,4.6 -29.7,13.1 -40.5,25 -2,-24.2 3.4,-73.1 30.3,-82.7 4,-1.4 17.7,-4.9 17.3,2.2 -0.4,7.1 -9.9,19.3 -12.2,25.9 -4,11.6 -0.3,19.6 5.2,29.7 z"
|
||||
id="path5" />
|
||||
<polygon
|
||||
class="st0"
|
||||
points="1131.4,307.8 1116.4,307.8 1116.4,290.8 1099.4,290.8 1099.4,276.8 1114.9,276.8 1116.4,275.3 1116.4,258.8 1131.4,258.8 1131.4,276.8 1147.4,276.8 1147.4,290.8 1131.4,290.8 "
|
||||
id="polygon5" />
|
||||
<polygon
|
||||
class="st0"
|
||||
points="1186.4,290.8 1186.4,307.8 1171.4,307.8 1171.4,290.8 1155.4,290.8 1155.4,276.8 1171.4,276.8 1171.4,258.8 1186.4,258.8 1186.4,275.3 1187.9,276.8 1203.4,276.8 1203.4,290.8 "
|
||||
id="polygon6" />
|
||||
<path
|
||||
class="st0"
|
||||
d="m 1142.3,156.9 c 2,3 -9.3,15.9 -11.1,19.2 -5.2,9.8 -1.7,15.4 2.2,24.7 -11.3,-1.7 -21.8,-0.3 -33,1 2.5,-21.5 14.6,-52.8 41.9,-44.9 z"
|
||||
id="path6" />
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 2.7 KiB |
@@ -22,4 +22,5 @@ These templates can be updated with the following commands:
|
||||
./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
|
||||
./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
|
||||
./scripts/get_chat_template.py zai-org/GLM-4.5 > models/templates/zai-org-GLM-4.5.jinja
|
||||
./scripts/get_chat_template.py deepseek-ai/DeepSeek-V3.1 > models/templates/deepseek-ai-DeepSeek-V3.1.jinja
|
||||
```
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if not thinking is defined %}{% set thinking = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, system_prompt='', is_first_sp=true, is_last_user=false) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '
|
||||
|
||||
' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{%- set ns.is_first = false -%}{%- set ns.is_last_user = true -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}{%- if ns.is_last_user %}{{'<|Assistant|></think>'}}{%- endif %}{%- set ns.is_last_user = false -%}{%- set ns.is_first = false %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|tool▁calls▁begin|><|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- else %}{{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'<|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %}{%- if ns.is_last_user %}{{'<|Assistant|>'}}{%- if message['prefix'] is defined and message['prefix'] and thinking %}{{'<think>'}} {%- else %}{{'</think>'}}{%- endif %}{%- endif %}{%- set ns.is_last_user = false -%}{%- if ns.is_tool %}{{message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{%- set content = message['content'] -%}{%- if '</think>' in content %}{%- set content = content.split('</think>', 1)[1] -%}{%- endif %}{{content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_last_user = false -%}{%- set ns.is_tool = true -%}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endfor -%}{%- if add_generation_prompt and ns.is_last_user and not ns.is_tool %}{{'<|Assistant|>'}}{%- if not thinking %}{{'</think>'}}{%- else %}{{'<think>'}}{%- endif %}{% endif %}
|
||||
@@ -2,7 +2,9 @@ mistral-common>=1.8.3
|
||||
|
||||
-r ./requirements-convert_legacy_llama.txt
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch~=2.4.0; platform_machine != "s390x"
|
||||
|
||||
## Embedding Gemma requires PyTorch 2.6.0 or later
|
||||
torch~=2.6.0; platform_machine != "s390x"
|
||||
|
||||
# torch s390x packages can only be found from nightly builds
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly
|
||||
|
||||
@@ -1,5 +1,14 @@
|
||||
numpy~=1.26.4
|
||||
sentencepiece~=0.2.0
|
||||
transformers>=4.45.1,<5.0.0
|
||||
|
||||
# Embedding Gemma is currently a preview release:
|
||||
# https://github.com/huggingface/transformers/releases/tag/v4.56.0-Embedding-Gemma-preview
|
||||
|
||||
# The version is needed to be able to convert Embedding Gemma models to GGUF format:
|
||||
git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview
|
||||
|
||||
# Once Embedding Gemma is officially released, we can switch to:
|
||||
#transformers>=4.57.1,<5.0.0
|
||||
|
||||
gguf>=0.1.0
|
||||
protobuf>=4.21.0,<5.0.0
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
aiohttp~=3.9.3
|
||||
pytest~=8.3.3
|
||||
huggingface_hub~=0.23.2
|
||||
huggingface_hub>=0.34.0,<1.0
|
||||
matplotlib~=3.10.0
|
||||
numpy~=1.26.4
|
||||
openai~=1.55.3
|
||||
|
||||
@@ -53,7 +53,7 @@ import typer
|
||||
sys.path.insert(0, Path(__file__).parent.parent.as_posix())
|
||||
if True:
|
||||
from tools.server.tests.utils import ServerProcess
|
||||
from tools.server.tests.unit.test_tool_call import TIMEOUT_SERVER_START, do_test_calc_result, do_test_hello_world, do_test_weather
|
||||
from tools.server.tests.unit.test_tool_call import do_test_calc_result, do_test_hello_world, do_test_weather
|
||||
|
||||
|
||||
@contextmanager
|
||||
@@ -335,7 +335,7 @@ def run(
|
||||
# server.debug = True
|
||||
|
||||
with scoped_server(server):
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=15 * 60)
|
||||
for ignore_chat_grammar in [False]:
|
||||
run(
|
||||
server,
|
||||
|
||||
@@ -285,8 +285,8 @@ llama_context::llama_context(
|
||||
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
// avoid reserving graphs with zero outputs
|
||||
n_outputs = 1;
|
||||
// avoid reserving graphs with zero outputs - assume one output per sequence
|
||||
n_outputs = n_seqs;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
|
||||
+2
-1
@@ -1431,7 +1431,8 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
|
||||
// [TAG_NO_CACHE_PAD]
|
||||
// TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
|
||||
assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
|
||||
// but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
|
||||
//assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
|
||||
|
||||
ggml_tensor * q = q_cur;
|
||||
ggml_tensor * k = k_cur;
|
||||
|
||||
+56
-16
@@ -1018,16 +1018,33 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm
|
||||
|
||||
const int32_t ikv = map_layer_ids.at(il);
|
||||
|
||||
auto * k = layers[ikv].k;
|
||||
ggml_tensor * k = layers[ikv].k;
|
||||
|
||||
const int64_t n_tokens = k_cur->ne[2];
|
||||
const int64_t n_embd_head = k_cur->ne[0];
|
||||
const int64_t n_head = k_cur->ne[1];
|
||||
const int64_t n_tokens = k_cur->ne[2];
|
||||
|
||||
k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
|
||||
const int64_t n_embd_gqa = n_embd_head*n_head;
|
||||
|
||||
if (k->ne[2] > 1) {
|
||||
k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
|
||||
// we can merge dims 0 and 1
|
||||
// TODO: add ggml helper function for this?
|
||||
GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);
|
||||
|
||||
k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);
|
||||
|
||||
const int64_t n_stream = k->ne[2];
|
||||
|
||||
if (n_stream > 1) {
|
||||
const int64_t kv_size = get_size();
|
||||
|
||||
assert(n_embd_gqa == k->ne[0]);
|
||||
assert(kv_size == k->ne[1]);
|
||||
|
||||
// merge the buffer across all streams because the idxs are global
|
||||
k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
|
||||
}
|
||||
|
||||
// store the current K values into the cache
|
||||
return ggml_set_rows(ctx, k, k_cur, k_idxs);
|
||||
}
|
||||
|
||||
@@ -1038,28 +1055,51 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
|
||||
|
||||
auto * v = layers[ikv].v;
|
||||
|
||||
const int64_t n_embd_v_gqa = v_cur->ne[0]*v_cur->ne[1];
|
||||
const int64_t n_tokens = v_cur->ne[2];
|
||||
const int64_t n_embd_head = v_cur->ne[0];
|
||||
const int64_t n_head = v_cur->ne[1];
|
||||
const int64_t n_tokens = v_cur->ne[2];
|
||||
|
||||
v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
|
||||
const int64_t n_embd_gqa = n_embd_head*n_head;
|
||||
|
||||
// we can merge dims 0 and 1
|
||||
GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
|
||||
|
||||
const int64_t n_stream = v->ne[2];
|
||||
|
||||
// take this branch when FA is enabled (the V cache is not transposed)
|
||||
if (!v_trans) {
|
||||
if (v->ne[2] > 1) {
|
||||
v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
|
||||
v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
|
||||
|
||||
if (n_stream > 1) {
|
||||
const int64_t kv_size = get_size();
|
||||
|
||||
assert(n_embd_gqa == v->ne[0]);
|
||||
assert(kv_size == v->ne[1]);
|
||||
|
||||
// merge the buffer across all streams because the idxs are global
|
||||
v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
|
||||
}
|
||||
|
||||
return ggml_set_rows(ctx, v, v_cur, v_idxs);
|
||||
}
|
||||
|
||||
// [TAG_V_CACHE_VARIABLE]
|
||||
if (n_embd_v_gqa < v->ne[0]) {
|
||||
v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
|
||||
if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
|
||||
// we can merge dims 0, 1 and 2
|
||||
v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
|
||||
} else {
|
||||
// otherwise -> make a copy to get contiguous data
|
||||
v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
|
||||
}
|
||||
|
||||
// the row becomes a single element
|
||||
ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
|
||||
// [TAG_V_CACHE_VARIABLE]
|
||||
if (n_embd_gqa < v->ne[0]) {
|
||||
v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
|
||||
}
|
||||
|
||||
v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
|
||||
// in this branch the v_idxs are constructed in such a way that each row is a single head element
|
||||
ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));
|
||||
|
||||
v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));
|
||||
|
||||
return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
|
||||
}
|
||||
|
||||
@@ -317,9 +317,17 @@ public:
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
||||
|
||||
// store k_cur and v_cur in the cache based on the provided head location
|
||||
// note: the heads in k_cur and v_cur should be layed out contiguously in memory
|
||||
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
|
||||
// - k_idxs [n_tokens]
|
||||
// - v_cur [n_embd_head_v, n_head_v, n_tokens]
|
||||
// - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
|
||||
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
|
||||
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
|
||||
|
||||
// create destination indices for each head of the current batch for where it would be written in the KV cache
|
||||
// the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
|
||||
// helps understand the implementation logic of cpy_k and cpy_v
|
||||
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||
|
||||
|
||||
+36
-92
@@ -6927,9 +6927,7 @@ struct llm_build_falcon : public llm_graph_context {
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
// using mode = 2 for neox mode
|
||||
Qcur = ggml_rope_ext(
|
||||
@@ -7207,9 +7205,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
||||
|
||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
@@ -7329,13 +7325,9 @@ struct llm_build_starcoder : public llm_graph_context {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
@@ -7551,14 +7543,16 @@ struct llm_build_bert : public llm_graph_context {
|
||||
cb(cur, "bqkv", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
} else {
|
||||
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
||||
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
||||
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
}
|
||||
|
||||
@@ -7569,8 +7563,6 @@ struct llm_build_bert : public llm_graph_context {
|
||||
LLM_NORM, il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
} else {
|
||||
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
}
|
||||
|
||||
if (model.layers[il].attn_k_norm) {
|
||||
@@ -7580,8 +7572,6 @@ struct llm_build_bert : public llm_graph_context {
|
||||
LLM_NORM, il);
|
||||
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
} else {
|
||||
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
}
|
||||
|
||||
// RoPE
|
||||
@@ -7727,9 +7717,7 @@ struct llm_build_neo_bert : public llm_graph_context {
|
||||
|
||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
// RoPE
|
||||
Qcur = ggml_rope_ext(
|
||||
@@ -7836,13 +7824,9 @@ struct llm_build_bloom : public llm_graph_context {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
@@ -7958,13 +7942,9 @@ struct llm_build_mpt : public llm_graph_context {
|
||||
cb(cur, "wqkv_clamped", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
// Q/K Layernorm
|
||||
if (model.layers[il].attn_q_norm) {
|
||||
@@ -7972,26 +7952,16 @@ struct llm_build_mpt : public llm_graph_context {
|
||||
model.layers[il].attn_q_norm,
|
||||
model.layers[il].attn_q_norm_b,
|
||||
LLM_NORM, il);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = build_norm(Kcur,
|
||||
model.layers[il].attn_k_norm,
|
||||
model.layers[il].attn_k_norm_b,
|
||||
LLM_NORM, il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
} else {
|
||||
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
@@ -8240,11 +8210,9 @@ struct llm_build_qwen : public llm_graph_context {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));
|
||||
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));
|
||||
|
||||
// using mode = 2 for neox mode
|
||||
Qcur = ggml_rope_ext(
|
||||
@@ -9219,21 +9187,17 @@ struct llm_build_phi2 : public llm_graph_context {
|
||||
|
||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
} else {
|
||||
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
||||
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
||||
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
@@ -9357,21 +9321,17 @@ struct llm_build_phi3 : public llm_graph_context {
|
||||
|
||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
|
||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
|
||||
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
||||
} else {
|
||||
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
||||
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
||||
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
@@ -9621,18 +9581,14 @@ struct llm_build_gpt2 : public llm_graph_context {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
@@ -9727,9 +9683,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
@@ -12601,9 +12555,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
@@ -13736,18 +13688,14 @@ struct llm_build_jais : public llm_graph_context {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
||||
@@ -13859,8 +13807,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
||||
}
|
||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
}
|
||||
|
||||
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
|
||||
@@ -13993,8 +13940,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
||||
}
|
||||
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
||||
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
||||
Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
@@ -17293,16 +17239,14 @@ private:
|
||||
const int64_t k_offset = n_embd_head_q * n_head;
|
||||
const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
|
||||
ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
|
||||
+67
-25
@@ -300,6 +300,7 @@ static std::string var_to_str(ggml_scale_mode mode) {
|
||||
#define VARS_TO_STR13(a, b, c, d, e, f, g, h, i, j, k, l, m) VAR_TO_STR(a) + "," + VARS_TO_STR12(b, c, d, e, f, g, h, i, j, k, l, m)
|
||||
#define VARS_TO_STR14(a, b, c, d, e, f, g, h, i, j, k, l, m, n) VAR_TO_STR(a) + "," + VARS_TO_STR13(b, c, d, e, f, g, h, i, j, k, l, m, n)
|
||||
#define VARS_TO_STR15(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) VAR_TO_STR(a) + "," + VARS_TO_STR14(b, c, d, e, f, g, h, i, j, k, l, m, n, o)
|
||||
#define VARS_TO_STR16(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) VAR_TO_STR(a) + "," + VARS_TO_STR15(b, c, d, e, f, g, h, i, j, k, l, m, n, o, p)
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
static bool inline _isinf(float f) {
|
||||
@@ -1956,24 +1957,25 @@ struct test_get_rows : public test_case {
|
||||
const int n; // cols
|
||||
const int m; // rows
|
||||
const int r; // rows to get
|
||||
const int b; // batch size
|
||||
const int be1; // batch size
|
||||
const int be2; // batch size
|
||||
const bool v; // view (non-contiguous src1)
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR6(type, n, m, r, b, v);
|
||||
return VARS_TO_STR7(type, n, m, r, be1, be2, v);
|
||||
}
|
||||
|
||||
test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
|
||||
: type(type), n(n), m(m), r(r), b(b), v(v) {}
|
||||
test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int be1 = 1, int be2 = 1, bool v = false)
|
||||
: type(type), n(n), m(m), r(r), be1(be1), be2(be2), v(v) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
|
||||
ggml_tensor * in = ggml_new_tensor_4d(ctx, type, n, m, be1, be2);
|
||||
ggml_set_name(in, "in");
|
||||
|
||||
ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
|
||||
ggml_tensor * rows = ggml_new_tensor_3d(ctx, GGML_TYPE_I32, r, be1, be2);
|
||||
ggml_set_name(rows, "rows");
|
||||
if (v) {
|
||||
rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
|
||||
rows = ggml_view_3d(ctx, rows, r/2, be1, be2, rows->nb[1], rows->nb[2], 0);
|
||||
ggml_set_name(rows, "view_of_rows");
|
||||
}
|
||||
|
||||
@@ -1994,11 +1996,11 @@ struct test_get_rows : public test_case {
|
||||
if (t->type == GGML_TYPE_I32) {
|
||||
if (ggml_is_view_op(t->op)) { continue; }
|
||||
// rows
|
||||
std::vector<int> data(r*b);
|
||||
for (int i = 0; i < r*b; i++) {
|
||||
std::vector<int> data(r*be1*be2);
|
||||
for (int i = 0; i < r*be1*be2; i++) {
|
||||
data[i] = rand() % m;
|
||||
}
|
||||
ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
|
||||
ggml_backend_tensor_set(t, data.data(), 0, r * be1 * be2 * sizeof(int));
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
@@ -2455,6 +2457,13 @@ struct test_cpy : public test_case {
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
// test extended range of values to check if casting between f32 and i32 is consistent
|
||||
init_tensor_uniform(t, -150.f, 150.f);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_CONT
|
||||
@@ -4047,9 +4056,10 @@ struct test_im2col_3d : public test_case {
|
||||
const int d2;
|
||||
|
||||
const int64_t IC;
|
||||
const bool v;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR15(type_input, type_kernel, dst_type, ne_input, ne_kernel, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2);
|
||||
return VARS_TO_STR16(type_input, type_kernel, dst_type, ne_input, ne_kernel, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, v);
|
||||
}
|
||||
|
||||
test_im2col_3d(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
|
||||
@@ -4058,14 +4068,20 @@ struct test_im2col_3d : public test_case {
|
||||
int64_t IC = 3,
|
||||
int s0 = 1, int s1 = 1, int s2 = 1,
|
||||
int p0 = 1, int p1 = 1, int p2 = 1,
|
||||
int d0 = 1, int d1 = 1, int d2 = 1)
|
||||
: type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), s2(s2), p0(p0), p1(p1), p2(p2), d0(d0), d1(d1), d2(d2), IC(IC) {}
|
||||
int d0 = 1, int d1 = 1, int d2 = 1,
|
||||
bool v = false)
|
||||
: type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), s2(s2), p0(p0), p1(p1), p2(p2), d0(d0), d1(d1), d2(d2), IC(IC), v(v) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
||||
ggml_set_param(input);
|
||||
ggml_set_name(input, "input");
|
||||
|
||||
if (v) {
|
||||
input = ggml_view_4d(ctx, input, ne_input[0] - 2, ne_input[1] - 2, ne_input[2] - 2, ne_input[3] - 2, input->nb[1], input->nb[2], input->nb[3], 0);
|
||||
ggml_set_name(input, "view_of_input");
|
||||
}
|
||||
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
|
||||
ggml_set_name(kernel, "kernel");
|
||||
|
||||
@@ -4704,21 +4720,28 @@ struct test_pad_ext : public test_case {
|
||||
const int rp2;
|
||||
const int lp3;
|
||||
const int rp3;
|
||||
const bool v;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR10(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
|
||||
return VARS_TO_STR11(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v);
|
||||
}
|
||||
|
||||
test_pad_ext(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne_a = {512, 512, 3, 1},
|
||||
int lp0 = 1, int rp0 = 1, int lp1 = 1, int rp1 = 1,
|
||||
int lp2 = 1, int rp2 = 1, int lp3 = 1, int rp3 = 1)
|
||||
: type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3) {}
|
||||
int lp2 = 1, int rp2 = 1, int lp3 = 1, int rp3 = 1,
|
||||
bool v = false)
|
||||
: type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3), v(v) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
if (v) {
|
||||
a = ggml_view_4d(ctx, a, (a->ne[0] + 1) / 2, (a->ne[1] + 1) / 2, (a->ne[2] + 1) / 2, (a->ne[3] + 1) / 2, a->nb[1], a->nb[2], a->nb[3], 0);
|
||||
ggml_set_name(a, "view of a");
|
||||
}
|
||||
|
||||
ggml_tensor * out = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
@@ -5605,17 +5628,23 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
|
||||
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_Q4_0}) {
|
||||
test_cases.emplace_back(new test_get_rows(type, 300*256, 5, 4, 1, 2, false));
|
||||
test_cases.emplace_back(new test_get_rows(type, 256, 80000, 70000, 2, 1, false));
|
||||
test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, 700, 100, false));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, 1, false));
|
||||
for (ggml_type type : all_types) {
|
||||
for (int b : {1, 7}) {
|
||||
for (bool v : {false, true}) {
|
||||
test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
|
||||
test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, 1, v));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int b : {1, 7}) {
|
||||
for (bool v : {false, true}) {
|
||||
test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
|
||||
test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, 1, v));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5722,9 +5751,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
for (int d0 : {1, 3}) {
|
||||
for (int d1 : {1, 3}) {
|
||||
for (int d2 : {1, 3}) {
|
||||
test_cases.emplace_back(new test_im2col_3d(
|
||||
GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 10, 3}, {3, 3, 3, 3},
|
||||
3, s0, s1, s2, p0, p1, p2, d0, d1, d2));
|
||||
for (int IC : {1, 3}) {
|
||||
for (bool v : {false, true}) {
|
||||
test_cases.emplace_back(new test_im2col_3d(
|
||||
GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 10, 3}, {3, 3, 3, 3},
|
||||
IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, v));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -5981,6 +6014,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
|
||||
}
|
||||
}
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3}));
|
||||
|
||||
test_cases.emplace_back(new test_cont());
|
||||
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
|
||||
@@ -6224,7 +6261,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
for (int n_mats : {4, 8}) {
|
||||
for (int n_used : {1, 2, 4}) {
|
||||
for (bool b : {false, true}) {
|
||||
for (int n : {1, 32, 129}) {
|
||||
for (int n : {1, 4, 5, 32, 129}) {
|
||||
int m = 512;
|
||||
int k = 256;
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
||||
@@ -6459,8 +6496,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_timestep_embedding());
|
||||
test_cases.emplace_back(new test_leaky_relu());
|
||||
|
||||
for (int hsk : { 40, 64, 80, 128, 192, 256, 576 }) {
|
||||
for (int hsv : { 40, 64, 80, 128, 192, 256, 512 }) {
|
||||
for (bool v : {false, true}) {
|
||||
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v));
|
||||
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {11, 22, 33, 44}, 1, 2, 3, 4, 5, 6, 7, 8, v));
|
||||
}
|
||||
|
||||
for (int hsk : { 40, 64, 80, 96, 128, 192, 256, 576 }) {
|
||||
for (int hsv : { 40, 64, 80, 96, 128, 192, 256, 512 }) {
|
||||
if (hsk != 192 && hsk != 576 && hsk != hsv) continue;
|
||||
if (hsk == 192 && (hsv != 128 && hsv != 192)) continue;
|
||||
if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA
|
||||
|
||||
+192
-1
@@ -15,14 +15,20 @@
|
||||
#include "regex-partial.h"
|
||||
|
||||
template <class T>
|
||||
static void assert_equals(const T & expected, const T & actual) {
|
||||
static void assert_equals(const std::string_view label, const T & expected, const T & actual) {
|
||||
if (expected != actual) {
|
||||
std::cerr << label << std::endl;
|
||||
std::cerr << "Expected: " << expected << std::endl;
|
||||
std::cerr << "Actual: " << actual << std::endl;
|
||||
std::cerr << std::flush;
|
||||
throw std::runtime_error("Test failed");
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static void assert_equals(const T & expected, const T & actual) {
|
||||
assert_equals("", expected, actual);
|
||||
}
|
||||
static void assert_equals(const char * expected, const std::string & actual) {
|
||||
return assert_equals<std::string>(expected, actual);
|
||||
}
|
||||
@@ -46,6 +52,7 @@ static void assert_throws(const std::function<void()> & fn, const std::string &
|
||||
}
|
||||
|
||||
static void test_reasoning() {
|
||||
//common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);
|
||||
{
|
||||
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
||||
@@ -99,6 +106,36 @@ static void test_reasoning() {
|
||||
assert_equals("<think>Cogito</think>", builder.result().content);
|
||||
assert_equals("Ergo sum", builder.consume_rest());
|
||||
}
|
||||
// Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("deepseek_v3_1_reasoning_format_deepseek");
|
||||
common_chat_msg_parser builder("REASONING</think>ok", /* is_partial= */ false, syntax);
|
||||
assert_equals(variant, true, builder.try_parse_reasoning("<think>", "</think>"));
|
||||
assert_equals(variant, std::string("REASONING"), builder.result().reasoning_content);
|
||||
assert_equals(variant, std::string("ok"), builder.consume_rest());
|
||||
}
|
||||
// Test DeepSeek V3.1 parsing - reasoning_format none - reasoning content followed by "</think>" and then regular content
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("deepseek_v3_1_reasoning_format_none");
|
||||
const std::string input = "REASONING</think>ok";
|
||||
auto msg = common_chat_parse(input, false, syntax);
|
||||
assert_equals(variant, std::string("REASONING</think>ok"), msg.content);
|
||||
assert_equals(variant, std::string(""), msg.reasoning_content);
|
||||
}
|
||||
}
|
||||
|
||||
static void test_regex() {
|
||||
@@ -186,6 +223,159 @@ static void test(const std::string & input, bool is_partial, const std::vector<s
|
||||
assert_equals(is_partial, js->is_partial);
|
||||
assert_equals(expected, args_paths.size() == 1 && args_paths[0].empty() ? js->value.get<std::string>() : js->value.dump());
|
||||
}
|
||||
|
||||
static void test_deepseek_v3_1_tool_calls() {
|
||||
//common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);
|
||||
// variant: happy path for when it works as the model card says it should
|
||||
const std::string variant("simple");
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string input = "<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>";
|
||||
auto msg = common_chat_parse(input, false, syntax);
|
||||
assert_equals<std::size_t>(variant, 1, msg.tool_calls.size());
|
||||
assert_equals(variant, std::string("get_time"), msg.tool_calls[0].name);
|
||||
// JSON arguments are dumped without spaces
|
||||
assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), msg.tool_calls[0].arguments);
|
||||
assert_equals(variant, std::string(""), msg.content);
|
||||
assert_equals(variant, std::string(""), msg.reasoning_content);
|
||||
|
||||
// variant: simple + thinking open
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("simple_thinking");
|
||||
const std::string in = "REASONING</think><|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>";
|
||||
auto m = common_chat_parse(in, false, syntax);
|
||||
assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
|
||||
assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
|
||||
assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments);
|
||||
assert_equals(variant, std::string(""), m.content);
|
||||
assert_equals(variant, std::string("REASONING"), m.reasoning_content);
|
||||
}
|
||||
// variant: simple + multiple tool calls
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("simple_multiple_tool_calls");
|
||||
const std::string in = "CONTENT<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁calls▁end|>";
|
||||
auto m = common_chat_parse(in, false, syntax);
|
||||
assert_equals<std::size_t>(variant, 2, m.tool_calls.size());
|
||||
assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
|
||||
assert_equals(variant, std::string("{\"city\":\"Paris\"}"), m.tool_calls[0].arguments);
|
||||
assert_equals(variant, std::string("get_weather"), m.tool_calls[1].name);
|
||||
assert_equals(variant, std::string("{\"city\":\"Paris\"}"), m.tool_calls[1].arguments);
|
||||
assert_equals(variant, std::string("CONTENT"), m.content);
|
||||
assert_equals(variant, std::string(""), m.reasoning_content);
|
||||
}
|
||||
|
||||
|
||||
// variant: thinking forced open + tool call in reasoning content
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("thinking_forced_open_tool_call_in_reasoning");
|
||||
const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING</think><|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>";
|
||||
auto m = common_chat_parse(in, false, syntax);
|
||||
assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
|
||||
assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
|
||||
assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments);
|
||||
assert_equals(variant, std::string(""), m.content);
|
||||
assert_equals(variant, std::string("REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING"), m.reasoning_content);
|
||||
}
|
||||
|
||||
// variant: thinking forced open + tool call in reasoning content + no closing think + not partial
|
||||
// This is a bit of a fine tuning issue on the model's part IMO. It really should not be attempting
|
||||
// to make tool calls in reasoning content according to the model card, but it does sometimes, so
|
||||
// add the reasoning content as regular content and parse the tool calls.
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_not_partial");
|
||||
const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>";
|
||||
auto m = common_chat_parse(in, false, syntax);
|
||||
assert_equals(variant, std::string("REASONING"), m.content);
|
||||
assert_equals(variant, std::string(""), m.reasoning_content);
|
||||
assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
|
||||
assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
|
||||
assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments);
|
||||
}
|
||||
|
||||
// variant: thinking forced open + tool call in reasoning content + no closing think + partial
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_partial");
|
||||
const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>";
|
||||
auto m = common_chat_parse(in, /* is_partial= */ true, syntax);
|
||||
assert_equals(variant, std::string("REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"), m.reasoning_content);
|
||||
assert_equals(variant, std::string(""), m.content);
|
||||
assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
|
||||
}
|
||||
|
||||
// variant: thinking not forced open + reasoning + regular content + no tool calls
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("thinking_forced_open_reasoning_regular_content_no_tool_calls");
|
||||
const std::string in = "REASONING</think>CONTENT";
|
||||
auto m = common_chat_parse(in, false, syntax);
|
||||
assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
|
||||
assert_equals(variant, std::string("CONTENT"), m.content);
|
||||
assert_equals(variant, std::string("REASONING"), m.reasoning_content);
|
||||
}
|
||||
// variant: thinking not forced open + missing reasoning + no tool calls
|
||||
{
|
||||
common_chat_syntax syntax = {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
/* .parse_tool_calls = */ true,
|
||||
};
|
||||
const std::string variant("thinking_not_forced_open_missing_reasoning_no_tool_calls");
|
||||
const std::string in = "CONTENT";
|
||||
auto m = common_chat_parse(in, false, syntax);
|
||||
assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
|
||||
assert_equals(variant, std::string("CONTENT"), m.content);
|
||||
assert_equals(variant, std::string(""), m.reasoning_content);
|
||||
}
|
||||
}
|
||||
|
||||
static void test_with_args(const std::string & input, const std::string & expected, bool parse_as_partial = true, bool is_partial = true) {
|
||||
common_chat_msg_parser builder(input, parse_as_partial, {});
|
||||
auto js = builder.try_consume_json_with_dumped_args({{"args"}}, {});
|
||||
@@ -347,6 +537,7 @@ int main() {
|
||||
test_json_with_dumped_args();
|
||||
test_reasoning();
|
||||
test_regex();
|
||||
test_deepseek_v3_1_tool_calls();
|
||||
std::cout << "All tests passed!\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
+136
-1
@@ -1757,7 +1757,6 @@ static void test_template_output_parsers() {
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_SEED_OSS}));
|
||||
}
|
||||
|
||||
{
|
||||
auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-Nano-v2.jinja");
|
||||
std::vector<std::string> end_tokens{ "<SPECIAL_12>" };
|
||||
@@ -1828,6 +1827,142 @@ static void test_template_output_parsers() {
|
||||
/* expect_grammar_triggered= */ true
|
||||
);
|
||||
}
|
||||
{
|
||||
auto tmpls = read_templates("models/templates/deepseek-ai-DeepSeek-V3.1.jinja");
|
||||
std::vector<std::string> end_tokens{ "<|end▁of▁sentence|>" };
|
||||
|
||||
for (const auto & inputs : { inputs_no_tools, inputs_tools }) {
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, params.format);
|
||||
assert_equals(true, params.thinking_forced_open);
|
||||
}
|
||||
|
||||
test_templates(tmpls.get(), end_tokens, message_assist, tools, "</think>Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
|
||||
test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "</think>Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
|
||||
assert_msg_equals(
|
||||
simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"),
|
||||
common_chat_parse(
|
||||
"I'm\nthinking</think>Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
}));
|
||||
// variant: thinking forced open, reasoning_format none
|
||||
assert_msg_equals(
|
||||
simple_assist_msg("REASONING</think>ok", ""),
|
||||
common_chat_parse(
|
||||
"REASONING</think>ok",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
}));
|
||||
// variant: happy path for when it works as the model card says it should
|
||||
assert_msg_equals(
|
||||
simple_assist_msg("", "", "get_time", "{\"city\":\"Tokyo\"}"),
|
||||
common_chat_parse(
|
||||
"<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
/* .parse_tool_calls = */ true,
|
||||
}));
|
||||
// variant: simple + thinking open
|
||||
assert_msg_equals(
|
||||
simple_assist_msg("", "REASONING", "get_time", "{\"city\":\"Tokyo\"}"),
|
||||
common_chat_parse(
|
||||
"REASONING</think><|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
}));
|
||||
// variant: simple + multiple tool calls
|
||||
common_chat_msg message_assist_multiple_calls;
|
||||
message_assist_multiple_calls.role = "assistant";
|
||||
message_assist_multiple_calls.content = "CONTENT";
|
||||
message_assist_multiple_calls.tool_calls.push_back({"get_time", "{\"city\":\"Paris\"}", ""});
|
||||
message_assist_multiple_calls.tool_calls.push_back({"get_weather", "{\"city\":\"Paris\"}", ""});
|
||||
assert_msg_equals(
|
||||
message_assist_multiple_calls,
|
||||
common_chat_parse(
|
||||
"CONTENT<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁calls▁end|>",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
/* .parse_tool_calls = */ true,
|
||||
}));
|
||||
// variant: thinking forced open + tool call in reasoning content
|
||||
assert_msg_equals(
|
||||
simple_assist_msg("", "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING", "get_time", "{\"city\":\"Tokyo\"}"),
|
||||
common_chat_parse(
|
||||
"REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING</think><|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
}));
|
||||
// variant: thinking forced open + tool call in reasoning content + no closing think + not partial
|
||||
// This is a bit of a fine tuning issue on the model's part IMO. It really should not be attempting
|
||||
// to make tool calls in reasoning content according to the model card, but it does sometimes, so
|
||||
// add the reasoning content as regular content and parse the tool calls.
|
||||
assert_msg_equals(
|
||||
simple_assist_msg("REASONING", "", "get_time", "{\"city\":\"Tokyo\"}"),
|
||||
common_chat_parse(
|
||||
"REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
}));
|
||||
// variant: thinking forced open + tool call in reasoning content + no closing think + partial
|
||||
assert_msg_equals(
|
||||
simple_assist_msg("", "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", "", ""),
|
||||
common_chat_parse(
|
||||
"REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>",
|
||||
/* is_partial= */ true,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ true,
|
||||
/* .parse_tool_calls = */ true,
|
||||
}));
|
||||
// variant: thinking not forced open + missing reasoning + no tool calls
|
||||
assert_msg_equals(
|
||||
simple_assist_msg("CONTENT", ""),
|
||||
common_chat_parse(
|
||||
"CONTENT",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* .reasoning_in_content = */ false,
|
||||
/* .thinking_forced_open = */ false,
|
||||
/* .parse_tool_calls = */ true,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
static void test_msg_diffs_compute() {
|
||||
|
||||
@@ -1209,6 +1209,51 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
)"""
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"allOf with enum schema",
|
||||
R"""({
|
||||
"allOf": [
|
||||
{"$ref": "#/definitions/foo"}
|
||||
],
|
||||
"definitions": {
|
||||
"foo": {
|
||||
"type": "string",
|
||||
"enum": ["a", "b"]
|
||||
}
|
||||
}
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("\"a\"" | "\"b\"") space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"allOf with multiple enum schemas",
|
||||
R"""({
|
||||
"allOf": [
|
||||
{"$ref": "#/definitions/foo"},
|
||||
{"$ref": "#/definitions/bar"}
|
||||
],
|
||||
"definitions": {
|
||||
"foo": {
|
||||
"type": "string",
|
||||
"enum": ["a", "b", "c"]
|
||||
},
|
||||
"bar": {
|
||||
"type": "string",
|
||||
"enum": ["b", "c", "d"]
|
||||
}
|
||||
}
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("\"b\"" | "\"c\"") space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"conflicting names",
|
||||
|
||||
@@ -421,10 +421,10 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
|
||||
if text1 == text2: # equal to TokenizerGroundtruth?
|
||||
return True
|
||||
# equal to source text?
|
||||
if tokenizer1.add_bos_token: # remove BOS
|
||||
if tokenizer1.add_bos_token and tokenizer1.bos_token and isinstance(tokenizer1.bos_token, str): # remove BOS
|
||||
if text2.startswith(tokenizer1.bos_token):
|
||||
text2 = text2[len(tokenizer1.bos_token):]
|
||||
if tokenizer1.add_eos_token: # remove EOS
|
||||
if tokenizer1.add_eos_token and tokenizer1.eos_token and isinstance(tokenizer1.eos_token, str): # remove EOS
|
||||
if text2.endswith(tokenizer1.eos_token):
|
||||
text2 = text2[:-len(tokenizer1.eos_token)]
|
||||
return text == text2
|
||||
|
||||
@@ -71,7 +71,7 @@ int main(int argc, char ** argv) {
|
||||
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
||||
|
||||
// decode in batches of ctx_params.n_batch tokens
|
||||
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
|
||||
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch, bool synchronize) {
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
|
||||
@@ -91,7 +91,9 @@ int main(int argc, char ** argv) {
|
||||
return false;
|
||||
}
|
||||
|
||||
llama_synchronize(ctx);
|
||||
if (synchronize) {
|
||||
llama_synchronize(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -103,7 +105,7 @@ int main(int argc, char ** argv) {
|
||||
common_batch_add(batch, get_token_rand(), i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -138,15 +140,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
llama_memory_clear(mem, false);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch, false)) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_synchronize(ctx);
|
||||
|
||||
const auto t_pp_end = ggml_time_us();
|
||||
|
||||
if (is_pp_shared) {
|
||||
@@ -158,7 +162,7 @@ int main(int argc, char ** argv) {
|
||||
// run one dummy token to apply the memory copy
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, get_token_rand(), pp + 0, { 0 }, true);
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -175,7 +179,7 @@ int main(int argc, char ** argv) {
|
||||
common_batch_add(batch, get_token_rand(), pp + i, { j }, true);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -23,7 +23,6 @@ import warnings
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
from torch.nn.init import _calculate_fan_in_and_fan_out
|
||||
|
||||
@@ -413,7 +412,8 @@ import re
|
||||
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
|
||||
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
|
||||
from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig
|
||||
|
||||
TEXT = "clip.text"
|
||||
VISION = "clip.vision"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
-r ../../requirements/requirements-convert_legacy_llama.txt
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
pillow~=11.3.0
|
||||
torch~=2.4.0
|
||||
torchvision~=0.19.1
|
||||
torch~=2.6.0
|
||||
torchvision~=0.21.0
|
||||
|
||||
@@ -512,6 +512,8 @@ These words will not be included in the completion, so make sure to add them to
|
||||
|
||||
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
|
||||
|
||||
`return_progress`: Include prompt processing progress in `stream` mode. The progress will be contained inside `prompt_progress` with 3 values: `total`, `cache` and `processed`. The overall progress is `processed/total`, while the actual timed progress is `(processed-cache)/(total-cache)`. Default: `false`
|
||||
|
||||
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
|
||||
|
||||
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name.
|
||||
@@ -1276,6 +1278,34 @@ curl http://localhost:8080/v1/chat/completions \
|
||||
|
||||
**See our [Function calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use.
|
||||
|
||||
*Timings and context usage*
|
||||
|
||||
The response contains a `timings` object, for example:
|
||||
|
||||
```js
|
||||
{
|
||||
"choices": [],
|
||||
"created": 1757141666,
|
||||
"id": "chatcmpl-ecQULm0WqPrftUqjPZO1CFYeDjGZNbDu",
|
||||
// ...
|
||||
"timings": {
|
||||
"cache_n": 236, // number of prompt tokens reused from cache
|
||||
"prompt_n": 1, // number of prompt tokens being processed
|
||||
"prompt_ms": 30.958,
|
||||
"prompt_per_token_ms": 30.958,
|
||||
"prompt_per_second": 32.301828283480845,
|
||||
"predicted_n": 35, // number of predicted tokens
|
||||
"predicted_ms": 661.064,
|
||||
"predicted_per_token_ms": 18.887542857142858,
|
||||
"predicted_per_second": 52.94494935437416
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This provides information on the performance of the server. It also allows calculating the current context usage.
|
||||
|
||||
The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`
|
||||
|
||||
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
|
||||
|
||||
This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
|
||||
|
||||
@@ -631,9 +631,10 @@ export class SchemaConverter {
|
||||
const required = new Set(schema.required || []);
|
||||
const properties = Object.entries(schema.properties ?? {});
|
||||
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, schema.additionalProperties));
|
||||
} else if ((schemaType === undefined || schemaType === 'object') && 'allOf' in schema) {
|
||||
} else if ((schemaType === undefined || schemaType === 'object' || schemaType === 'string') && 'allOf' in schema) {
|
||||
const required = new Set();
|
||||
const properties = [];
|
||||
const enumSets = [];
|
||||
const addComponent = (compSchema, isRequired) => {
|
||||
const ref = compSchema.$ref;
|
||||
if (ref !== undefined) {
|
||||
@@ -648,6 +649,10 @@ export class SchemaConverter {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ('enum' in compSchema) {
|
||||
enumSets.push(new Set(compSchema.enum || []));
|
||||
}
|
||||
};
|
||||
|
||||
for (const t of schema.allOf) {
|
||||
@@ -660,6 +665,14 @@ export class SchemaConverter {
|
||||
}
|
||||
}
|
||||
|
||||
if (enumSets.length > 0) {
|
||||
const enumIntersection = new Set([...enumSets[0]].filter(v => enumSets.every(s => s.has(v))));
|
||||
if (enumIntersection.size > 0) {
|
||||
const sortedEnums = [...enumIntersection].sort((a, b) => a.localeCompare(b));
|
||||
const rule = '(' + sortedEnums.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
|
||||
return this._addRule(ruleName, rule);
|
||||
}
|
||||
}
|
||||
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
|
||||
} else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
|
||||
const items = schema.items ?? schema.prefixItems;
|
||||
|
||||
+74
-17
@@ -110,9 +110,10 @@ static bool server_task_type_need_logits(server_task_type task_type) {
|
||||
}
|
||||
|
||||
struct slot_params {
|
||||
bool stream = true;
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
bool return_tokens = false;
|
||||
bool stream = true;
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
bool return_tokens = false;
|
||||
bool return_progress = false;
|
||||
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
||||
@@ -312,6 +313,7 @@ struct server_task {
|
||||
params.stream = json_value(data, "stream", false);
|
||||
params.cache_prompt = json_value(data, "cache_prompt", true);
|
||||
params.return_tokens = json_value(data, "return_tokens", false);
|
||||
params.return_progress = json_value(data, "return_progress", false);
|
||||
params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
|
||||
params.n_indent = json_value(data, "n_indent", defaults.n_indent);
|
||||
params.n_keep = json_value(data, "n_keep", defaults.n_keep);
|
||||
@@ -608,6 +610,8 @@ struct server_task {
|
||||
};
|
||||
|
||||
struct result_timings {
|
||||
int32_t cache_n = -1;
|
||||
|
||||
int32_t prompt_n = -1;
|
||||
double prompt_ms;
|
||||
double prompt_per_token_ms;
|
||||
@@ -624,6 +628,8 @@ struct result_timings {
|
||||
|
||||
json to_json() const {
|
||||
json base = {
|
||||
{"cache_n", cache_n},
|
||||
|
||||
{"prompt_n", prompt_n},
|
||||
{"prompt_ms", prompt_ms},
|
||||
{"prompt_per_token_ms", prompt_per_token_ms},
|
||||
@@ -644,6 +650,22 @@ struct result_timings {
|
||||
}
|
||||
};
|
||||
|
||||
struct result_prompt_progress {
|
||||
int32_t total = 0;
|
||||
int32_t cache = 0;
|
||||
int32_t processed = 0;
|
||||
int64_t time_ms = 0;
|
||||
|
||||
json to_json() const {
|
||||
return json {
|
||||
{"total", total},
|
||||
{"cache", cache},
|
||||
{"processed", processed},
|
||||
{"time_ms", time_ms},
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
struct server_task_result {
|
||||
int id = -1;
|
||||
int id_slot = -1;
|
||||
@@ -999,8 +1021,10 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
int32_t n_prompt_tokens;
|
||||
|
||||
bool post_sampling_probs;
|
||||
bool is_progress = false;
|
||||
completion_token_output prob_output;
|
||||
result_timings timings;
|
||||
result_prompt_progress progress;
|
||||
|
||||
// OAI-compat fields
|
||||
bool verbose = false;
|
||||
@@ -1045,6 +1069,9 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
if (timings.prompt_n > 0) {
|
||||
res.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
if (is_progress) {
|
||||
res.push_back({"prompt_progress", progress.to_json()});
|
||||
}
|
||||
if (!prob_output.probs.empty()) {
|
||||
res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
|
||||
}
|
||||
@@ -1082,6 +1109,9 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
if (timings.prompt_n >= 0) {
|
||||
res.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
if (is_progress) {
|
||||
res.push_back({"prompt_progress", progress.to_json()});
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
@@ -1109,7 +1139,7 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
});
|
||||
};
|
||||
// We have to send an initial update to conform to openai behavior
|
||||
if (first) {
|
||||
if (first || is_progress) {
|
||||
add_delta({
|
||||
{"role", "assistant"},
|
||||
{"content", nullptr},
|
||||
@@ -1121,16 +1151,20 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
}
|
||||
|
||||
if (!deltas.empty()) {
|
||||
GGML_ASSERT(deltas[deltas.size() - 1].at("choices").size() >= 1);
|
||||
auto & last_json = deltas[deltas.size() - 1];
|
||||
GGML_ASSERT(last_json.at("choices").size() >= 1);
|
||||
|
||||
if (prob_output.probs.size() > 0) {
|
||||
deltas[deltas.size() - 1].at("choices").at(0)["logprobs"] = json {
|
||||
last_json.at("choices").at(0)["logprobs"] = json {
|
||||
{"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
|
||||
};
|
||||
}
|
||||
|
||||
if (timings.prompt_n >= 0) {
|
||||
deltas[deltas.size() - 1].push_back({"timings", timings.to_json()});
|
||||
last_json.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
if (is_progress) {
|
||||
last_json.push_back({"prompt_progress", progress.to_json()});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1404,6 +1438,7 @@ struct server_slot {
|
||||
|
||||
// n_prompt_tokens may not be equal to prompt_tokens.size(), because prompt maybe truncated
|
||||
int32_t n_prompt_tokens = 0;
|
||||
int32_t n_prompt_tokens_cache = 0;
|
||||
int32_t n_prompt_tokens_processed = 0;
|
||||
|
||||
// input prompt tokens
|
||||
@@ -1456,7 +1491,9 @@ struct server_slot {
|
||||
void reset() {
|
||||
SLT_DBG(*this, "%s", "\n");
|
||||
|
||||
n_prompt_tokens = 0;
|
||||
n_prompt_tokens = 0;
|
||||
n_prompt_tokens_cache = 0;
|
||||
|
||||
last_nl_pos = 0;
|
||||
generated_text = "";
|
||||
has_new_line = false;
|
||||
@@ -1547,6 +1584,8 @@ struct server_slot {
|
||||
|
||||
result_timings get_timings() const {
|
||||
result_timings timings;
|
||||
timings.cache_n = n_prompt_tokens_cache;
|
||||
|
||||
timings.prompt_n = n_prompt_tokens_processed;
|
||||
timings.prompt_ms = t_prompt_processing;
|
||||
timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
|
||||
@@ -2520,7 +2559,7 @@ struct server_context {
|
||||
|
||||
slot.add_token(result);
|
||||
if (slot.params.stream) {
|
||||
send_partial_response(slot, result);
|
||||
send_partial_response(slot, result, false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2712,13 +2751,24 @@ struct server_context {
|
||||
return true;
|
||||
}
|
||||
|
||||
void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
|
||||
void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
|
||||
auto res = std::make_unique<server_task_result_cmpl_partial>();
|
||||
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
res->content = tkn.text_to_send;
|
||||
res->tokens = { tkn.tok };
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
|
||||
if (is_progress) {
|
||||
res->is_progress = true;
|
||||
res->progress.total = slot.n_prompt_tokens;
|
||||
res->progress.cache = slot.n_prompt_tokens_cache;
|
||||
res->progress.processed = slot.cache_tokens.size();
|
||||
res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt / 1000);
|
||||
} else {
|
||||
res->content = tkn.text_to_send;
|
||||
res->tokens = { tkn.tok };
|
||||
|
||||
slot.update_chat_msg(res->oaicompat_msg_diffs);
|
||||
}
|
||||
|
||||
res->n_decoded = slot.n_decoded;
|
||||
res->n_prompt_tokens = slot.n_prompt_tokens;
|
||||
@@ -2729,8 +2779,6 @@ struct server_context {
|
||||
res->oaicompat_model = slot.params.oaicompat_model;
|
||||
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
|
||||
|
||||
slot.update_chat_msg(res->oaicompat_msg_diffs);
|
||||
|
||||
// populate res.probs_output
|
||||
if (slot.params.sampling.n_probs > 0) {
|
||||
res->prob_output = tkn; // copy the token probs
|
||||
@@ -3557,6 +3605,7 @@ struct server_context {
|
||||
slot.n_past--;
|
||||
}
|
||||
|
||||
slot.n_prompt_tokens_cache = slot.n_past;
|
||||
slot.n_prompt_tokens_processed = 0;
|
||||
}
|
||||
|
||||
@@ -3573,7 +3622,8 @@ struct server_context {
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
|
||||
|
||||
// there is no common part left
|
||||
slot.n_past = 0;
|
||||
slot.n_past = 0;
|
||||
slot.n_prompt_tokens_cache = 0;
|
||||
}
|
||||
|
||||
SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
|
||||
@@ -3767,6 +3817,13 @@ struct server_context {
|
||||
n_batch = llama_n_batch(ctx);
|
||||
|
||||
for (auto & slot : slots) {
|
||||
// optionally send prompt processing progress
|
||||
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
|
||||
if (slot.params.stream && slot.params.return_progress) {
|
||||
send_partial_response(slot, {}, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
aiohttp~=3.9.3
|
||||
pytest~=8.3.3
|
||||
huggingface_hub~=0.23.2
|
||||
huggingface_hub>=0.34.0,<1.0
|
||||
numpy~=1.26.4
|
||||
openai~=1.55.3
|
||||
prometheus-client~=0.20.0
|
||||
|
||||
@@ -5,6 +5,12 @@ from utils import *
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def do_something():
|
||||
# this will be run once per test session, before any tests
|
||||
ServerPreset.load_all()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
|
||||
@@ -402,3 +402,51 @@ def test_context_size_exceeded():
|
||||
assert server.n_ctx is not None
|
||||
assert server.n_slots is not None
|
||||
assert res.body["error"]["n_ctx"] == server.n_ctx // server.n_slots
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"n_batch,batch_count,reuse_cache",
|
||||
[
|
||||
(64, 15, False),
|
||||
(64, 1, True),
|
||||
]
|
||||
)
|
||||
def test_return_progresssss(n_batch, batch_count, reuse_cache):
|
||||
global server
|
||||
server.n_batch = n_batch
|
||||
server.n_ctx = 2048
|
||||
server.n_slots = 1
|
||||
server.start()
|
||||
def make_cmpl_request():
|
||||
return server.make_stream_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 10,
|
||||
"messages": [
|
||||
{"role": "user", "content": "This is a test" * 100},
|
||||
],
|
||||
"stream": True,
|
||||
"return_progress": True,
|
||||
})
|
||||
if reuse_cache:
|
||||
# make a first request to populate the cache
|
||||
res0 = make_cmpl_request()
|
||||
for _ in res0:
|
||||
pass # discard the output
|
||||
|
||||
res = make_cmpl_request()
|
||||
last_progress = None
|
||||
total_batch_count = 0
|
||||
for data in res:
|
||||
cur_progress = data.get("prompt_progress", None)
|
||||
if cur_progress is None:
|
||||
continue
|
||||
if last_progress is not None:
|
||||
assert cur_progress["total"] == last_progress["total"]
|
||||
assert cur_progress["cache"] == last_progress["cache"]
|
||||
assert cur_progress["processed"] > last_progress["processed"]
|
||||
total_batch_count += 1
|
||||
last_progress = cur_progress
|
||||
|
||||
assert last_progress is not None
|
||||
assert last_progress["total"] > 0
|
||||
assert last_progress["processed"] == last_progress["total"]
|
||||
assert total_batch_count == batch_count
|
||||
|
||||
@@ -14,14 +14,11 @@ from utils import *
|
||||
|
||||
server: ServerProcess
|
||||
|
||||
TIMEOUT_SERVER_START = 15*60
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.model_alias = "tinyllama-2"
|
||||
server.server_port = 8081
|
||||
server.n_slots = 1
|
||||
|
||||
|
||||
@@ -45,7 +42,7 @@ def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expe
|
||||
server.jinja = True
|
||||
server.reasoning_budget = reasoning_budget
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
|
||||
res = server.make_request("POST", "/apply-template", data={
|
||||
"messages": [
|
||||
@@ -68,7 +65,7 @@ def test_date_inside_prompt(template_name: str, format: str, tools: list[dict]):
|
||||
global server
|
||||
server.jinja = True
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
|
||||
res = server.make_request("POST", "/apply-template", data={
|
||||
"messages": [
|
||||
@@ -91,7 +88,7 @@ def test_add_generation_prompt(template_name: str, expected_generation_prompt: s
|
||||
global server
|
||||
server.jinja = True
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
|
||||
res = server.make_request("POST", "/apply-template", data={
|
||||
"messages": [
|
||||
|
||||
@@ -12,7 +12,7 @@ from enum import Enum
|
||||
|
||||
server: ServerProcess
|
||||
|
||||
TIMEOUT_SERVER_START = 15*60
|
||||
TIMEOUT_START_SLOW = 15 * 60 # this is needed for real model tests
|
||||
TIMEOUT_HTTP_REQUEST = 60
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@@ -124,7 +124,7 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict,
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
|
||||
|
||||
|
||||
@@ -168,7 +168,7 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@@ -240,7 +240,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
body = server.make_any_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
@@ -295,7 +295,7 @@ def test_completion_without_tool_call_fast(template_name: str, n_predict: int, t
|
||||
server.n_predict = n_predict
|
||||
server.jinja = True
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
do_test_completion_without_tool_call(server, n_predict, tools, tool_choice, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@@ -317,7 +317,7 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
|
||||
server.n_predict = n_predict
|
||||
server.jinja = True
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
do_test_completion_without_tool_call(server, n_predict, tools, tool_choice, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@@ -377,7 +377,7 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
do_test_weather(server, stream=stream == CompletionMode.STREAMED, max_tokens=n_predict)
|
||||
|
||||
|
||||
@@ -436,7 +436,7 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
do_test_calc_result(server, result_override, n_predict, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@@ -524,7 +524,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
body = server.make_any_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
@@ -597,7 +597,7 @@ def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | Non
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
|
||||
do_test_hello_world(server, stream=stream == CompletionMode.STREAMED, max_tokens=n_predict)
|
||||
|
||||
|
||||
@@ -5,18 +5,31 @@ import requests
|
||||
|
||||
server: ServerProcess
|
||||
|
||||
IMG_URL_0 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png"
|
||||
IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/91_cat.png"
|
||||
|
||||
response = requests.get(IMG_URL_0)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
IMG_BASE64_URI_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
|
||||
IMG_BASE64_0 = base64.b64encode(response.content).decode("utf-8")
|
||||
|
||||
response = requests.get(IMG_URL_1)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
IMG_BASE64_URI_1 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
|
||||
IMG_BASE64_1 = base64.b64encode(response.content).decode("utf-8")
|
||||
def get_img_url(id: str) -> str:
|
||||
IMG_URL_0 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png"
|
||||
IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/91_cat.png"
|
||||
if id == "IMG_URL_0":
|
||||
return IMG_URL_0
|
||||
elif id == "IMG_URL_1":
|
||||
return IMG_URL_1
|
||||
elif id == "IMG_BASE64_URI_0":
|
||||
response = requests.get(IMG_URL_0)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
return "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
|
||||
elif id == "IMG_BASE64_0":
|
||||
response = requests.get(IMG_URL_0)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
return base64.b64encode(response.content).decode("utf-8")
|
||||
elif id == "IMG_BASE64_URI_1":
|
||||
response = requests.get(IMG_URL_1)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
return "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
|
||||
elif id == "IMG_BASE64_1":
|
||||
response = requests.get(IMG_URL_1)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
return base64.b64encode(response.content).decode("utf-8")
|
||||
else:
|
||||
return id
|
||||
|
||||
JSON_MULTIMODAL_KEY = "multimodal_data"
|
||||
JSON_PROMPT_STRING_KEY = "prompt_string"
|
||||
@@ -28,7 +41,7 @@ def create_server():
|
||||
|
||||
def test_models_supports_multimodal_capability():
|
||||
global server
|
||||
server.start() # vision model may take longer to load due to download size
|
||||
server.start()
|
||||
res = server.make_request("GET", "/models", data={})
|
||||
assert res.status_code == 200
|
||||
model_info = res.body["models"][0]
|
||||
@@ -38,7 +51,7 @@ def test_models_supports_multimodal_capability():
|
||||
|
||||
def test_v1_models_supports_multimodal_capability():
|
||||
global server
|
||||
server.start() # vision model may take longer to load due to download size
|
||||
server.start()
|
||||
res = server.make_request("GET", "/v1/models", data={})
|
||||
assert res.status_code == 200
|
||||
model_info = res.body["models"][0]
|
||||
@@ -50,10 +63,10 @@ def test_v1_models_supports_multimodal_capability():
|
||||
"prompt, image_url, success, re_content",
|
||||
[
|
||||
# test model is trained on CIFAR-10, but it's quite dumb due to small size
|
||||
("What is this:\n", IMG_URL_0, True, "(cat)+"),
|
||||
("What is this:\n", "IMG_BASE64_URI_0", True, "(cat)+"), # exceptional, so that we don't cog up the log
|
||||
("What is this:\n", IMG_URL_1, True, "(frog)+"),
|
||||
("Test test\n", IMG_URL_1, True, "(frog)+"), # test invalidate cache
|
||||
("What is this:\n", "IMG_URL_0", True, "(cat)+"),
|
||||
("What is this:\n", "IMG_BASE64_URI_0", True, "(cat)+"),
|
||||
("What is this:\n", "IMG_URL_1", True, "(frog)+"),
|
||||
("Test test\n", "IMG_URL_1", True, "(frog)+"), # test invalidate cache
|
||||
("What is this:\n", "malformed", False, None),
|
||||
("What is this:\n", "https://google.com/404", False, None), # non-existent image
|
||||
("What is this:\n", "https://ggml.ai", False, None), # non-image data
|
||||
@@ -62,9 +75,7 @@ def test_v1_models_supports_multimodal_capability():
|
||||
)
|
||||
def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||
global server
|
||||
server.start(timeout_seconds=60) # vision model may take longer to load due to download size
|
||||
if image_url == "IMG_BASE64_URI_0":
|
||||
image_url = IMG_BASE64_URI_0
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
@@ -72,7 +83,7 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {
|
||||
"url": image_url,
|
||||
"url": get_img_url(image_url),
|
||||
}},
|
||||
]},
|
||||
],
|
||||
@@ -90,19 +101,22 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||
"prompt, image_data, success, re_content",
|
||||
[
|
||||
# test model is trained on CIFAR-10, but it's quite dumb due to small size
|
||||
("What is this: <__media__>\n", IMG_BASE64_0, True, "(cat)+"),
|
||||
("What is this: <__media__>\n", IMG_BASE64_1, True, "(frog)+"),
|
||||
("What is this: <__media__>\n", "IMG_BASE64_0", True, "(cat)+"),
|
||||
("What is this: <__media__>\n", "IMG_BASE64_1", True, "(frog)+"),
|
||||
("What is this: <__media__>\n", "malformed", False, None), # non-image data
|
||||
("What is this:\n", "", False, None), # empty string
|
||||
]
|
||||
)
|
||||
def test_vision_completion(prompt, image_data, success, re_content):
|
||||
global server
|
||||
server.start() # vision model may take longer to load due to download size
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completions", data={
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"prompt": { JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] },
|
||||
"prompt": {
|
||||
JSON_PROMPT_STRING_KEY: prompt,
|
||||
JSON_MULTIMODAL_KEY: [ get_img_url(image_data) ],
|
||||
},
|
||||
})
|
||||
if success:
|
||||
assert res.status_code == 200
|
||||
@@ -116,17 +130,18 @@ def test_vision_completion(prompt, image_data, success, re_content):
|
||||
"prompt, image_data, success",
|
||||
[
|
||||
# test model is trained on CIFAR-10, but it's quite dumb due to small size
|
||||
("What is this: <__media__>\n", IMG_BASE64_0, True), # exceptional, so that we don't cog up the log
|
||||
("What is this: <__media__>\n", IMG_BASE64_1, True),
|
||||
("What is this: <__media__>\n", "IMG_BASE64_0", True),
|
||||
("What is this: <__media__>\n", "IMG_BASE64_1", True),
|
||||
("What is this: <__media__>\n", "malformed", False), # non-image data
|
||||
("What is this:\n", "base64", False), # non-image data
|
||||
]
|
||||
)
|
||||
def test_vision_embeddings(prompt, image_data, success):
|
||||
global server
|
||||
server.server_embeddings=True
|
||||
server.n_batch=512
|
||||
server.start() # vision model may take longer to load due to download size
|
||||
server.server_embeddings = True
|
||||
server.n_batch = 512
|
||||
server.start()
|
||||
image_data = get_img_url(image_data)
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"content": [
|
||||
{ JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] },
|
||||
|
||||
@@ -26,7 +26,7 @@ from re import RegexFlag
|
||||
import wget
|
||||
|
||||
|
||||
DEFAULT_HTTP_TIMEOUT = 30
|
||||
DEFAULT_HTTP_TIMEOUT = 60
|
||||
|
||||
|
||||
class ServerResponse:
|
||||
@@ -45,6 +45,7 @@ class ServerProcess:
|
||||
model_alias: str = "tinyllama-2"
|
||||
temperature: float = 0.8
|
||||
seed: int = 42
|
||||
offline: bool = False
|
||||
|
||||
# custom options
|
||||
model_alias: str | None = None
|
||||
@@ -118,6 +119,8 @@ class ServerProcess:
|
||||
"--seed",
|
||||
self.seed,
|
||||
]
|
||||
if self.offline:
|
||||
server_args.append("--offline")
|
||||
if self.model_file:
|
||||
server_args.extend(["--model", self.model_file])
|
||||
if self.model_url:
|
||||
@@ -392,6 +395,19 @@ server_instances: Set[ServerProcess] = set()
|
||||
|
||||
|
||||
class ServerPreset:
|
||||
@staticmethod
|
||||
def load_all() -> None:
|
||||
""" Load all server presets to ensure model files are cached. """
|
||||
servers: List[ServerProcess] = [
|
||||
method()
|
||||
for name, method in ServerPreset.__dict__.items()
|
||||
if callable(method) and name != "load_all"
|
||||
]
|
||||
for server in servers:
|
||||
server.offline = False
|
||||
server.start()
|
||||
server.stop()
|
||||
|
||||
@staticmethod
|
||||
def tinyllama2() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
@@ -408,6 +424,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def bert_bge_small() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
@@ -422,6 +439,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def bert_bge_small_with_fa() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
@@ -437,6 +455,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def tinyllama_infill() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
|
||||
server.model_alias = "tinyllama-infill"
|
||||
@@ -451,6 +470,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def stories15m_moe() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/stories15M_MOE"
|
||||
server.model_hf_file = "stories15M_MOE-F16.gguf"
|
||||
server.model_alias = "stories15m-moe"
|
||||
@@ -465,6 +485,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def jina_reranker_tiny() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
|
||||
server.model_alias = "jina-reranker"
|
||||
@@ -478,6 +499,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def tinygemma3() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
# mmproj is already provided by HF registry API
|
||||
server.model_hf_repo = "ggml-org/tinygemma3-GGUF"
|
||||
server.model_hf_file = "tinygemma3-Q8_0.gguf"
|
||||
|
||||
Reference in New Issue
Block a user