mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
23 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3b15924d71 | |||
| 79bc429262 | |||
| c4df49a42d | |||
| 3c3635d2f2 | |||
| 61bdfd5298 | |||
| 01806e7771 | |||
| 186415d595 | |||
| fd621880f3 | |||
| 4281c7b315 | |||
| 5fac79cbc7 | |||
| 408ff524b4 | |||
| 5143fa895e | |||
| 3a550b5ca4 | |||
| a81283820a | |||
| c610b6c11b | |||
| 5d6688de08 | |||
| 4fd1242bef | |||
| b2426e469e | |||
| 9e2b1e83c6 | |||
| fb15d649ed | |||
| 856ed0947f | |||
| d1e2adba65 | |||
| c1c354e44c |
@@ -17,7 +17,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
|
||||
exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
|
||||
days-before-issue-stale: 30
|
||||
days-before-issue-close: 14
|
||||
stale-issue-label: "stale"
|
||||
|
||||
+42
-22
@@ -1263,6 +1263,18 @@ static std::string list_builtin_chat_templates() {
|
||||
return msg.str();
|
||||
}
|
||||
|
||||
static bool is_truthy(const std::string & value) {
|
||||
return value == "on" || value == "enabled" || value == "1";
|
||||
}
|
||||
|
||||
static bool is_falsey(const std::string & value) {
|
||||
return value == "off" || value == "disabled" || value == "0";
|
||||
}
|
||||
|
||||
static bool is_autoy(const std::string & value) {
|
||||
return value == "auto" || value == "-1";
|
||||
}
|
||||
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
// load dynamic backends
|
||||
ggml_backend_load_all();
|
||||
@@ -1544,21 +1556,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.n_chunks = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
|
||||
add_opt(common_arg(
|
||||
{"-fa", "--flash-attn"}, "FA",
|
||||
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (value == "on" || value == "enabled" || value == "1") {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
|
||||
} else if (value == "off" || value == "disabled" || value == "0") {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||
} else if (value == "auto" || value == "-1") {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
|
||||
} else {
|
||||
throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_FLASH_ATTN"));
|
||||
add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
|
||||
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
|
||||
llama_flash_attn_type_name(params.flash_attn_type)),
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (is_truthy(value)) {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
|
||||
} else if (is_falsey(value)) {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||
} else if (is_autoy(value)) {
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
|
||||
} else {
|
||||
throw std::runtime_error(
|
||||
string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
|
||||
}
|
||||
}).set_env("LLAMA_ARG_FLASH_ATTN"));
|
||||
add_opt(common_arg(
|
||||
{"-p", "--prompt"}, "PROMPT",
|
||||
"prompt to start generation with; for system message, use -sys",
|
||||
@@ -3134,13 +3146,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
common_log_set_file(common_log_main(), value.c_str());
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--log-colors"},
|
||||
"Enable colored logging",
|
||||
[](common_params &) {
|
||||
common_log_set_colors(common_log_main(), true);
|
||||
}
|
||||
).set_env("LLAMA_LOG_COLORS"));
|
||||
add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
|
||||
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
|
||||
"'auto' enables colors when output is to a terminal",
|
||||
[](common_params &, const std::string & value) {
|
||||
if (is_truthy(value)) {
|
||||
common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
|
||||
} else if (is_falsey(value)) {
|
||||
common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
|
||||
} else if (is_autoy(value)) {
|
||||
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
|
||||
} else {
|
||||
throw std::invalid_argument(
|
||||
string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
|
||||
}
|
||||
}).set_env("LLAMA_LOG_COLORS"));
|
||||
add_opt(common_arg(
|
||||
{"-v", "--verbose", "--log-verbose"},
|
||||
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
|
||||
|
||||
+111
-1
@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
|
||||
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
|
||||
}
|
||||
|
||||
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
|
||||
common_chat_templates_inputs dummy_inputs;
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = "test";
|
||||
dummy_inputs.messages = {msg};
|
||||
dummy_inputs.enable_thinking = false;
|
||||
const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
|
||||
dummy_inputs.enable_thinking = true;
|
||||
const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
|
||||
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
|
||||
}
|
||||
|
||||
template <>
|
||||
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
|
||||
std::vector<common_chat_msg> msgs;
|
||||
@@ -623,6 +636,7 @@ const char * common_chat_format_name(common_chat_format format) {
|
||||
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
|
||||
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
|
||||
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
|
||||
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
|
||||
default:
|
||||
throw std::runtime_error("Unknown chat format");
|
||||
}
|
||||
@@ -1184,6 +1198,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
|
||||
});
|
||||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
// Generate the prompt using the apply() function with the template
|
||||
data.prompt = apply(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
|
||||
|
||||
// Handle thinking tags appropriately based on inputs.enable_thinking
|
||||
if (string_ends_with(data.prompt, "<think>\n")) {
|
||||
if (!inputs.enable_thinking) {
|
||||
data.prompt += "</think>";
|
||||
} else {
|
||||
data.thinking_forced_open = true;
|
||||
}
|
||||
}
|
||||
|
||||
// When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
|
||||
if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||
data.grammar_lazy = true;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
auto schemas = json::array();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
schemas.push_back({
|
||||
{ "type", "object" },
|
||||
{ "properties",
|
||||
{
|
||||
{ "name",
|
||||
{
|
||||
{ "type", "string" },
|
||||
{ "const", function.at("name") },
|
||||
} },
|
||||
{ "arguments", function.at("parameters") },
|
||||
} },
|
||||
{ "required", json::array({ "name", "arguments" }) },
|
||||
});
|
||||
});
|
||||
auto schema = json{
|
||||
{ "type", "array" },
|
||||
{ "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
|
||||
{ "minItems", 1 },
|
||||
};
|
||||
if (!inputs.parallel_tool_calls) {
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root",
|
||||
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
|
||||
"\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
|
||||
" \"</TOOLCALL>\"");
|
||||
});
|
||||
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
|
||||
// If thinking_forced_open, then we capture the </think> tag in the grammar,
|
||||
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
|
||||
std::string(data.thinking_forced_open ?
|
||||
"[\\s\\S]*?(</think>\\s*)" :
|
||||
"(?:<think>[\\s\\S]*?</think>\\s*)?") +
|
||||
"(<TOOLCALL>)[\\s\\S]*" });
|
||||
}
|
||||
return data;
|
||||
}
|
||||
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
builder.add_content(builder.consume_rest());
|
||||
@@ -1830,7 +1905,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
|
||||
// If thinking_forced_open, then we capture the </think> tag in the grammar,
|
||||
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
|
||||
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
|
||||
"(\\s*"
|
||||
"\\s*("
|
||||
"(?:<tool_call>"
|
||||
"|<function"
|
||||
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
|
||||
@@ -2060,6 +2135,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
|
||||
}
|
||||
}
|
||||
|
||||
static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
|
||||
// Parse thinking tags
|
||||
builder.try_parse_reasoning("<think>", "</think>");
|
||||
if (!builder.syntax().parse_tool_calls) {
|
||||
builder.add_content(builder.consume_rest());
|
||||
return;
|
||||
}
|
||||
|
||||
// Look for tool calls
|
||||
static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
|
||||
if (auto res = builder.try_find_regex(tool_call_regex)) {
|
||||
builder.move_to(res->groups[0].end);
|
||||
|
||||
// Expect JSON array of tool calls
|
||||
auto tool_calls_data = builder.consume_json();
|
||||
if (tool_calls_data.json.is_array()) {
|
||||
if (!builder.try_consume_literal("</TOOLCALL>")) {
|
||||
throw common_chat_msg_partial_exception("Incomplete tool call");
|
||||
}
|
||||
builder.add_tool_calls(tool_calls_data.json);
|
||||
} else {
|
||||
throw common_chat_msg_partial_exception("Incomplete tool call");
|
||||
}
|
||||
}
|
||||
builder.add_content(builder.consume_rest());
|
||||
}
|
||||
|
||||
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
|
||||
// Parse thinking tags first - this handles the main reasoning content
|
||||
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
|
||||
@@ -2293,6 +2395,11 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
return common_chat_params_init_seed_oss(tmpl, params, inputs);
|
||||
}
|
||||
|
||||
// Nemotron v2
|
||||
if (src.find("<SPECIAL_10>") != std::string::npos) {
|
||||
return common_chat_params_init_nemotron_v2(tmpl, params);
|
||||
}
|
||||
|
||||
// Use generic handler when mixing tools + JSON schema.
|
||||
// TODO: support that mix in handlers below.
|
||||
if ((params.tools.is_array() && params.json_schema.is_object())) {
|
||||
@@ -2454,6 +2561,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
||||
case COMMON_CHAT_FORMAT_SEED_OSS:
|
||||
common_chat_parse_seed_oss(builder);
|
||||
break;
|
||||
case COMMON_CHAT_FORMAT_NEMOTRON_V2:
|
||||
common_chat_parse_nemotron_v2(builder);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
|
||||
}
|
||||
|
||||
@@ -112,6 +112,7 @@ enum common_chat_format {
|
||||
COMMON_CHAT_FORMAT_GRANITE,
|
||||
COMMON_CHAT_FORMAT_GPT_OSS,
|
||||
COMMON_CHAT_FORMAT_SEED_OSS,
|
||||
COMMON_CHAT_FORMAT_NEMOTRON_V2,
|
||||
|
||||
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
||||
};
|
||||
@@ -198,6 +199,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
|
||||
|
||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
||||
|
||||
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
|
||||
|
||||
// Parses a JSON array of messages in OpenAI's chat completion API format.
|
||||
// T can be std::string containing JSON or nlohmann::ordered_json
|
||||
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
|
||||
|
||||
+53
-2
@@ -4,17 +4,52 @@
|
||||
#include <condition_variable>
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#if defined(_WIN32)
|
||||
# include <io.h>
|
||||
# include <windows.h>
|
||||
# define isatty _isatty
|
||||
# define fileno _fileno
|
||||
#else
|
||||
# include <unistd.h>
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
// Auto-detect if colors should be enabled based on terminal and environment
|
||||
static bool common_log_should_use_colors_auto() {
|
||||
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||
if (no_color[0] != '\0') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check TERM environment variable
|
||||
if (const char * term = std::getenv("TERM")) {
|
||||
if (std::strcmp(term, "dumb") == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if stdout and stderr are connected to a terminal
|
||||
// We check both because log messages can go to either
|
||||
bool stdout_is_tty = isatty(fileno(stdout));
|
||||
bool stderr_is_tty = isatty(fileno(stderr));
|
||||
|
||||
return stdout_is_tty || stderr_is_tty;
|
||||
}
|
||||
|
||||
static int64_t t_us() {
|
||||
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
||||
}
|
||||
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
|
||||
|
||||
struct common_log * common_log_main() {
|
||||
static struct common_log log;
|
||||
static std::once_flag init_flag;
|
||||
std::call_once(init_flag, [&]() {
|
||||
// Set default to auto-detect colors
|
||||
log.set_colors(common_log_should_use_colors_auto());
|
||||
});
|
||||
|
||||
return &log;
|
||||
}
|
||||
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
|
||||
log->set_file(file);
|
||||
}
|
||||
|
||||
void common_log_set_colors(struct common_log * log, bool colors) {
|
||||
log->set_colors(colors);
|
||||
void common_log_set_colors(struct common_log * log, log_colors colors) {
|
||||
if (colors == LOG_COLORS_AUTO) {
|
||||
log->set_colors(common_log_should_use_colors_auto());
|
||||
return;
|
||||
}
|
||||
|
||||
if (colors == LOG_COLORS_DISABLED) {
|
||||
log->set_colors(false);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(colors == LOG_COLORS_ENABLED);
|
||||
log->set_colors(true);
|
||||
}
|
||||
|
||||
void common_log_set_prefix(struct common_log * log, bool prefix) {
|
||||
|
||||
+10
-4
@@ -24,6 +24,12 @@
|
||||
#define LOG_DEFAULT_DEBUG 1
|
||||
#define LOG_DEFAULT_LLAMA 0
|
||||
|
||||
enum log_colors {
|
||||
LOG_COLORS_AUTO = -1,
|
||||
LOG_COLORS_DISABLED = 0,
|
||||
LOG_COLORS_ENABLED = 1,
|
||||
};
|
||||
|
||||
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
||||
// set via common_log_set_verbosity()
|
||||
extern int common_log_verbosity_thold;
|
||||
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
|
||||
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
|
||||
//
|
||||
|
||||
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
|
||||
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
|
||||
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
|
||||
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
|
||||
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
|
||||
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
|
||||
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
|
||||
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
|
||||
|
||||
// helper macros for logging
|
||||
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
|
||||
|
||||
@@ -5122,6 +5122,15 @@ class Gemma3Model(TextModel):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Gemma3TextModel")
|
||||
class EmbeddingGemma(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self._try_set_pooling_type()
|
||||
|
||||
|
||||
@ModelBase.register("Gemma3ForConditionalGeneration")
|
||||
class Gemma3VisionModel(MmprojModel):
|
||||
def set_gguf_parameters(self):
|
||||
|
||||
+27
-1
@@ -12,7 +12,7 @@ import json
|
||||
from math import prod
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
||||
from transformers import AutoConfig
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
import torch
|
||||
|
||||
@@ -26,6 +26,8 @@ import gguf
|
||||
# reuse model definitions from convert_hf_to_gguf.py
|
||||
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
|
||||
|
||||
from gguf.constants import GGUFValueType
|
||||
|
||||
logger = logging.getLogger("lora-to-gguf")
|
||||
|
||||
|
||||
@@ -369,7 +371,31 @@ if __name__ == '__main__':
|
||||
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
logger.debug("GGUF KV: %s = %d", gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
|
||||
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
|
||||
alora_invocation_tokens = lparams.get("alora_invocation_tokens")
|
||||
invocation_string = lparams.get("invocation_string")
|
||||
if invocation_string and not alora_invocation_tokens:
|
||||
logger.debug("Tokenizing invocation_string -> alora_invocation_tokens")
|
||||
base_model_path_or_id = hparams.get("_name_or_path")
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(base_model_path_or_id)
|
||||
except ValueError:
|
||||
logger.error("Unable to load tokenizer from %s", base_model_path_or_id)
|
||||
raise
|
||||
# NOTE: There's an off-by-one with the older aLoRAs where
|
||||
# the invocation string includes the "<|start_of_turn|>"
|
||||
# token, but the adapters themselves were trained to
|
||||
# activate _after_ that first token, so we drop it here.
|
||||
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
|
||||
if alora_invocation_tokens:
|
||||
logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
|
||||
self.gguf_writer.add_key_value(
|
||||
gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS,
|
||||
alora_invocation_tokens,
|
||||
GGUFValueType.ARRAY,
|
||||
GGUFValueType.UINT32,
|
||||
)
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
|
||||
|
||||
+32
-52
@@ -42,18 +42,6 @@ cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
- By default, NNPA is disabled by default. To enable it:
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||
-DGGML_NNPA=ON
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
- For debug builds:
|
||||
|
||||
```bash
|
||||
@@ -164,15 +152,11 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||
Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
|
||||
### 2. NNPA Vector Intrinsics Acceleration
|
||||
|
||||
Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
|
||||
### 3. zDNN Accelerator (WIP)
|
||||
### 2. zDNN Accelerator (WIP)
|
||||
|
||||
Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.
|
||||
|
||||
### 4. Spyre Accelerator
|
||||
### 3. Spyre Accelerator
|
||||
|
||||
_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._
|
||||
|
||||
@@ -230,10 +214,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
5. `-DGGML_NNPA=ON` generates gibberish output
|
||||
|
||||
Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
|
||||
|
||||
## Getting Help on IBM Z & LinuxONE
|
||||
|
||||
1. **Bugs, Feature Requests**
|
||||
@@ -258,38 +238,38 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
|
||||
## Appendix B: SIMD Support Matrix
|
||||
|
||||
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
|
||||
| ---------- | ----------- | ---- | ---- | ----- |
|
||||
| FP32 | ✅ | ✅ | ✅ | ❓ |
|
||||
| FP16 | ✅ | ✅ | ❓ | ❓ |
|
||||
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
|
||||
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
|
||||
| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| Q5_0 | ✅ | ✅ | ❓ | ❓ |
|
||||
| Q5_1 | ✅ | ✅ | ❓ | ❓ |
|
||||
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
|
||||
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| Q3_K | ✅ | ✅ | ❓ | ❓ |
|
||||
| Q4_K | ✅ | ✅ | ❓ | ❓ |
|
||||
| Q5_K | ✅ | ✅ | ❓ | ❓ |
|
||||
| Q6_K | ✅ | ✅ | ❓ | ❓ |
|
||||
| TQ1_0 | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| TQ2_0 | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| IQ2_XXS | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| IQ2_XS | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| IQ2_S | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| IQ3_XXS | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| IQ3_S | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| IQ1_S | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| IQ1_M | 🚫 | 🚫 | ❓ | ❓ |
|
||||
| IQ4_NL | ✅ | ✅ | ❓ | ❓ |
|
||||
| IQ4_XS | ✅ | ✅ | ❓ | ❓ |
|
||||
| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
|
||||
| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
|
||||
| | VX/VXE/VXE2 | zDNN | Spyre |
|
||||
|------------|-------------|------|-------|
|
||||
| FP32 | ✅ | ✅ | ❓ |
|
||||
| FP16 | ✅ | ❓ | ❓ |
|
||||
| BF16 | 🚫 | ❓ | ❓ |
|
||||
| Q4_0 | ✅ | ❓ | ❓ |
|
||||
| Q4_1 | ✅ | ❓ | ❓ |
|
||||
| MXFP4 | 🚫 | ❓ | ❓ |
|
||||
| Q5_0 | ✅ | ❓ | ❓ |
|
||||
| Q5_1 | ✅ | ❓ | ❓ |
|
||||
| Q8_0 | ✅ | ❓ | ❓ |
|
||||
| Q2_K | 🚫 | ❓ | ❓ |
|
||||
| Q3_K | ✅ | ❓ | ❓ |
|
||||
| Q4_K | ✅ | ❓ | ❓ |
|
||||
| Q5_K | ✅ | ❓ | ❓ |
|
||||
| Q6_K | ✅ | ❓ | ❓ |
|
||||
| TQ1_0 | 🚫 | ❓ | ❓ |
|
||||
| TQ2_0 | 🚫 | ❓ | ❓ |
|
||||
| IQ2_XXS | 🚫 | ❓ | ❓ |
|
||||
| IQ2_XS | 🚫 | ❓ | ❓ |
|
||||
| IQ2_S | 🚫 | ❓ | ❓ |
|
||||
| IQ3_XXS | 🚫 | ❓ | ❓ |
|
||||
| IQ3_S | 🚫 | ❓ | ❓ |
|
||||
| IQ1_S | 🚫 | ❓ | ❓ |
|
||||
| IQ1_M | 🚫 | ❓ | ❓ |
|
||||
| IQ4_NL | ✅ | ❓ | ❓ |
|
||||
| IQ4_XS | ✅ | ❓ | ❓ |
|
||||
| FP32->FP16 | 🚫 | ❓ | ❓ |
|
||||
| FP16->FP32 | 🚫 | ❓ | ❓ |
|
||||
|
||||
- ✅ - acceleration available
|
||||
- 🚫 - acceleration unavailable, will still run using scalar implementation
|
||||
- ❓ - acceleration unknown, please contribute if you can test it yourself
|
||||
|
||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.
|
||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 6, 2025.
|
||||
|
||||
@@ -333,17 +333,17 @@ static void print_params(struct my_llama_hparams * params) {
|
||||
}
|
||||
|
||||
static void print_tensor_info(const struct ggml_context * ctx) {
|
||||
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
for (auto * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
LOG_INF("%s: Allocating ", __func__);
|
||||
int64_t total = 1;
|
||||
int i = 0;
|
||||
for (; i < ggml_n_dims(t); ++i) {
|
||||
if (i > 0) LOG("x ");
|
||||
LOG("[%" PRId64 "] ", t->ne[i]);
|
||||
if (i > 0) { LOG_INF("x "); }
|
||||
LOG_INF("[%" PRId64 "] ", t->ne[i]);
|
||||
total *= t->ne[i];
|
||||
}
|
||||
if (i > 1) LOG("= [%" PRId64 "] ", total);
|
||||
LOG("float space for %s\n", ggml_get_name(t));
|
||||
if (i > 1) { LOG_INF("= [%" PRId64 "] ", total); }
|
||||
LOG_INF("float space for %s\n", ggml_get_name(t));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ base_model:
|
||||
Recommended way to run this model:
|
||||
|
||||
```sh
|
||||
llama-server -hf {namespace}/{model_name}-GGUF
|
||||
llama-server -hf {namespace}/{model_name}-GGUF --embeddings
|
||||
```
|
||||
|
||||
Then the endpoint can be accessed at http://localhost:8080/embedding, for
|
||||
|
||||
@@ -134,7 +134,6 @@ option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
|
||||
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
|
||||
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
|
||||
@@ -101,7 +101,6 @@ extern "C" {
|
||||
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
||||
|
||||
|
||||
@@ -1116,30 +1116,65 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
|
||||
namespace {
|
||||
void* g_nz_workspace = nullptr;
|
||||
size_t g_nz_workspace_allocated = 0;
|
||||
/**
|
||||
* @brief Workspace for caching NZ buffers per device.
|
||||
*
|
||||
* This struct manages a device buffer used in NZ computations. It supports
|
||||
* allocation, reallocation, and clearing of cached memory. The struct is
|
||||
* designed to be used with a global array, one per device.
|
||||
*/
|
||||
struct ggml_cann_nz_workspace {
|
||||
void* ptr; // Pointer to allocated device buffer
|
||||
size_t allocated; // Size of currently allocated buffer in bytes
|
||||
|
||||
void release_nz_workspace() {
|
||||
if (g_nz_workspace) {
|
||||
aclrtFree(g_nz_workspace);
|
||||
g_nz_workspace = nullptr;
|
||||
g_nz_workspace_allocated = 0;
|
||||
/**
|
||||
* @brief Constructor. Initializes the workspace with no allocated memory.
|
||||
*/
|
||||
ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
|
||||
|
||||
/**
|
||||
* @brief Free cached memory and reset the workspace.
|
||||
*
|
||||
* If a buffer has been allocated, this function releases it using
|
||||
* aclrtFree and resets internal state.
|
||||
*/
|
||||
void clear() {
|
||||
if (ptr) {
|
||||
ACL_CHECK(aclrtFree(ptr));
|
||||
ptr = nullptr;
|
||||
allocated = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void relloc_nz_workspace(size_t new_size) {
|
||||
if (new_size > g_nz_workspace_allocated) {
|
||||
if (g_nz_workspace) {
|
||||
aclrtFree(g_nz_workspace);
|
||||
g_nz_workspace = nullptr;
|
||||
/**
|
||||
* @brief Allocate or reallocate the workspace buffer.
|
||||
*
|
||||
* If the requested size is larger than the currently allocated size,
|
||||
* the old buffer will be freed and a new buffer of the requested size
|
||||
* will be allocated on the device.
|
||||
*
|
||||
* @param new_size Size in bytes to allocate for the workspace.
|
||||
*/
|
||||
void realloc(size_t new_size) {
|
||||
if (new_size > allocated) {
|
||||
clear();
|
||||
ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
allocated = new_size;
|
||||
}
|
||||
ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
g_nz_workspace_allocated = new_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the device buffer pointer.
|
||||
*
|
||||
* @return Pointer to the allocated buffer, or nullptr if not allocated.
|
||||
*/
|
||||
void* get() const { return ptr; }
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Global array of NZ workspaces, one per device.
|
||||
*/
|
||||
static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
|
||||
|
||||
/**
|
||||
* @brief Convert tensor weights to NZ format using Ascend CANN API.
|
||||
@@ -1149,13 +1184,13 @@ namespace {
|
||||
* improve performance on certain hardware.
|
||||
*
|
||||
* @param tensor Pointer to the input ggml_tensor containing the weights.
|
||||
* @param data Pointer to the raw data buffer for the tensor weights.
|
||||
* @param offset Byte offset within the tensor data buffer where weights start.
|
||||
* @param device device id.
|
||||
*
|
||||
* @note The workspace buffer used in this function is managed globally and reused
|
||||
* across calls. This reduces overhead from repeated memory allocation and deallocation.
|
||||
*/
|
||||
static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
|
||||
static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) {
|
||||
aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
|
||||
tensor->nb, 2, ACL_FORMAT_ND, offset);
|
||||
uint64_t workspaceSize = 0;
|
||||
@@ -1165,7 +1200,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
|
||||
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
|
||||
&workspaceSize, &executor));
|
||||
// Avoid frequent malloc/free of the workspace.
|
||||
relloc_nz_workspace(workspaceSize);
|
||||
g_nz_workspaces[device].realloc(workspaceSize);
|
||||
|
||||
void* g_nz_workspace = g_nz_workspaces[device].get();
|
||||
|
||||
ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
|
||||
ACL_CHECK(aclDestroyTensor(weightTransposed));
|
||||
@@ -1203,7 +1240,7 @@ static void ggml_backend_cann_buffer_set_tensor(
|
||||
if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
weight_format_to_nz(tensor, offset);
|
||||
weight_format_to_nz(tensor, offset, ctx->device);
|
||||
}
|
||||
} else {
|
||||
void *transform_buffer = malloc(size);
|
||||
@@ -2262,7 +2299,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
ggml_cann_set_device(cann_ctx->device);
|
||||
release_nz_workspace();
|
||||
g_nz_workspaces[cann_ctx->device].clear();
|
||||
|
||||
#ifdef USE_ACL_GRAPH
|
||||
bool use_cann_graph = true;
|
||||
|
||||
@@ -457,7 +457,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# TODO: Separation to determine activation of VX/VXE/VXE2
|
||||
if (${S390X_M} MATCHES "8561|8562")
|
||||
set(GGML_NNPA OFF)
|
||||
message(STATUS "z15 target")
|
||||
list(APPEND ARCH_FLAGS -march=z15)
|
||||
elseif (${S390X_M} MATCHES "3931")
|
||||
@@ -479,11 +478,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_VXE)
|
||||
endif()
|
||||
|
||||
if (GGML_NNPA)
|
||||
message(STATUS "NNPA enabled")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
|
||||
endif()
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
|
||||
message(STATUS "Wasm detected")
|
||||
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
|
||||
|
||||
@@ -68,12 +68,6 @@ struct ggml_compute_params {
|
||||
#endif // __VXE2__
|
||||
#endif // __s390x__ && __VEC__
|
||||
|
||||
#if defined(__s390x__) && defined(GGML_NNPA)
|
||||
#ifndef __NNPA__
|
||||
#define __NNPA__
|
||||
#endif // __NNPA__
|
||||
#endif // __s390x__ && GGML_NNPA
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
|
||||
@@ -3211,21 +3211,6 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
|
||||
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storel_epi64((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
#elif defined(__NNPA__)
|
||||
for (; i + 7 < n; i += 8) {
|
||||
float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
|
||||
float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
|
||||
uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
|
||||
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
|
||||
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
|
||||
}
|
||||
for (; i + 3 < n; i += 4) {
|
||||
float32x4_t v_x = vec_xl(0, (const float *)(x + i));
|
||||
float32x4_t v_zero = vec_splats(0.0f);
|
||||
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
|
||||
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
|
||||
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
|
||||
}
|
||||
#elif defined(__riscv_zvfh)
|
||||
for (int vl; i < n; i += vl) {
|
||||
vl = __riscv_vsetvl_e32m2(n - i);
|
||||
@@ -3259,21 +3244,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
|
||||
__m128 y_vec = _mm_cvtph_ps(x_vec);
|
||||
_mm_storeu_ps(y + i, y_vec);
|
||||
}
|
||||
#elif defined(__NNPA__)
|
||||
for (; i + 7 < n; i += 8) {
|
||||
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
|
||||
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
|
||||
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
|
||||
float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
|
||||
vec_xst(v_yh, 0, (float *)(y + i + 0));
|
||||
vec_xst(v_yl, 0, (float *)(y + i + 4));
|
||||
}
|
||||
for (; i + 3 < n; i += 4) {
|
||||
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
|
||||
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
|
||||
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
|
||||
vec_xst(v_yh, 0, (float *)(y + i));
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < n; ++i) {
|
||||
@@ -3477,14 +3447,6 @@ int ggml_cpu_has_vxe(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_nnpa(void) {
|
||||
#if defined(GGML_NNPA)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_neon(void) {
|
||||
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
|
||||
return 1;
|
||||
|
||||
@@ -348,8 +348,10 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
long pages = sysconf(_SC_PHYS_PAGES);
|
||||
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||
*total = pages * page_size;
|
||||
|
||||
// "free" system memory is ill-defined, for practical purposes assume that all of it is free:
|
||||
*free = *total;
|
||||
#endif
|
||||
#endif // _WIN32
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
@@ -576,9 +578,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_vxe()) {
|
||||
features.push_back({ "VXE", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_nnpa()) {
|
||||
features.push_back({ "NNPA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_wasm_simd()) {
|
||||
features.push_back({ "WASM_SIMD", "1" });
|
||||
}
|
||||
|
||||
@@ -154,7 +154,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
return compute_forward_q4_0(params, dst);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
return compute_forward_kv_cache(params, dst);
|
||||
return compute_forward_fp16(params, dst);
|
||||
}
|
||||
} else if (dst->op == GGML_OP_GET_ROWS) {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
@@ -164,7 +164,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool compute_forward_kv_cache(ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
@@ -534,13 +534,8 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
}
|
||||
else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
|
||||
op->src[0]->op == GGML_OP_VIEW &&
|
||||
(op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_SOFT_MAX) &&
|
||||
op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[0] != 2) ||
|
||||
(op->src[1]->nb[0] != 4) ||
|
||||
(op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
else if (ggml_kleidiai_select_kernels(ctx.features, op) && op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
(op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -114,26 +114,6 @@ extern "C" {
|
||||
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
|
||||
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
|
||||
#elif defined(__NNPA__)
|
||||
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
|
||||
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
uint16x8_t v_h = vec_splats(h);
|
||||
uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
|
||||
return vec_extend_to_fp32_hi(v_hd, 0)[0];
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
|
||||
float32x4_t v_f = vec_splats(f);
|
||||
float32x4_t v_zero = vec_splats(0.0f);
|
||||
uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
|
||||
uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
|
||||
return vec_extract(v_h, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
@@ -1156,11 +1136,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
#define GGML_F16_EPR GGML_F32_EPR
|
||||
|
||||
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
|
||||
#if defined(__NNPA__)
|
||||
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
|
||||
uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
|
||||
return vec_extend_to_fp32_hi(v_xd, 0);
|
||||
#else
|
||||
float tmp[4];
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
@@ -1170,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
|
||||
// note: keep type-cast here to prevent compiler bugs
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/12846
|
||||
return vec_xl(0, (const float *)(tmp));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
|
||||
#if defined(__NNPA__)
|
||||
float32x4_t v_zero = vec_splats(0.0f);
|
||||
uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
|
||||
uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
|
||||
|
||||
x[0] = vec_extract(v_x, 0);
|
||||
x[1] = vec_extract(v_x, 1);
|
||||
x[2] = vec_extract(v_x, 2);
|
||||
x[3] = vec_extract(v_x, 3);
|
||||
#else
|
||||
float arr[4];
|
||||
|
||||
// note: keep type-cast here to prevent compiler bugs
|
||||
@@ -1193,7 +1157,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#define GGML_F16_VEC GGML_F32x4
|
||||
|
||||
@@ -570,6 +570,8 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
||||
//
|
||||
// n/d = (mulhi(n, mp) + n) >> L;
|
||||
static const uint3 init_fastdiv_values(uint32_t d) {
|
||||
GGML_ASSERT(d != 0);
|
||||
|
||||
// compute L = ceil(log2(d));
|
||||
uint32_t L = 0;
|
||||
while (L < 32 && (uint32_t{ 1 } << L) < d) {
|
||||
|
||||
@@ -1,371 +0,0 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile-f16.cuh"
|
||||
|
||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_tile_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
const half slopeh = __float2half(slopef);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
|
||||
__shared__ half KQ[ncols*FATTN_KQ_STRIDE_TILE_F16];
|
||||
half2 * KQ2 = (half2 *) KQ;
|
||||
|
||||
__shared__ half2 KV_tmp[FATTN_KQ_STRIDE_TILE_F16][D/2 + 1]; // Pad D to avoid memory bank conflicts.
|
||||
|
||||
half kqmax[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -HALF_MAX_HALF;
|
||||
}
|
||||
half2 kqsum[ncols/nwarps] = {{0.0f, 0.0f}};
|
||||
|
||||
half2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
|
||||
|
||||
// Convert Q to half2 and store in registers:
|
||||
__shared__ half2 Q_h2[ncols][D/2];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||
Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
half kqmax_new[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
KV_tmp[i_KQ][k_KQ] = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
half2 sum2[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE][ncols/nwarps] = {{{0.0f, 0.0f}}};
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ = 0; k_KQ < D/2; ++k_KQ) {
|
||||
half2 K_k[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE];
|
||||
half2 Q_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
Q_k[j_KQ_0/nwarps] = Q_h2[j_KQ][k_KQ];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE]*Q_k[j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
half sum;
|
||||
if (use_logit_softcap) {
|
||||
const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
sum = logit_softcap * tanhf(tmp.x + tmp.y);
|
||||
} else {
|
||||
sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
}
|
||||
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
|
||||
|
||||
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F16 + i_KQ] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
|
||||
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]));
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F16/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const half2 diff = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] - __half2half2(kqmax[j0/nwarps]);
|
||||
const half2 val = h2exp(diff);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + val;
|
||||
KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] = val;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += nwarps) {
|
||||
const int k = k0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
KV_tmp[k][i] = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += 2) {
|
||||
half2 V_k[(D/2)/WARP_SIZE][2];
|
||||
half2 KQ_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
V_k[i0/WARP_SIZE][0] = KV_tmp[k0 + 0][i];
|
||||
V_k[i0/WARP_SIZE][1] = KV_tmp[k0 + 1][i];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
KQ_k[j0/nwarps] = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + k0/2];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][0]* __low2half2(KQ_k[j0/nwarps]);
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][1]*__high2half2(KQ_k[j0/nwarps]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
//Attention sink: adjust running max and sum once per head
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const half sink = __float2half(sinksf[head]);
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
half kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new_j));
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
|
||||
const half val = hexp(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps].x = __hadd(__low2half(kqsum[j0/nwarps]), val);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
|
||||
kqsum_j = warp_reduce_sum((float)kqsum_j);
|
||||
|
||||
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) {
|
||||
const int i0 = i00 + threadIdx.x;
|
||||
|
||||
half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE];
|
||||
if (gridDim.y == 1) {
|
||||
dst_val /= __half2half2(kqsum_j);
|
||||
}
|
||||
dst2[j_dst_unrolled*(D/2) + i0] = __half22float2(dst_val);
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && threadIdx.x == 0) {
|
||||
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int cols_per_block, bool use_logit_softcap>
|
||||
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (Q->ne[1] <= 16) {
|
||||
constexpr int cols_per_block = 16;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 32;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -1,379 +0,0 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile-f32.cuh"
|
||||
|
||||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_tile_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
|
||||
__shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32];
|
||||
|
||||
__shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts.
|
||||
float2 * KV_tmp2 = (float2 *) KV_tmp;
|
||||
|
||||
float kqmax[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
|
||||
}
|
||||
float kqsum[ncols/nwarps] = {0.0f};
|
||||
|
||||
float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
|
||||
|
||||
// Convert Q to half2 and store in registers:
|
||||
__shared__ float Q_f[ncols][D];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
|
||||
float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
|
||||
Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
|
||||
Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
float kqmax_new[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
|
||||
const half2 tmp = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
|
||||
KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp);
|
||||
KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}};
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ = 0; k_KQ < D; ++k_KQ) {
|
||||
float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE];
|
||||
float Q_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
}
|
||||
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
|
||||
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
float kqsum_add = 0.0f;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps];
|
||||
const float val = expf(diff);
|
||||
kqsum_add += val;
|
||||
KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val;
|
||||
}
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) {
|
||||
const int k = k0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
KV_tmp2[k*(D/2) + i].x = __low2float(tmp);
|
||||
KV_tmp2[k*(D/2) + i].y = __high2float(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) {
|
||||
float2 V_k[(D/2)/WARP_SIZE];
|
||||
float KQ_k[ncols/nwarps];
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps];
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
//Attention sink: adjust running max and sum once per head
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const float sink = sinksf[head];
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
|
||||
const float val = expf(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps] += val;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
||||
kqsum_j = warp_reduce_sum(kqsum_j);
|
||||
|
||||
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) {
|
||||
const int i0 = i00 + threadIdx.x;
|
||||
|
||||
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE];
|
||||
if (gridDim.y == 1) {
|
||||
dst_val.x /= kqsum_j;
|
||||
dst_val.y /= kqsum_j;
|
||||
}
|
||||
dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && threadIdx.x == 0) {
|
||||
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
template <int cols_per_block, bool use_logit_softcap>
|
||||
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (Q->ne[1] <= 16) {
|
||||
constexpr int cols_per_block = 16;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 32;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -0,0 +1,596 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-tile.cuh"
|
||||
|
||||
#define FATTN_TILE_NTHREADS 256
|
||||
|
||||
static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int cc, const int warp_size) {
|
||||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 32 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : warp_size;
|
||||
case 256:
|
||||
return 64;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (fast_fp16_available(cc)) {
|
||||
switch (D) {
|
||||
case 64:
|
||||
case 128:
|
||||
return 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
case 256:
|
||||
return 32;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) {
|
||||
#ifdef GGML_USE_HIP
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 32 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : warp_size;
|
||||
case 256:
|
||||
return 64;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
switch (D) {
|
||||
case 64:
|
||||
case 128:
|
||||
return 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
switch (D) {
|
||||
case 64:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 64 : 32;
|
||||
case 256:
|
||||
return 32;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
#endif // GGML_USE_HIP
|
||||
GGML_UNUSED_VARS(ncols, warp_size);
|
||||
}
|
||||
|
||||
static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols, int warp_size) {
|
||||
#ifdef GGML_USE_HIP
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 2*warp_size : 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 2*warp_size;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
case 256:
|
||||
return ncols <= 16 ? 64 : 128;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
switch (D) {
|
||||
case 64:
|
||||
return 64;
|
||||
case 128:
|
||||
return 128;
|
||||
case 256:
|
||||
return ncols <= 16 ? 128 : 64;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
#endif // GGML_USE_HIP
|
||||
GGML_UNUSED_VARS(ncols, warp_size);
|
||||
}
|
||||
|
||||
template<int D, int ncols, bool use_logit_softcap> // D == head size
|
||||
#ifdef GGML_USE_HIP
|
||||
__launch_bounds__(FATTN_TILE_NTHREADS, 1)
|
||||
#else
|
||||
__launch_bounds__(FATTN_TILE_NTHREADS, 2)
|
||||
#endif // GGML_USE_HIP
|
||||
static __global__ void flash_attn_tile(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const char * __restrict__ sinks,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int warp_size = 32;
|
||||
constexpr int nwarps = FATTN_TILE_NTHREADS / warp_size;
|
||||
constexpr int kq_stride = fattn_tile_get_kq_stride_device(D, ncols, warp_size);
|
||||
static_assert(kq_stride % warp_size == 0, "kq_stride not divisable by warp_size.");
|
||||
constexpr int kq_nbatch = fattn_tile_get_kq_nbatch_device(D, ncols, warp_size);
|
||||
static_assert(kq_nbatch % (2*warp_size) == 0, "bad kq_nbatch");
|
||||
|
||||
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
|
||||
|
||||
const int sequence = blockIdx.z / ne02;
|
||||
const int head = blockIdx.z - sequence*ne02;
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
|
||||
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
||||
const float * sinksf = (const float *) (sinks);
|
||||
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
||||
|
||||
__shared__ float KQ[ncols][kq_stride];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
__shared__ half2 Q_tmp[ncols][D/2];
|
||||
__shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + 1)]; // Padded to avoid memory bank conflicts.
|
||||
half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
#else
|
||||
__shared__ float Q_tmp[ncols][D];
|
||||
__shared__ float KV_tmp_f[kq_stride * (kq_nbatch + 1)]; // Padded to avoid memory bank conflicts.
|
||||
float2 * KV_tmp_f2 = (float2 *) KV_tmp_f;
|
||||
float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
|
||||
float kqmax[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
|
||||
}
|
||||
float kqsum[ncols/nwarps] = {0.0f};
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0 + threadIdx.x] : make_float2(0.0f, 0.0f);
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
Q_tmp[j][i0 + threadIdx.x] = make_half2(tmp.x * scale, tmp.y * scale);
|
||||
#else
|
||||
Q_tmp[j][2*i0 + threadIdx.x] = tmp.x * scale;
|
||||
Q_tmp[j][2*i0 + warp_size + threadIdx.x] = tmp.y * scale;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*kq_stride; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*kq_stride) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
float kqmax_new[ncols/nwarps];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/nwarps; ++j) {
|
||||
kqmax_new[j] = kqmax[j];
|
||||
}
|
||||
|
||||
float sum[kq_stride/warp_size][ncols/nwarps] = {{0.0f}};
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += kq_nbatch) {
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) {
|
||||
const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
KV_tmp_h2[i_KQ*(kq_nbatch/2 + 1) + k_KQ_1 + threadIdx.x] = tmp_h2;
|
||||
#else
|
||||
const float2 tmp_f2 = __half22float2(tmp_h2);
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + 1) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x;
|
||||
KV_tmp_f[i_KQ*(kq_nbatch + 1) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; ++k_KQ_1) {
|
||||
half2 K_k[kq_stride/warp_size];
|
||||
half2 Q_k[ncols/nwarps];
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; ++k_KQ_1) {
|
||||
float K_k[kq_stride/warp_size];
|
||||
float Q_k[ncols/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
K_k[i_KQ_0/warp_size] = KV_tmp_h2[i_KQ*(kq_nbatch/2 + 1) + k_KQ_1];
|
||||
#else
|
||||
K_k[i_KQ_0/warp_size] = KV_tmp_f [i_KQ*(kq_nbatch + 1) + k_KQ_1];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
Q_k[j_KQ_0/nwarps] = Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1];
|
||||
#else
|
||||
Q_k[j_KQ_0/nwarps] = Q_tmp[j_KQ][k_KQ_0 + k_KQ_1];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const float2 tmp = __half22float2(K_k[i_KQ_0/warp_size] * Q_k[j_KQ_0/nwarps]);
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += tmp.x + tmp.y;
|
||||
#else
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += K_k[i_KQ_0/warp_size] * Q_k[j_KQ_0/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (k_KQ_0 + kq_nbatch < D) {
|
||||
__syncthreads(); // Sync not needed on last iteration.
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
|
||||
}
|
||||
|
||||
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
|
||||
|
||||
KQ[j_KQ][i_KQ] = sum[i_KQ_0/warp_size][j_KQ_0/nwarps];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
kqmax_new[j0/nwarps] = warp_reduce_max<warp_size>(kqmax_new[j0/nwarps]);
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
|
||||
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
|
||||
|
||||
float kqsum_add = 0.0f;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float diff = KQ[j][i] - kqmax[j0/nwarps];
|
||||
const float val = expf(diff);
|
||||
kqsum_add += val;
|
||||
KQ[j][i] = val;
|
||||
}
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D;
|
||||
static_assert(kq_stride % V_cols_per_iter == 0, "bad V_cols_per_iter");
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < kq_stride; k0 += V_cols_per_iter) {
|
||||
#pragma unroll
|
||||
for (int k1 = 0; k1 < V_cols_per_iter; k1 += nwarps) {
|
||||
const int k_tile = k1 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i];
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
KV_tmp_h2[k_tile*(D/2) + i] = tmp;
|
||||
#else
|
||||
KV_tmp_f2[k_tile*(D/2) + i] = __half22float2(tmp);
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k1 = 0; k1 < V_cols_per_iter; ++k1) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half2 V_k[(D/2)/warp_size];
|
||||
half2 KQ_k[ncols/nwarps];
|
||||
#else
|
||||
float2 V_k[(D/2)/warp_size];
|
||||
float KQ_k[ncols/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
V_k[i0/warp_size] = KV_tmp_h2[k1*(D/2) + i];
|
||||
#else
|
||||
V_k[i0/warp_size] = KV_tmp_f2[k1*(D/2) + i];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const float tmp = KQ[j][k0 + k1];
|
||||
KQ_k[j0/nwarps] = make_half2(tmp, tmp);
|
||||
#else
|
||||
KQ_k[j0/nwarps] = KQ[j][k0 + k1];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
VKQ[j0/nwarps][i0/warp_size] += V_k[i0/warp_size] *KQ_k[j0/nwarps];
|
||||
#else
|
||||
VKQ[j0/nwarps][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0/nwarps];
|
||||
VKQ[j0/nwarps][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0/nwarps];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Attention sink: adjust running max and sum once per head
|
||||
if (sinksf && blockIdx.y == 0) {
|
||||
const float sink = sinksf[head];
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
|
||||
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
|
||||
kqmax_new_j = warp_reduce_max<warp_size>(kqmax_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
|
||||
kqmax[j0/nwarps] = kqmax_new_j;
|
||||
|
||||
const float val = expf(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum[j0/nwarps] += val;
|
||||
}
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
|
||||
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
|
||||
VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
|
||||
}
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
float2 * dst2 = (float2 *) dst;
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
||||
kqsum_j = warp_reduce_sum<warp_size>(kqsum_j);
|
||||
|
||||
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i00 = 0; i00 < D/2; i00 += warp_size) {
|
||||
const int i0 = i00 + threadIdx.x;
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
float2 dst_val = __half22float2(VKQ[j_VKQ_0/nwarps][i0/warp_size]);
|
||||
#else
|
||||
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/warp_size];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
if (gridDim.y == 1) {
|
||||
dst_val.x /= kqsum_j;
|
||||
dst_val.y /= kqsum_j;
|
||||
}
|
||||
dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
|
||||
}
|
||||
|
||||
if (gridDim.y != 1 && threadIdx.x == 0) {
|
||||
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
|
||||
max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
ne00, ne01, ne02, ne03,
|
||||
nb01, nb02, nb03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
ne31, ne32, ne33,
|
||||
nb31, nb32, nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
template <int D, bool use_logit_softcap>
|
||||
static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int warp_size = 32;
|
||||
const int nwarps = FATTN_TILE_NTHREADS / warp_size;
|
||||
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
|
||||
if (Q->ne[1] > 16) {
|
||||
constexpr int cols_per_block = 32;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
|
||||
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size);
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 16;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
|
||||
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size);
|
||||
}
|
||||
|
||||
template <bool use_logit_softcap>
|
||||
static void launch_fattn_tile_switch_head_size(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
launch_fattn_tile_switch_ncols< 64, use_logit_softcap>(ctx, dst);
|
||||
} break;
|
||||
case 128: {
|
||||
launch_fattn_tile_switch_ncols<128, use_logit_softcap>(ctx, dst);
|
||||
} break;
|
||||
case 256: {
|
||||
launch_fattn_tile_switch_ncols<256, use_logit_softcap>(ctx, dst);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("Unsupported head size");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_switch_head_size<use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_switch_head_size<use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -1,8 +1,7 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-mma-f16.cuh"
|
||||
#include "fattn-tile-f16.cuh"
|
||||
#include "fattn-tile-f32.cuh"
|
||||
#include "fattn-tile.cuh"
|
||||
#include "fattn-vec-f16.cuh"
|
||||
#include "fattn-vec-f32.cuh"
|
||||
#include "fattn-wmma-f16.cuh"
|
||||
@@ -271,8 +270,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
|
||||
// Best FlashAttention kernel for a specific GPU:
|
||||
enum best_fattn_kernel {
|
||||
BEST_FATTN_KERNEL_NONE = 0,
|
||||
BEST_FATTN_KERNEL_TILE_F32 = 200,
|
||||
BEST_FATTN_KERNEL_TILE_F16 = 210,
|
||||
BEST_FATTN_KERNEL_TILE = 200,
|
||||
BEST_FATTN_KERNEL_VEC_F32 = 100,
|
||||
BEST_FATTN_KERNEL_VEC_F16 = 110,
|
||||
BEST_FATTN_KERNEL_WMMA_F16 = 300,
|
||||
@@ -411,10 +409,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
}
|
||||
|
||||
// If there is no suitable kernel for tensor cores or small batch sizes, use the generic kernel for large batch sizes:
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
return BEST_FATTN_KERNEL_TILE_F16;
|
||||
}
|
||||
return BEST_FATTN_KERNEL_TILE_F32;
|
||||
return BEST_FATTN_KERNEL_TILE;
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -422,11 +417,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) {
|
||||
case BEST_FATTN_KERNEL_NONE:
|
||||
GGML_ABORT("fatal error");
|
||||
case BEST_FATTN_KERNEL_TILE_F32:
|
||||
ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
|
||||
break;
|
||||
case BEST_FATTN_KERNEL_TILE_F16:
|
||||
ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
|
||||
case BEST_FATTN_KERNEL_TILE:
|
||||
ggml_cuda_flash_attn_ext_tile(ctx, dst);
|
||||
break;
|
||||
case BEST_FATTN_KERNEL_VEC_F32:
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
|
||||
+53
-67
@@ -141,9 +141,10 @@ template <ggml_type type, int ncols_dst>
|
||||
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst,
|
||||
const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst,
|
||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
||||
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
|
||||
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
|
||||
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
@@ -161,12 +162,12 @@ static __global__ void mul_mat_vec_q(
|
||||
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
|
||||
|
||||
// The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
|
||||
const int channel_dst = blockIdx.y;
|
||||
const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio;
|
||||
const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst;
|
||||
const int sample_dst = blockIdx.z;
|
||||
const int sample_x = sample_dst / sample_ratio;
|
||||
const int sample_y = sample_dst;
|
||||
const uint32_t channel_dst = blockIdx.y;
|
||||
const uint32_t channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
|
||||
const uint32_t channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
|
||||
const uint32_t sample_dst = blockIdx.z;
|
||||
const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
|
||||
const uint32_t sample_y = sample_dst;
|
||||
|
||||
// partial sum for each thread
|
||||
float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}};
|
||||
@@ -247,8 +248,9 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
|
||||
GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE);
|
||||
|
||||
const int channel_ratio = nchannels_dst / nchannels_x;
|
||||
const int sample_ratio = nsamples_dst / nsamples_x;
|
||||
const uint3 nchannels_y_fd = ids ? init_fastdiv_values(nchannels_y) : make_uint3(0, 0, 0);
|
||||
const uint3 channel_ratio_fd = ids ? make_uint3(0, 0, 0) : init_fastdiv_values(nchannels_dst / nchannels_x);
|
||||
const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x);
|
||||
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
@@ -256,86 +258,70 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
|
||||
GGML_ASSERT(!ids || ncols_dst == 1);
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
{
|
||||
case 1: {
|
||||
constexpr int c_ncols_dst = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 2: {
|
||||
constexpr int c_ncols_dst = 2;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 3: {
|
||||
constexpr int c_ncols_dst = 3;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 4: {
|
||||
constexpr int c_ncols_dst = 4;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 5:
|
||||
{
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 5: {
|
||||
constexpr int c_ncols_dst = 5;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 6:
|
||||
{
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 6: {
|
||||
constexpr int c_ncols_dst = 6;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 7:
|
||||
{
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 7: {
|
||||
constexpr int c_ncols_dst = 7;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
case 8:
|
||||
{
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
case 8: {
|
||||
constexpr int c_ncols_dst = 8;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
break;
|
||||
}
|
||||
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
|
||||
@@ -1,26 +1,27 @@
|
||||
#include "quantize.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
__launch_bounds__(CUDA_QUANTIZE_BLOCK_SIZE, 1)
|
||||
static __global__ void quantize_q8_1(
|
||||
const float * __restrict__ x, void * __restrict__ vy,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const int ne1, const int ne2) {
|
||||
const int64_t ne0, const uint32_t ne1, const uint3 ne2) {
|
||||
const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i0 >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i3 = fastdiv(blockIdx.z, ne2);
|
||||
const int64_t i2 = blockIdx.z - i3*ne2.z;
|
||||
const int64_t i1 = blockIdx.y;
|
||||
const int64_t i2 = blockIdx.z % ne2;
|
||||
const int64_t i3 = blockIdx.z / ne2;
|
||||
|
||||
const int64_t & i00 = i0;
|
||||
const int64_t & i01 = i1;
|
||||
const int64_t & i02 = i2;
|
||||
const int64_t & i03 = i3;
|
||||
|
||||
const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0;
|
||||
const int64_t i_cont = ((i3*ne2.z + i2) * ne1 + i1) * ne0 + i0;
|
||||
|
||||
block_q8_1 * y = (block_q8_1 *) vy;
|
||||
|
||||
@@ -31,10 +32,10 @@ static __global__ void quantize_q8_1(
|
||||
float amax = fabsf(xi);
|
||||
float sum = xi;
|
||||
|
||||
amax = warp_reduce_max(amax);
|
||||
sum = warp_reduce_sum(sum);
|
||||
amax = warp_reduce_max<QK8_1>(amax);
|
||||
sum = warp_reduce_sum<QK8_1>(sum);
|
||||
|
||||
const float d = amax / 127;
|
||||
const float d = amax / 127.0f;
|
||||
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
|
||||
|
||||
y[ib].qs[iqs] = q;
|
||||
@@ -43,8 +44,7 @@ static __global__ void quantize_q8_1(
|
||||
return;
|
||||
}
|
||||
|
||||
reinterpret_cast<half&>(y[ib].ds.x) = d;
|
||||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
y[ib].ds = make_half2(d, sum);
|
||||
}
|
||||
|
||||
template <mmq_q8_1_ds_layout ds_layout>
|
||||
@@ -152,10 +152,12 @@ void quantize_row_q8_1_cuda(
|
||||
GGML_ASSERT(!ids);
|
||||
GGML_ASSERT(ne0 % QK8_1 == 0);
|
||||
|
||||
const uint3 ne2_fastdiv = init_fastdiv_values(ne2);
|
||||
|
||||
const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||
const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
|
||||
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
|
||||
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv);
|
||||
GGML_UNUSED(type_src0);
|
||||
}
|
||||
|
||||
|
||||
@@ -407,6 +407,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_4,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_6,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_8,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_10,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16,
|
||||
@@ -1439,6 +1440,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_4, mul_mm_id_map0_f16_ne20_4, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_6, mul_mm_id_map0_f16_ne20_6, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_8, mul_mm_id_map0_f16_ne20_8, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_10, mul_mm_id_map0_f16_ne20_10, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_16, mul_mm_id_map0_f16_ne20_16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, mul_mm_id_f32_f16, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, mul_mm_id_f16_f16, has_simdgroup_mm);
|
||||
@@ -3979,6 +3981,7 @@ static int ggml_metal_encode_node(
|
||||
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_4 ].pipeline; break;
|
||||
case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_6 ].pipeline; break;
|
||||
case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_8 ].pipeline; break;
|
||||
case 10: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_10].pipeline; break;
|
||||
case 16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_16].pipeline; break;
|
||||
default: GGML_ABORT("missing specialization for ne20 = %d", (int) ne20);
|
||||
}
|
||||
|
||||
@@ -7618,6 +7618,7 @@ template [[host_name("kernel_mul_mm_id_map0_f16_ne20_2" )]] kernel kernel_mul_mm
|
||||
template [[host_name("kernel_mul_mm_id_map0_f16_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_f16_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_f16_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_f16_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_f16_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
|
||||
|
||||
template<typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
|
||||
|
||||
@@ -1154,17 +1154,15 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
||||
webgpu_context ctx = reg_ctx->webgpu_ctx;
|
||||
|
||||
wgpu::RequestAdapterOptions options = {};
|
||||
auto callback =
|
||||
[](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message, void * userdata) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
*static_cast<wgpu::Adapter *>(userdata) = std::move(adapter);
|
||||
};
|
||||
void * userdata = &ctx->adapter;
|
||||
ctx->instance.WaitAny(
|
||||
ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous, callback, userdata), UINT64_MAX);
|
||||
ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
ctx->adapter = std::move(adapter);
|
||||
}), UINT64_MAX);
|
||||
GGML_ASSERT(ctx->adapter != nullptr);
|
||||
|
||||
ctx->adapter.GetLimits(&ctx->limits);
|
||||
|
||||
+102
-27
@@ -1166,50 +1166,51 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
|
||||
ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const
|
||||
}
|
||||
|
||||
struct gguf_writer {
|
||||
std::vector<int8_t> & buf;
|
||||
struct gguf_writer_base {
|
||||
size_t written_bytes {0u};
|
||||
|
||||
gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}
|
||||
~gguf_writer_base(void) {}
|
||||
|
||||
// we bet on devirtualization
|
||||
virtual void write(int8_t val) = 0;
|
||||
virtual void write(const std::vector<int8_t> & val) = 0;
|
||||
virtual void write_tensor_data(const struct gguf_tensor_info & info, size_t offset_data, size_t alignment) = 0;
|
||||
|
||||
template <typename T>
|
||||
void write(const T & val) const {
|
||||
void write(const T & val) {
|
||||
for (size_t i = 0; i < sizeof(val); ++i) {
|
||||
buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
|
||||
write(reinterpret_cast<const int8_t *>(&val)[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void write(const std::vector<int8_t> & val) const {
|
||||
buf.insert(buf.end(), val.begin(), val.end());
|
||||
}
|
||||
|
||||
void write(const bool & val) const {
|
||||
void write(const bool & val) {
|
||||
const int8_t val8 = val ? 1 : 0;
|
||||
write(val8);
|
||||
}
|
||||
|
||||
void write(const std::string & val) const {
|
||||
void write(const std::string & val) {
|
||||
{
|
||||
const uint64_t n = val.length();
|
||||
write(n);
|
||||
}
|
||||
for (size_t i = 0; i < val.length(); ++i) {
|
||||
buf.push_back(reinterpret_cast<const int8_t *>(val.data())[i]);
|
||||
write((val.data())[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void write(const char * val) const {
|
||||
void write(const char * val) {
|
||||
write(std::string(val));
|
||||
}
|
||||
|
||||
void write(const enum ggml_type & val) const {
|
||||
void write(const enum ggml_type & val) {
|
||||
write(int32_t(val));
|
||||
}
|
||||
|
||||
void write(const enum gguf_type & val) const {
|
||||
void write(const enum gguf_type & val) {
|
||||
write(int32_t(val));
|
||||
}
|
||||
|
||||
void write(const struct gguf_kv & kv) const {
|
||||
void write(const struct gguf_kv & kv) {
|
||||
const uint64_t ne = kv.get_ne();
|
||||
|
||||
write(kv.get_key());
|
||||
@@ -1250,7 +1251,7 @@ struct gguf_writer {
|
||||
}
|
||||
}
|
||||
|
||||
void write_tensor_meta(const struct gguf_tensor_info & info) const {
|
||||
void write_tensor_meta(const struct gguf_tensor_info & info) {
|
||||
write(info.t.name);
|
||||
|
||||
const uint32_t n_dims = ggml_n_dims(&info.t);
|
||||
@@ -1263,14 +1264,33 @@ struct gguf_writer {
|
||||
write(info.offset);
|
||||
}
|
||||
|
||||
void pad(const size_t alignment) const {
|
||||
while (buf.size() % alignment != 0) {
|
||||
void pad(const size_t alignment) {
|
||||
while (written_bytes % alignment != 0) {
|
||||
const int8_t zero = 0;
|
||||
write(zero);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) const {
|
||||
// vector buffer based writer
|
||||
struct gguf_writer_buf final : public gguf_writer_base {
|
||||
std::vector<int8_t> & buf;
|
||||
|
||||
gguf_writer_buf(std::vector<int8_t> & buf) : buf(buf) {}
|
||||
|
||||
using gguf_writer_base::write;
|
||||
|
||||
void write(const int8_t val) override {
|
||||
buf.push_back(val);
|
||||
written_bytes++;
|
||||
}
|
||||
|
||||
void write(const std::vector<int8_t> & val) override {
|
||||
buf.insert(buf.end(), val.begin(), val.end());
|
||||
written_bytes += val.size();
|
||||
}
|
||||
|
||||
void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) override {
|
||||
GGML_ASSERT(buf.size() - offset_data == info.offset);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(&info.t));
|
||||
@@ -1284,14 +1304,58 @@ struct gguf_writer {
|
||||
GGML_ASSERT(info.t.data);
|
||||
memcpy(buf.data() + offset, info.t.data, nbytes);
|
||||
}
|
||||
written_bytes += nbytes;
|
||||
|
||||
pad(alignment);
|
||||
}
|
||||
};
|
||||
|
||||
void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta) {
|
||||
const struct gguf_writer gw(buf);
|
||||
// file based writer
|
||||
struct gguf_writer_file final : public gguf_writer_base {
|
||||
FILE * file;
|
||||
|
||||
gguf_writer_file(FILE* file) : file(file) {}
|
||||
|
||||
using gguf_writer_base::write;
|
||||
|
||||
void write(const int8_t val) override {
|
||||
const auto real_val = static_cast<uint8_t>(val);
|
||||
const auto ret = fputc(real_val, file);
|
||||
written_bytes++;
|
||||
if (ret != real_val) {
|
||||
throw std::runtime_error("unexpected fputc result '" + std::to_string(ret) + "' instead of '" + std::to_string((int)real_val) + "'");
|
||||
}
|
||||
}
|
||||
|
||||
void write(const std::vector<int8_t> & val) override {
|
||||
const auto ret = fwrite(val.data(), 1, val.size(), file);
|
||||
written_bytes += val.size();
|
||||
if (ret != val.size()) {
|
||||
throw std::runtime_error("unexpected fwrite number of bytes written, '" + std::to_string(ret) + "' instead of '" + std::to_string(val.size()) + "'");
|
||||
}
|
||||
}
|
||||
|
||||
void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) override {
|
||||
GGML_ASSERT(written_bytes - offset_data == info.offset);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(&info.t));
|
||||
const size_t nbytes = ggml_nbytes(&info.t);
|
||||
|
||||
std::vector<int8_t> buf(nbytes);
|
||||
if (info.t.buffer) {
|
||||
ggml_backend_tensor_get(&info.t, buf.data(), 0, nbytes);
|
||||
} else {
|
||||
GGML_ASSERT(info.t.data);
|
||||
memcpy(buf.data(), info.t.data, nbytes);
|
||||
}
|
||||
write(buf);
|
||||
|
||||
pad(alignment);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename writer_t>
|
||||
static void gguf_write_out(const struct gguf_context * ctx, writer_t & gw, bool only_meta) {
|
||||
const int64_t n_kv = gguf_get_n_kv(ctx);
|
||||
const int64_t n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
@@ -1321,7 +1385,7 @@ void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & bu
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t offset_data = gw.buf.size();
|
||||
const size_t offset_data = gw.written_bytes;
|
||||
|
||||
// write tensor data
|
||||
for (int64_t i = 0; i < n_tensors; ++i) {
|
||||
@@ -1329,6 +1393,11 @@ void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & bu
|
||||
}
|
||||
}
|
||||
|
||||
void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta) {
|
||||
gguf_writer_buf gw(buf);
|
||||
gguf_write_out(ctx, gw, only_meta);
|
||||
}
|
||||
|
||||
bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
|
||||
FILE * file = ggml_fopen(fname, "wb");
|
||||
|
||||
@@ -1337,11 +1406,17 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<int8_t> buf;
|
||||
gguf_write_to_buf(ctx, buf, only_meta);
|
||||
const bool ok = fwrite(buf.data(), 1, buf.size(), file) == buf.size();
|
||||
try {
|
||||
gguf_writer_file gw(file);
|
||||
gguf_write_out(ctx, gw, only_meta);
|
||||
} catch (const std::runtime_error& ex) {
|
||||
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what());
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
return ok;
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
|
||||
|
||||
@@ -231,10 +231,11 @@ class Keys:
|
||||
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
|
||||
|
||||
class Adapter:
|
||||
TYPE = "adapter.type"
|
||||
LORA_ALPHA = "adapter.lora.alpha"
|
||||
LORA_TASK_NAME = "adapter.lora.task_name"
|
||||
LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix"
|
||||
TYPE = "adapter.type"
|
||||
LORA_ALPHA = "adapter.lora.alpha"
|
||||
LORA_TASK_NAME = "adapter.lora.task_name"
|
||||
LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix"
|
||||
ALORA_INVOCATION_TOKENS = "adapter.alora.invocation_tokens"
|
||||
|
||||
class IMatrix:
|
||||
CHUNK_COUNT = "imatrix.chunk_count"
|
||||
@@ -340,6 +341,7 @@ class MODEL_ARCH(IntEnum):
|
||||
GEMMA2 = auto()
|
||||
GEMMA3 = auto()
|
||||
GEMMA3N = auto()
|
||||
GEMMA_EMBEDDING = auto()
|
||||
STARCODER2 = auto()
|
||||
RWKV6 = auto()
|
||||
RWKV6QWEN2 = auto()
|
||||
@@ -674,6 +676,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.GEMMA2: "gemma2",
|
||||
MODEL_ARCH.GEMMA3: "gemma3",
|
||||
MODEL_ARCH.GEMMA3N: "gemma3n",
|
||||
MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.RWKV6: "rwkv6",
|
||||
MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
|
||||
@@ -1719,6 +1722,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.LAUREL_R,
|
||||
MODEL_TENSOR.LAUREL_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.GEMMA_EMBEDDING: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
MODEL_TENSOR.FFN_PRE_NORM,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.STARCODER2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -14,6 +14,7 @@ class TensorNameMap:
|
||||
"transformer.word_embeddings", # falcon
|
||||
"word_embeddings", # bloom
|
||||
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
|
||||
"embed_tokens", # embeddinggemma
|
||||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert nomic-bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
@@ -141,6 +142,7 @@ class TensorNameMap:
|
||||
"rwkv.blocks.{bid}.ln1", # rwkv6
|
||||
"model.layers.{bid}.ln1", # rwkv7
|
||||
"model.layers.{bid}.input_layernorm", # llama4
|
||||
"layers.{bid}.input_layernorm", # embeddinggemma
|
||||
"transformer_encoder.{bid}.attention_norm", # neobert
|
||||
"model.layers.{bid}.operator_norm", # lfm2
|
||||
"model.transformer.blocks.{bid}.attn_norm", # llada
|
||||
@@ -179,6 +181,7 @@ class TensorNameMap:
|
||||
# Attention query
|
||||
MODEL_TENSOR.ATTN_Q: (
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 phimoe
|
||||
"layers.{bid}.self_attn.q_proj", # embeddinggemma
|
||||
"model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
|
||||
"layers.{bid}.attention.wq", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.query", # bert
|
||||
@@ -197,6 +200,7 @@ class TensorNameMap:
|
||||
# Attention key
|
||||
MODEL_TENSOR.ATTN_K: (
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 phimoe
|
||||
"layers.{bid}.self_attn.k_proj", # embeddinggemma
|
||||
"model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
|
||||
"layers.{bid}.attention.wk", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.key", # bert
|
||||
@@ -216,6 +220,7 @@ class TensorNameMap:
|
||||
# Attention value
|
||||
MODEL_TENSOR.ATTN_V: (
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe
|
||||
"layers.{bid}.self_attn.v_proj", # embeddinggemma
|
||||
"layers.{bid}.attention.wv", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.value", # bert
|
||||
"transformer.layer.{bid}.attention.v_lin", # distillbert
|
||||
@@ -239,6 +244,7 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"h.{bid}.self_attention.dense", # bloom
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe
|
||||
"layers.{bid}.self_attn.o_proj", # embeddinggemma
|
||||
"model.layers.{bid}.self_attn.out_proj", # lfm2
|
||||
"model.layers.{bid}.self_attn.linear_attn", # deci
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
@@ -277,6 +283,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.ATTN_POST_NORM: (
|
||||
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
|
||||
"layers.{bid}.post_attention_layernorm", # embeddinggemma
|
||||
"model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
|
||||
"model.layers.layers.{bid}.post_mixer_norm.weight", # plamo2
|
||||
),
|
||||
@@ -320,12 +327,14 @@ class TensorNameMap:
|
||||
# Post feed-forward norm
|
||||
MODEL_TENSOR.FFN_PRE_NORM: (
|
||||
"model.layers.{bid}.pre_feedforward_layernorm", # gemma2
|
||||
"layers.{bid}.pre_feedforward_layernorm", # embeddinggemma
|
||||
"model.layers.{bid}.pre_ff_layernorm.weight",
|
||||
),
|
||||
|
||||
# Post feed-forward norm
|
||||
MODEL_TENSOR.FFN_POST_NORM: (
|
||||
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
|
||||
"layers.{bid}.post_feedforward_layernorm", # embeddinggemma
|
||||
"model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
|
||||
"model.layers.layers.{bid}.post_mlp_norm.weight", # plamo2
|
||||
"model.layers.{bid}.feed_forward.up_proj",
|
||||
@@ -362,6 +371,7 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"h.{bid}.mlp.dense_h_to_4h", # bloom
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
|
||||
"layers.{bid}.mlp.up_proj", # embeddinggemma
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||
"transformer.layer.{bid}.ffn.lin1", # distillbert
|
||||
@@ -421,6 +431,7 @@ class TensorNameMap:
|
||||
# Feed-forward gate
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
|
||||
"layers.{bid}.mlp.gate_proj", # embeddinggemma
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
"transformer.h.{bid}.mlp.w2", # qwen
|
||||
"transformer.h.{bid}.mlp.c_fc2", # jais
|
||||
@@ -461,6 +472,7 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
|
||||
"layers.{bid}.mlp.down_proj", # embeddinggemma
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"transformer.layer.{bid}.ffn.lin2", # distillbert
|
||||
@@ -513,6 +525,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.query_layernorm", # hunyuan
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
|
||||
"layers.{bid}.self_attn.q_norm", # embeddinggemma
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||
@@ -525,6 +538,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.key_layernorm", # hunyuan
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
|
||||
"layers.{bid}.self_attn.k_norm", # embeddinggemma
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||
|
||||
@@ -583,6 +583,10 @@ extern "C" {
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||
|
||||
// Get the invocation tokens if the current lora is an alora
|
||||
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
|
||||
LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
|
||||
|
||||
// The following functions operate on a llama_context, hence the naming: llama_verb_...
|
||||
|
||||
// Add a loaded LoRA adapter to given context
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
{%- set ns = namespace(enable_thinking=true) -%}
|
||||
{%- for message in messages -%}
|
||||
{%- set content = message['content'] -%}
|
||||
{%- if message['role'] == 'user' or message['role'] == 'system' -%}
|
||||
{%- if '/think' in content -%}
|
||||
{%- set ns.enable_thinking = true -%}
|
||||
{%- elif '/no_think' in content -%}
|
||||
{%- set ns.enable_thinking = false -%}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if messages[0]['role'] != 'system' -%}
|
||||
{%- set ns.non_tool_system_content = '' -%}
|
||||
{{- '<SPECIAL_10>System
|
||||
' -}}
|
||||
{%- else -%}
|
||||
{%- set ns.non_tool_system_content = (messages[0]['content'] | default('', true)).replace('/think', '').replace('/no_think', '').strip() -%}
|
||||
{{- '<SPECIAL_10>System
|
||||
' + ns.non_tool_system_content }}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if tools -%}
|
||||
{%- if ns.non_tool_system_content is defined and ns.non_tool_system_content != '' -%}
|
||||
{{- '
|
||||
|
||||
' -}}
|
||||
{%- endif -%}
|
||||
{{- 'You can use the following tools to assist the user if required:' -}}
|
||||
{{- '
|
||||
<AVAILABLE_TOOLS>[' -}}
|
||||
{%- for tool in tools -%}
|
||||
{{- (tool.function if tool.function is defined else tool) | tojson -}}
|
||||
{{- ', ' if not loop.last else '' -}}
|
||||
{%- endfor -%}
|
||||
{{- ']</AVAILABLE_TOOLS>
|
||||
|
||||
' -}}
|
||||
{{- 'If you decide to call any tool(s), use the following format:
|
||||
' -}}
|
||||
{{- '<TOOLCALL>[{{"name": "tool_name1", "arguments": "tool_args1"}}, ' -}}
|
||||
{{- '{{"name": "tool_name2", "arguments": "tool_args2"}}]</TOOLCALL>
|
||||
|
||||
' -}}
|
||||
{{- 'The user will execute tool-calls and return responses from tool(s) in this format:
|
||||
' -}}
|
||||
{{- '<TOOL_RESPONSE>[{{"tool_response1"}}, {{"tool_response2"}}]</TOOL_RESPONSE>
|
||||
|
||||
' -}}
|
||||
{{- 'Based on the tool responses, you can call additional tools if needed, correct tool calls if any errors are found, or just respond to the user.' -}}
|
||||
{%- endif -%}
|
||||
{{- '
|
||||
|
||||
' -}}
|
||||
{%- set messages = messages[1:] if messages[0]['role'] == 'system' else messages -%}
|
||||
{%- if messages[-1]['role'] == 'assistant' -%}
|
||||
{%- set ns.last_turn_assistant_content = (messages[-1]['content'] | default('', true)).strip() -%}
|
||||
{%- set ns.last_turn_assistant_tool_calls = messages[-1]['tool_calls'] if 'tool_calls' in messages[-1] else [] -%}
|
||||
{%- set messages = messages[:-1] -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- for message in messages %}
|
||||
{%- set content = message['content'] %}
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{{- '<SPECIAL_11>User
|
||||
' + (content | default('', true)).replace('/think', '').replace('/no_think', '').strip() + '
|
||||
' }}
|
||||
{%- elif message['role'] == 'tool' -%}
|
||||
{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}
|
||||
{{- '<SPECIAL_11>User
|
||||
' + '<TOOL_RESPONSE>[' }}
|
||||
{%- endif -%}
|
||||
{{- message['content'] -}}
|
||||
{{- ', ' if not loop.last and (messages[loop.index0 + 1].role == 'tool') else '' -}}
|
||||
{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}
|
||||
{{- ']</TOOL_RESPONSE>' -}}
|
||||
{%- endif -%}
|
||||
{%- elif message['role'] == 'assistant' -%}
|
||||
{%- if content and '</think>' in content -%}
|
||||
{%- set content = (content.split('</think>')[1] | default('', true)).strip() %}
|
||||
{%- endif -%}
|
||||
{{- '<SPECIAL_11>Assistant
|
||||
' + ((content | default('', true)).strip() if content is not none else '') }}
|
||||
{%- if message.tool_calls -%}
|
||||
{%- if (content | default('', true)).strip() != '' -%}
|
||||
{{- '
|
||||
' -}}
|
||||
{%- endif -%}
|
||||
{{- '<TOOLCALL>[' -}}
|
||||
{%- for call in message.tool_calls -%}
|
||||
{%- set fn = call.function if call.function is defined else call -%}
|
||||
{{- '{"name": "' + fn.name + '", "arguments": ' -}}
|
||||
{%- if fn.arguments is string -%}
|
||||
{{- fn.arguments -}}
|
||||
{%- else -%}
|
||||
{{- fn.arguments | tojson -}}
|
||||
{%- endif -%}
|
||||
{{- '}' + (', ' if not loop.last else '') -}}
|
||||
{%- endfor -%}
|
||||
{{- ']</TOOLCALL>' -}}
|
||||
{%- endif -%}
|
||||
{{- '
|
||||
<SPECIAL_12>
|
||||
' -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- '<SPECIAL_11>Assistant
|
||||
' -}}
|
||||
{%- if ns.enable_thinking is defined and ns.enable_thinking is false -%}
|
||||
{{- '<think></think>' -}}
|
||||
{%- else -%}
|
||||
{{- '<think>
|
||||
' -}}
|
||||
{%- endif -%}
|
||||
{%- if ns.last_turn_assistant_content is defined and ns.last_turn_assistant_content != '' -%}
|
||||
{{- ns.last_turn_assistant_content -}}
|
||||
{%- endif -%}
|
||||
{%- else -%}
|
||||
{%- if ns.last_turn_assistant_content is defined and ns.last_turn_assistant_content != '' -%}
|
||||
{{- '<SPECIAL_11>Assistant
|
||||
' -}}
|
||||
{%- if ns.enable_thinking is defined and ns.enable_thinking is false -%}
|
||||
{{- '<think></think>' -}}
|
||||
{%- else -%}
|
||||
{{- '<think>
|
||||
' -}}
|
||||
{%- endif -%}
|
||||
{{- ns.last_turn_assistant_content -}}
|
||||
{%- if continue_final_message is defined -%}
|
||||
{%- if continue_final_message is false -%}
|
||||
{{- '
|
||||
<SPECIAL_12>
|
||||
' -}}
|
||||
{%- endif -%}
|
||||
{%- else -%}
|
||||
{{- '
|
||||
<SPECIAL_12>
|
||||
' -}}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- if ns.last_turn_assistant_tool_calls is defined and ns.last_turn_assistant_tool_calls | length > 0 -%}
|
||||
{{- '<SPECIAL_11>Assistant
|
||||
' -}}
|
||||
{{- '<TOOLCALL>[' -}}
|
||||
{%- for call in ns.last_turn_assistant_tool_calls -%}
|
||||
{%- set fn = call.function if call.function is defined else call -%}
|
||||
{{- '{"name": "' + fn.name + '", "arguments": ' -}}
|
||||
{%- if fn.arguments is string -%}
|
||||
{{- fn.arguments -}}
|
||||
{%- else -%}
|
||||
{{- fn.arguments | tojson -}}
|
||||
{%- endif -%}
|
||||
{{- '}' + (', ' if not loop.last else '') -}}
|
||||
{%- endfor -%}
|
||||
{{- ']</TOOLCALL>' -}}
|
||||
{{- '<SPECIAL_12>
|
||||
|
||||
' -}}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
Executable
+504
@@ -0,0 +1,504 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import jinja2.ext as jinja2_ext
|
||||
from PySide6.QtWidgets import (
|
||||
QApplication,
|
||||
QMainWindow,
|
||||
QWidget,
|
||||
QVBoxLayout,
|
||||
QHBoxLayout,
|
||||
QLabel,
|
||||
QPlainTextEdit,
|
||||
QTextEdit,
|
||||
QPushButton,
|
||||
QFileDialog,
|
||||
)
|
||||
from PySide6.QtGui import QColor, QColorConstants, QTextCursor, QTextFormat
|
||||
from PySide6.QtCore import Qt, QRect, QSize
|
||||
from jinja2 import TemplateSyntaxError
|
||||
from jinja2.sandbox import ImmutableSandboxedEnvironment
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def format_template_content(template_content):
|
||||
"""Format the Jinja template content using Jinja2's lexer."""
|
||||
if not template_content.strip():
|
||||
return template_content
|
||||
|
||||
env = ImmutableSandboxedEnvironment()
|
||||
tc_rstrip = template_content.rstrip()
|
||||
tokens = list(env.lex(tc_rstrip))
|
||||
result = ""
|
||||
indent_level = 0
|
||||
i = 0
|
||||
|
||||
while i < len(tokens):
|
||||
token = tokens[i]
|
||||
_, token_type, token_value = token
|
||||
|
||||
if token_type == "block_begin":
|
||||
block_start = i
|
||||
# Collect all tokens for this block construct
|
||||
construct_content = token_value
|
||||
end_token_type = token_type.replace("_begin", "_end")
|
||||
j = i + 1
|
||||
while j < len(tokens) and tokens[j][1] != end_token_type:
|
||||
construct_content += tokens[j][2]
|
||||
j += 1
|
||||
|
||||
if j < len(tokens): # Found the end token
|
||||
construct_content += tokens[j][2]
|
||||
i = j # Skip to the end token
|
||||
|
||||
# Check for control structure keywords for indentation
|
||||
stripped_content = construct_content.strip()
|
||||
instr = block_start + 1
|
||||
while tokens[instr][1] == "whitespace":
|
||||
instr = instr + 1
|
||||
|
||||
instruction_token = tokens[instr][2]
|
||||
start_control_tokens = ["if", "for", "macro", "call", "block"]
|
||||
end_control_tokens = ["end" + t for t in start_control_tokens]
|
||||
is_control_start = any(
|
||||
instruction_token.startswith(kw) for kw in start_control_tokens
|
||||
)
|
||||
is_control_end = any(
|
||||
instruction_token.startswith(kw) for kw in end_control_tokens
|
||||
)
|
||||
|
||||
# Adjust indentation for control structures
|
||||
# For control end blocks, decrease indent BEFORE adding the content
|
||||
if is_control_end:
|
||||
indent_level = max(0, indent_level - 1)
|
||||
|
||||
# Remove all previous whitespace before this block
|
||||
result = result.rstrip()
|
||||
|
||||
# Add proper indent, but only if this is not the first token
|
||||
added_newline = False
|
||||
if result: # Only add newline and indent if there's already content
|
||||
result += (
|
||||
"\n" + " " * indent_level
|
||||
) # Use 2 spaces per indent level
|
||||
added_newline = True
|
||||
else: # For the first token, don't add any indent
|
||||
result += ""
|
||||
|
||||
# Add the block content
|
||||
result += stripped_content
|
||||
|
||||
# Add '-' after '%' if it wasn't there and we added a newline or indent
|
||||
if (
|
||||
added_newline
|
||||
and stripped_content.startswith("{%")
|
||||
and not stripped_content.startswith("{%-")
|
||||
):
|
||||
# Add '-' at the beginning
|
||||
result = (
|
||||
result[: result.rfind("{%")]
|
||||
+ "{%-"
|
||||
+ result[result.rfind("{%") + 2 :]
|
||||
)
|
||||
if stripped_content.endswith("%}") and not stripped_content.endswith(
|
||||
"-%}"
|
||||
):
|
||||
# Only add '-' if this is not the last token or if there's content after
|
||||
if i + 1 < len(tokens) and tokens[i + 1][1] != "eof":
|
||||
result = result[:-2] + "-%}"
|
||||
|
||||
# For control start blocks, increase indent AFTER adding the content
|
||||
if is_control_start:
|
||||
indent_level += 1
|
||||
else:
|
||||
# Malformed template, just add the token
|
||||
result += token_value
|
||||
elif token_type == "variable_begin":
|
||||
# Collect all tokens for this variable construct
|
||||
construct_content = token_value
|
||||
end_token_type = token_type.replace("_begin", "_end")
|
||||
j = i + 1
|
||||
while j < len(tokens) and tokens[j][1] != end_token_type:
|
||||
construct_content += tokens[j][2]
|
||||
j += 1
|
||||
|
||||
if j < len(tokens): # Found the end token
|
||||
construct_content += tokens[j][2]
|
||||
i = j # Skip to the end token
|
||||
|
||||
# For variable constructs, leave them alone
|
||||
# Do not add indent or whitespace before or after them
|
||||
result += construct_content
|
||||
else:
|
||||
# Malformed template, just add the token
|
||||
result += token_value
|
||||
elif token_type == "data":
|
||||
# Handle data (text between Jinja constructs)
|
||||
# For data content, preserve it as is
|
||||
result += token_value
|
||||
else:
|
||||
# Handle any other tokens
|
||||
result += token_value
|
||||
|
||||
i += 1
|
||||
|
||||
# Clean up trailing newlines and spaces
|
||||
result = result.rstrip()
|
||||
|
||||
# Copy the newline / space count from the original
|
||||
if (trailing_length := len(template_content) - len(tc_rstrip)):
|
||||
result += template_content[-trailing_length:]
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ------------------------
|
||||
# Line Number Widget
|
||||
# ------------------------
|
||||
class LineNumberArea(QWidget):
|
||||
def __init__(self, editor):
|
||||
super().__init__(editor)
|
||||
self.code_editor = editor
|
||||
|
||||
def sizeHint(self):
|
||||
return QSize(self.code_editor.line_number_area_width(), 0)
|
||||
|
||||
def paintEvent(self, event):
|
||||
self.code_editor.line_number_area_paint_event(event)
|
||||
|
||||
|
||||
class CodeEditor(QPlainTextEdit):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.line_number_area = LineNumberArea(self)
|
||||
|
||||
self.blockCountChanged.connect(self.update_line_number_area_width)
|
||||
self.updateRequest.connect(self.update_line_number_area)
|
||||
self.cursorPositionChanged.connect(self.highlight_current_line)
|
||||
|
||||
self.update_line_number_area_width(0)
|
||||
self.highlight_current_line()
|
||||
|
||||
def line_number_area_width(self):
|
||||
digits = len(str(self.blockCount()))
|
||||
space = 3 + self.fontMetrics().horizontalAdvance("9") * digits
|
||||
return space
|
||||
|
||||
def update_line_number_area_width(self, _):
|
||||
self.setViewportMargins(self.line_number_area_width(), 0, 0, 0)
|
||||
|
||||
def update_line_number_area(self, rect, dy):
|
||||
if dy:
|
||||
self.line_number_area.scroll(0, dy)
|
||||
else:
|
||||
self.line_number_area.update(
|
||||
0, rect.y(), self.line_number_area.width(), rect.height()
|
||||
)
|
||||
|
||||
if rect.contains(self.viewport().rect()):
|
||||
self.update_line_number_area_width(0)
|
||||
|
||||
def resizeEvent(self, event):
|
||||
super().resizeEvent(event)
|
||||
cr = self.contentsRect()
|
||||
self.line_number_area.setGeometry(
|
||||
QRect(cr.left(), cr.top(), self.line_number_area_width(), cr.height())
|
||||
)
|
||||
|
||||
def line_number_area_paint_event(self, event):
|
||||
from PySide6.QtGui import QPainter
|
||||
|
||||
painter = QPainter(self.line_number_area)
|
||||
painter.fillRect(event.rect(), QColorConstants.LightGray)
|
||||
|
||||
block = self.firstVisibleBlock()
|
||||
block_number = block.blockNumber()
|
||||
top = int(
|
||||
self.blockBoundingGeometry(block).translated(self.contentOffset()).top()
|
||||
)
|
||||
bottom = top + int(self.blockBoundingRect(block).height())
|
||||
|
||||
while block.isValid() and top <= event.rect().bottom():
|
||||
if block.isVisible() and bottom >= event.rect().top():
|
||||
number = str(block_number + 1)
|
||||
painter.setPen(QColorConstants.Black)
|
||||
painter.drawText(
|
||||
0,
|
||||
top,
|
||||
self.line_number_area.width() - 2,
|
||||
self.fontMetrics().height(),
|
||||
Qt.AlignmentFlag.AlignRight,
|
||||
number,
|
||||
)
|
||||
block = block.next()
|
||||
top = bottom
|
||||
bottom = top + int(self.blockBoundingRect(block).height())
|
||||
block_number += 1
|
||||
|
||||
def highlight_current_line(self):
|
||||
extra_selections = []
|
||||
if not self.isReadOnly():
|
||||
selection = QTextEdit.ExtraSelection()
|
||||
line_color = QColorConstants.Yellow.lighter(160)
|
||||
selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue]
|
||||
selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue]
|
||||
selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue]
|
||||
selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue]
|
||||
extra_selections.append(selection)
|
||||
self.setExtraSelections(extra_selections)
|
||||
|
||||
def highlight_position(self, lineno: int, col: int, color: QColor):
|
||||
block = self.document().findBlockByLineNumber(lineno - 1)
|
||||
if block.isValid():
|
||||
cursor = QTextCursor(block)
|
||||
text = block.text()
|
||||
start = block.position() + max(0, col - 1)
|
||||
cursor.setPosition(start)
|
||||
if col <= len(text):
|
||||
cursor.movePosition(
|
||||
QTextCursor.MoveOperation.NextCharacter,
|
||||
QTextCursor.MoveMode.KeepAnchor,
|
||||
)
|
||||
|
||||
extra = QTextEdit.ExtraSelection()
|
||||
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue]
|
||||
extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue]
|
||||
|
||||
self.setExtraSelections(self.extraSelections() + [extra])
|
||||
|
||||
def highlight_line(self, lineno: int, color: QColor):
|
||||
block = self.document().findBlockByLineNumber(lineno - 1)
|
||||
if block.isValid():
|
||||
cursor = QTextCursor(block)
|
||||
cursor.select(QTextCursor.SelectionType.LineUnderCursor)
|
||||
|
||||
extra = QTextEdit.ExtraSelection()
|
||||
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue]
|
||||
extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue]
|
||||
|
||||
self.setExtraSelections(self.extraSelections() + [extra])
|
||||
|
||||
def clear_highlighting(self):
|
||||
self.highlight_current_line()
|
||||
|
||||
|
||||
# ------------------------
|
||||
# Main App
|
||||
# ------------------------
|
||||
class JinjaTester(QMainWindow):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.setWindowTitle("Jinja Template Tester")
|
||||
self.resize(1200, 800)
|
||||
|
||||
central = QWidget()
|
||||
main_layout = QVBoxLayout(central)
|
||||
|
||||
# -------- Top input area --------
|
||||
input_layout = QHBoxLayout()
|
||||
|
||||
# Template editor with label
|
||||
template_layout = QVBoxLayout()
|
||||
template_label = QLabel("Jinja2 Template")
|
||||
template_layout.addWidget(template_label)
|
||||
self.template_edit = CodeEditor()
|
||||
template_layout.addWidget(self.template_edit)
|
||||
input_layout.addLayout(template_layout)
|
||||
|
||||
# JSON editor with label
|
||||
json_layout = QVBoxLayout()
|
||||
json_label = QLabel("Context (JSON)")
|
||||
json_layout.addWidget(json_label)
|
||||
self.json_edit = CodeEditor()
|
||||
self.json_edit.setPlainText("""
|
||||
{
|
||||
"add_generation_prompt": true,
|
||||
"bos_token": "",
|
||||
"eos_token": "",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the capital of Poland?"
|
||||
}
|
||||
]
|
||||
}
|
||||
""".strip())
|
||||
json_layout.addWidget(self.json_edit)
|
||||
input_layout.addLayout(json_layout)
|
||||
|
||||
main_layout.addLayout(input_layout)
|
||||
|
||||
# -------- Rendered output area --------
|
||||
output_label = QLabel("Rendered Output")
|
||||
main_layout.addWidget(output_label)
|
||||
self.output_edit = QPlainTextEdit()
|
||||
self.output_edit.setReadOnly(True)
|
||||
main_layout.addWidget(self.output_edit)
|
||||
|
||||
# -------- Render button and status --------
|
||||
btn_layout = QHBoxLayout()
|
||||
|
||||
# Load template button
|
||||
self.load_btn = QPushButton("Load Template")
|
||||
self.load_btn.clicked.connect(self.load_template)
|
||||
btn_layout.addWidget(self.load_btn)
|
||||
|
||||
# Format template button
|
||||
self.format_btn = QPushButton("Format")
|
||||
self.format_btn.clicked.connect(self.format_template)
|
||||
btn_layout.addWidget(self.format_btn)
|
||||
|
||||
self.render_btn = QPushButton("Render")
|
||||
self.render_btn.clicked.connect(self.render_template)
|
||||
btn_layout.addWidget(self.render_btn)
|
||||
main_layout.addLayout(btn_layout)
|
||||
|
||||
# Status label below buttons
|
||||
self.status_label = QLabel("Ready")
|
||||
main_layout.addWidget(self.status_label)
|
||||
|
||||
self.setCentralWidget(central)
|
||||
|
||||
def render_template(self):
|
||||
self.template_edit.clear_highlighting()
|
||||
self.output_edit.clear()
|
||||
|
||||
template_str = self.template_edit.toPlainText()
|
||||
json_str = self.json_edit.toPlainText()
|
||||
|
||||
# Parse JSON context
|
||||
try:
|
||||
context = json.loads(json_str) if json_str.strip() else {}
|
||||
except Exception as e:
|
||||
self.status_label.setText(f"❌ JSON Error: {e}")
|
||||
return
|
||||
|
||||
def raise_exception(text: str) -> str:
|
||||
raise RuntimeError(text)
|
||||
|
||||
env = ImmutableSandboxedEnvironment(
|
||||
trim_blocks=True,
|
||||
lstrip_blocks=True,
|
||||
extensions=[jinja2_ext.loopcontrols],
|
||||
)
|
||||
env.filters["tojson"] = (
|
||||
lambda x,
|
||||
indent=None,
|
||||
separators=None,
|
||||
sort_keys=False,
|
||||
ensure_ascii=False: json.dumps(
|
||||
x,
|
||||
indent=indent,
|
||||
separators=separators,
|
||||
sort_keys=sort_keys,
|
||||
ensure_ascii=ensure_ascii,
|
||||
)
|
||||
)
|
||||
env.globals["strftime_now"] = lambda format: datetime.now().strftime(format)
|
||||
env.globals["raise_exception"] = raise_exception
|
||||
try:
|
||||
template = env.from_string(template_str)
|
||||
output = template.render(context)
|
||||
self.output_edit.setPlainText(output)
|
||||
self.status_label.setText("✅ Render successful")
|
||||
except TemplateSyntaxError as e:
|
||||
self.status_label.setText(f"❌ Syntax Error (line {e.lineno}): {e.message}")
|
||||
if e.lineno:
|
||||
self.template_edit.highlight_line(e.lineno, QColor("red"))
|
||||
except Exception as e:
|
||||
# Catch all runtime errors
|
||||
# Try to extract template line number
|
||||
lineno = None
|
||||
tb = e.__traceback__
|
||||
while tb:
|
||||
frame = tb.tb_frame
|
||||
if frame.f_code.co_filename == "<template>":
|
||||
lineno = tb.tb_lineno
|
||||
break
|
||||
tb = tb.tb_next
|
||||
|
||||
error_msg = f"Runtime Error: {type(e).__name__}: {e}"
|
||||
if lineno:
|
||||
error_msg = f"Runtime Error at line {lineno} in template: {type(e).__name__}: {e}"
|
||||
self.template_edit.highlight_line(lineno, QColor("orange"))
|
||||
|
||||
self.output_edit.setPlainText(error_msg)
|
||||
self.status_label.setText(f"❌ {error_msg}")
|
||||
|
||||
def load_template(self):
|
||||
"""Load a Jinja template from a file using a file dialog."""
|
||||
file_path, _ = QFileDialog.getOpenFileName(
|
||||
self,
|
||||
"Load Jinja Template",
|
||||
"",
|
||||
"Template Files (*.jinja *.j2 *.html *.txt);;All Files (*)",
|
||||
)
|
||||
|
||||
if file_path:
|
||||
try:
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
content = file.read()
|
||||
self.template_edit.setPlainText(content)
|
||||
self.status_label.setText(f"✅ Loaded template from {file_path}")
|
||||
except Exception as e:
|
||||
self.status_label.setText(f"❌ Error loading file: {str(e)}")
|
||||
|
||||
def format_template(self):
|
||||
"""Format the Jinja template using Jinja2's lexer for proper parsing."""
|
||||
try:
|
||||
template_content = self.template_edit.toPlainText()
|
||||
if not template_content.strip():
|
||||
self.status_label.setText("⚠️ Template is empty")
|
||||
return
|
||||
|
||||
formatted_content = format_template_content(template_content)
|
||||
self.template_edit.setPlainText(formatted_content)
|
||||
self.status_label.setText("✅ Template formatted")
|
||||
except Exception as e:
|
||||
self.status_label.setText(f"❌ Error formatting template: {str(e)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1:
|
||||
# CLI mode
|
||||
parser = argparse.ArgumentParser(description="Jinja Template Tester")
|
||||
parser.add_argument(
|
||||
"--template", required=True, help="Path to Jinja template file"
|
||||
)
|
||||
parser.add_argument("--context", required=True, help="JSON string for context")
|
||||
parser.add_argument(
|
||||
"--action",
|
||||
choices=["format", "render"],
|
||||
default="render",
|
||||
help="Action to perform",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load template
|
||||
with open(args.template, "r", encoding="utf-8") as f:
|
||||
template_content = f.read()
|
||||
|
||||
# Load JSON
|
||||
context = json.loads(args.context)
|
||||
# Add missing variables
|
||||
context.setdefault("bos_token", "")
|
||||
context.setdefault("eos_token", "")
|
||||
context.setdefault("add_generation_prompt", False)
|
||||
|
||||
env = ImmutableSandboxedEnvironment()
|
||||
|
||||
if args.action == "format":
|
||||
formatted = format_template_content(template_content)
|
||||
print(formatted) # noqa: NP100
|
||||
elif args.action == "render":
|
||||
template = env.from_string(template_content)
|
||||
output = template.render(context)
|
||||
print(output) # noqa: NP100
|
||||
|
||||
else:
|
||||
# GUI mode
|
||||
app = QApplication(sys.argv)
|
||||
window = JinjaTester()
|
||||
window.show()
|
||||
sys.exit(app.exec())
|
||||
@@ -0,0 +1,2 @@
|
||||
PySide6
|
||||
jinja2
|
||||
@@ -53,7 +53,7 @@ import typer
|
||||
sys.path.insert(0, Path(__file__).parent.parent.as_posix())
|
||||
if True:
|
||||
from tools.server.tests.utils import ServerProcess
|
||||
from tools.server.tests.unit.test_tool_call import TIMEOUT_SERVER_START, do_test_calc_result, do_test_hello_world, do_test_weather
|
||||
from tools.server.tests.unit.test_tool_call import do_test_calc_result, do_test_hello_world, do_test_weather
|
||||
|
||||
|
||||
@contextmanager
|
||||
@@ -335,7 +335,7 @@ def run(
|
||||
# server.debug = True
|
||||
|
||||
with scoped_server(server):
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=15 * 60)
|
||||
for ignore_chat_grammar in [False]:
|
||||
run(
|
||||
server,
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
#include <map>
|
||||
#include <cassert>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
// vec
|
||||
@@ -215,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
||||
}
|
||||
|
||||
adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
|
||||
|
||||
// parse alora invocation sequence vector
|
||||
const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
|
||||
const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
|
||||
if (kid >= 0) {
|
||||
if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
|
||||
throw std::runtime_error("invalid gguf type for " + key);
|
||||
}
|
||||
const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
|
||||
if (arr_type != GGUF_TYPE_UINT32) {
|
||||
throw std::runtime_error("invalid gguf element type for " + key);
|
||||
}
|
||||
const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
|
||||
adapter.alora_invocation_tokens.resize(seq_len);
|
||||
std::copy(
|
||||
(const llama_token *)data,
|
||||
(const llama_token *)data + seq_len,
|
||||
adapter.alora_invocation_tokens.begin());
|
||||
}
|
||||
}
|
||||
|
||||
int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
|
||||
@@ -450,3 +471,15 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
|
||||
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
|
||||
delete adapter;
|
||||
}
|
||||
|
||||
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
|
||||
if (!adapter) {
|
||||
return 0;
|
||||
}
|
||||
return adapter->alora_invocation_tokens.size();
|
||||
}
|
||||
|
||||
const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
|
||||
GGML_ASSERT(adapter);
|
||||
return adapter->alora_invocation_tokens.data();
|
||||
}
|
||||
|
||||
@@ -70,6 +70,9 @@ struct llama_adapter_lora {
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
// activated lora (aLoRA)
|
||||
std::vector<llama_token> alora_invocation_tokens;
|
||||
|
||||
llama_adapter_lora() = default;
|
||||
~llama_adapter_lora() = default;
|
||||
|
||||
|
||||
+27
-4
@@ -45,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_GEMMA2, "gemma2" },
|
||||
{ LLM_ARCH_GEMMA3, "gemma3" },
|
||||
{ LLM_ARCH_GEMMA3N, "gemma3n" },
|
||||
{ LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
|
||||
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
||||
{ LLM_ARCH_MAMBA, "mamba" },
|
||||
{ LLM_ARCH_MAMBA2, "mamba2" },
|
||||
@@ -236,10 +237,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
|
||||
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
|
||||
|
||||
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
|
||||
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
|
||||
{ LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
|
||||
{ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
|
||||
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
|
||||
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
|
||||
{ LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
|
||||
{ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
|
||||
{ LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
|
||||
|
||||
// deprecated
|
||||
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
||||
@@ -1038,6 +1040,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GEMMA_EMBEDDING,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_STARCODER2,
|
||||
{
|
||||
|
||||
@@ -49,6 +49,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GEMMA2,
|
||||
LLM_ARCH_GEMMA3,
|
||||
LLM_ARCH_GEMMA3N,
|
||||
LLM_ARCH_GEMMA_EMBEDDING,
|
||||
LLM_ARCH_STARCODER2,
|
||||
LLM_ARCH_MAMBA,
|
||||
LLM_ARCH_MAMBA2,
|
||||
@@ -234,6 +235,7 @@ enum llm_kv {
|
||||
LLM_KV_ADAPTER_LORA_ALPHA,
|
||||
LLM_KV_ADAPTER_LORA_TASK_NAME,
|
||||
LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
|
||||
LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,
|
||||
|
||||
LLM_KV_POSNET_EMBEDDING_LENGTH,
|
||||
LLM_KV_POSNET_BLOCK_COUNT,
|
||||
|
||||
@@ -285,6 +285,9 @@ llama_context::llama_context(
|
||||
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
// avoid reserving graphs with zero outputs
|
||||
n_outputs = 1;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
// resolve automatic Flash Attention use
|
||||
@@ -1368,6 +1371,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
|
||||
|
||||
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
GGML_ASSERT(n_outputs >= 1);
|
||||
|
||||
if (n_tokens % n_seqs != 0) {
|
||||
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
|
||||
|
||||
+54
-9
@@ -258,6 +258,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
|
||||
LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
|
||||
const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
|
||||
(swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
|
||||
(swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
|
||||
(swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
|
||||
LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
|
||||
LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
|
||||
LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
|
||||
|
||||
LLAMA_LOG_DEBUG(" ");
|
||||
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
|
||||
LLAMA_LOG_DEBUG("%2d", j);
|
||||
}
|
||||
LLAMA_LOG_DEBUG("\n");
|
||||
|
||||
for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
|
||||
LLAMA_LOG_DEBUG(" %2d ", i);
|
||||
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
|
||||
float val = data[i * n_kv + j];
|
||||
if (val == -INFINITY) {
|
||||
LLAMA_LOG_DEBUG(" ∞");
|
||||
} else {
|
||||
LLAMA_LOG_DEBUG(" 0");
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_DEBUG("\n");
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
|
||||
const int64_t n_kv = ubatch->n_tokens;
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
@@ -267,6 +297,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
float * data = (float *) kq_mask->data;
|
||||
|
||||
// [TAG_NO_CACHE_ISWA]
|
||||
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
|
||||
|
||||
for (int h = 0; h < 1; ++h) {
|
||||
for (int i1 = 0; i1 < n_tokens; ++i1) {
|
||||
const llama_seq_id s1 = ubatch->seq_id[i1][0];
|
||||
@@ -277,21 +310,33 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
|
||||
for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
|
||||
const llama_seq_id s0 = ubatch->seq_id[i0][0];
|
||||
|
||||
// TODO: reimplement this like in llama_kv_cache
|
||||
if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
|
||||
if (hparams.use_alibi) {
|
||||
f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
|
||||
} else {
|
||||
f = 0.0f;
|
||||
}
|
||||
break;
|
||||
if (s0 != s1) {
|
||||
continue; // skip different sequences
|
||||
}
|
||||
|
||||
if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
|
||||
continue; // skip future tokens for causal attention
|
||||
}
|
||||
|
||||
// TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
|
||||
//if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
|
||||
// continue; // skip masked tokens for SWA
|
||||
//}
|
||||
|
||||
// TODO: reimplement this like in llama_kv_cache_unified
|
||||
if (hparams.use_alibi) {
|
||||
f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
|
||||
} else {
|
||||
f = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (debug) {
|
||||
print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
@@ -78,6 +78,11 @@ struct llm_graph_params;
|
||||
|
||||
class llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_i() {
|
||||
const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
|
||||
debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
|
||||
}
|
||||
|
||||
virtual ~llm_graph_input_i() = default;
|
||||
|
||||
virtual void set_input(const llama_ubatch * ubatch) = 0;
|
||||
@@ -90,6 +95,9 @@ public:
|
||||
GGML_UNUSED(params);
|
||||
return false;
|
||||
}
|
||||
protected:
|
||||
// env: LLAMA_GRAPH_INPUT_DEBUG
|
||||
int debug = 0;
|
||||
};
|
||||
|
||||
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "llama-hparams.h"
|
||||
|
||||
#include "ggml.h"
|
||||
#include <cassert>
|
||||
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
if (dense_first) {
|
||||
@@ -178,3 +179,39 @@ uint32_t llama_hparams::n_layer_kv() const {
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
||||
assert(p0 >= 0 && p1 >= 0);
|
||||
|
||||
switch (swa_type) {
|
||||
case LLAMA_SWA_TYPE_NONE:
|
||||
{
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_STANDARD:
|
||||
{
|
||||
if (p1 - p0 >= (int32_t) n_swa) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_CHUNKED:
|
||||
{
|
||||
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
||||
|
||||
if (p0 < pos_chunk_start) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_SYMMETRIC:
|
||||
{
|
||||
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
||||
const int32_t pos_diff = p1 - p0;
|
||||
|
||||
// Mask if outside the symmetric window
|
||||
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
+9
-3
@@ -16,9 +16,10 @@ enum llama_expert_gating_func_type {
|
||||
};
|
||||
|
||||
enum llama_swa_type {
|
||||
LLAMA_SWA_TYPE_NONE = 0,
|
||||
LLAMA_SWA_TYPE_STANDARD = 1,
|
||||
LLAMA_SWA_TYPE_CHUNKED = 2,
|
||||
LLAMA_SWA_TYPE_NONE = 0,
|
||||
LLAMA_SWA_TYPE_STANDARD = 1,
|
||||
LLAMA_SWA_TYPE_CHUNKED = 2,
|
||||
LLAMA_SWA_TYPE_SYMMETRIC = 3,
|
||||
};
|
||||
|
||||
struct llama_hparams_posnet {
|
||||
@@ -227,6 +228,11 @@ struct llama_hparams {
|
||||
|
||||
// number of layers for which has_kv() returns true
|
||||
uint32_t n_layer_kv() const;
|
||||
|
||||
// note that this function uses different SWA parameters from those in the hparams
|
||||
// TODO: think of a better place for this function
|
||||
// TODO: pack the SWA params in a struct?
|
||||
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
|
||||
+1
-23
@@ -1393,29 +1393,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
||||
}
|
||||
|
||||
bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
|
||||
assert(p0 >= 0 && p1 >= 0);
|
||||
|
||||
switch (swa_type) {
|
||||
case LLAMA_SWA_TYPE_NONE:
|
||||
{
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_STANDARD:
|
||||
{
|
||||
if (p1 - p0 >= (int32_t) n_swa) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
case LLAMA_SWA_TYPE_CHUNKED:
|
||||
{
|
||||
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
||||
|
||||
if (p0 < pos_chunk_start) {
|
||||
return true;
|
||||
}
|
||||
} break;
|
||||
}
|
||||
|
||||
return false;
|
||||
return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
|
||||
}
|
||||
|
||||
void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
||||
|
||||
@@ -212,6 +212,7 @@ private:
|
||||
// env: LLAMA_KV_CACHE_DEBUG
|
||||
int debug = 0;
|
||||
|
||||
// this is the SWA type of the cache - not to be confused with the model SWA type
|
||||
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
|
||||
@@ -1142,6 +1142,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
|
||||
hparams.set_swa_pattern(6);
|
||||
|
||||
hparams.causal_attn = false; // embeddings do not use causal attention
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 24: type = LLM_TYPE_0_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
||||
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -3484,6 +3504,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
@@ -11045,6 +11066,137 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_build_gemma_embedding_iswa : public llm_graph_context {
|
||||
llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
||||
if (ubatch.token) {
|
||||
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
||||
cb(inpL, "inp_scaled", -1);
|
||||
}
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
|
||||
auto * inp_attn = build_attn_inp_kv_iswa();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
|
||||
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
}
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.layers[il].attn_post_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
|
||||
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
||||
cb(sa_out, "sa_out", il);
|
||||
|
||||
cur = build_norm(sa_out,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.layers[il].ffn_post_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "ffn_post_norm", -1);
|
||||
|
||||
cur = ggml_add(ctx0, cur, sa_out);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: move up next to build_starcoder
|
||||
struct llm_build_starcoder2 : public llm_graph_context {
|
||||
llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
@@ -18481,6 +18633,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
//case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
|
||||
case LLM_ARCH_DREAM:
|
||||
case LLM_ARCH_LLADA:
|
||||
{
|
||||
@@ -18761,6 +18914,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
{
|
||||
llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER2:
|
||||
{
|
||||
llm = std::make_unique<llm_build_starcoder2>(*this, params);
|
||||
@@ -19161,6 +19318,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GEMMA2:
|
||||
case LLM_ARCH_GEMMA3:
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
case LLM_ARCH_STARCODER2:
|
||||
case LLM_ARCH_OPENELM:
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <thread>
|
||||
@@ -6741,8 +6742,90 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
static void list_all_ops() {
|
||||
printf("GGML operations:\n");
|
||||
std::set<std::string> all_ops;
|
||||
|
||||
for (int i = 1; i < GGML_OP_COUNT; i++) {
|
||||
all_ops.insert(ggml_op_name((enum ggml_op)i));
|
||||
}
|
||||
for (int i = 0; i < GGML_UNARY_OP_COUNT; i++) {
|
||||
all_ops.insert(ggml_unary_op_name((enum ggml_unary_op)i));
|
||||
}
|
||||
for (int i = 0; i < GGML_GLU_OP_COUNT; i++) {
|
||||
all_ops.insert(ggml_glu_op_name((enum ggml_glu_op)i));
|
||||
}
|
||||
for (const auto & op : all_ops) {
|
||||
printf(" %s\n", op.c_str());
|
||||
}
|
||||
printf("\nTotal: %zu operations\n", all_ops.size());
|
||||
}
|
||||
|
||||
static void show_test_coverage() {
|
||||
std::set<std::string> all_ops;
|
||||
for (int i = 1; i < GGML_OP_COUNT; i++) {
|
||||
all_ops.insert(ggml_op_name((enum ggml_op)i));
|
||||
}
|
||||
for (int i = 0; i < GGML_UNARY_OP_COUNT; i++) {
|
||||
all_ops.insert(ggml_unary_op_name((enum ggml_unary_op)i));
|
||||
}
|
||||
for (int i = 0; i < GGML_GLU_OP_COUNT; i++) {
|
||||
all_ops.insert(ggml_glu_op_name((enum ggml_glu_op)i));
|
||||
}
|
||||
auto test_cases = make_test_cases_eval();
|
||||
std::set<std::string> tested_ops;
|
||||
|
||||
ggml_init_params params = {
|
||||
/* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
||||
/* .mem_base = */ NULL,
|
||||
/* .no_alloc = */ true,
|
||||
};
|
||||
|
||||
for (auto & test_case : test_cases) {
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
if (ctx) {
|
||||
test_case->mode = MODE_TEST;
|
||||
ggml_tensor * out = test_case->build_graph(ctx);
|
||||
if (out && out->op != GGML_OP_NONE) {
|
||||
if (out->op == GGML_OP_UNARY) {
|
||||
tested_ops.insert(ggml_unary_op_name(ggml_get_unary_op(out)));
|
||||
} else if (out->op == GGML_OP_GLU) {
|
||||
tested_ops.insert(ggml_glu_op_name(ggml_get_glu_op(out)));
|
||||
} else {
|
||||
tested_ops.insert(ggml_op_name(out->op));
|
||||
}
|
||||
}
|
||||
ggml_free(ctx);
|
||||
}
|
||||
}
|
||||
std::set<std::string> covered_ops;
|
||||
std::set<std::string> uncovered_ops;
|
||||
for (const auto & op : all_ops) {
|
||||
if (tested_ops.count(op) > 0) {
|
||||
covered_ops.insert(op);
|
||||
} else {
|
||||
uncovered_ops.insert(op);
|
||||
}
|
||||
}
|
||||
|
||||
printf("Operations covered by tests (%zu):\n", covered_ops.size());
|
||||
for (const auto & op : covered_ops) {
|
||||
printf(" ✓ %s\n", op.c_str());
|
||||
}
|
||||
printf("\nOperations without tests (%zu):\n", uncovered_ops.size());
|
||||
for (const auto & op : uncovered_ops) {
|
||||
printf(" ✗ %s\n", op.c_str());
|
||||
}
|
||||
|
||||
printf("\nCoverage Summary:\n");
|
||||
printf(" Total operations: %zu\n", all_ops.size());
|
||||
printf(" Tested operations: %zu\n", covered_ops.size());
|
||||
printf(" Untested operations: %zu\n", uncovered_ops.size());
|
||||
printf(" Coverage: %.1f%%\n", (double)covered_ops.size() / all_ops.size() * 100.0);
|
||||
}
|
||||
|
||||
static void usage(char ** argv) {
|
||||
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>]\n", argv[0]);
|
||||
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
|
||||
printf(" valid modes:\n");
|
||||
printf(" - test (default, compare with CPU backend for correctness)\n");
|
||||
printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
|
||||
@@ -6751,6 +6834,8 @@ static void usage(char ** argv) {
|
||||
printf(" op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc),\n");
|
||||
printf(" optionally including the full test case string (e.g. \"ADD(type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1)\")\n");
|
||||
printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
|
||||
printf(" --list-ops lists all available GGML operations\n");
|
||||
printf(" --show-coverage shows test coverage\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
@@ -6800,6 +6885,12 @@ int main(int argc, char ** argv) {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "--list-ops") == 0) {
|
||||
list_all_ops();
|
||||
return 0;
|
||||
} else if (strcmp(argv[i], "--show-coverage") == 0) {
|
||||
show_test_coverage();
|
||||
return 0;
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
|
||||
@@ -420,6 +420,7 @@ const common_chat_msg message_assist_call_empty_args = simple_assist
|
||||
const common_chat_msg message_assist_call_cutoff_args = simple_assist_msg("", "", "special_function", "{\"arg");
|
||||
const common_chat_msg message_assist_call_thoughts = simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\":1}");
|
||||
const common_chat_msg message_assist_call_thoughts_unparsed = simple_assist_msg("<think>I'm\nthinking</think>\n\n", "", "special_function", "{\"arg1\": 1}");
|
||||
const common_chat_msg message_assist_call_thoughts_content = simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": 1}");
|
||||
const common_chat_msg message_assist_call_id = simple_assist_msg("", "", "special_function", "{\"arg1\":1}", /* .id = */ "123456789");
|
||||
const common_chat_msg message_assist_call_idx = simple_assist_msg("", "", "special_function", "{\"arg1\":1}", /* .id = */ "0");
|
||||
const common_chat_msg message_assist_thoughts_call_idx = simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}", /* id = */ "0");
|
||||
@@ -436,6 +437,7 @@ static void test_msgs_oaicompat_json_conversion() {
|
||||
message_assist_call,
|
||||
message_assist_call_thoughts,
|
||||
message_assist_call_thoughts_unparsed,
|
||||
message_assist_call_thoughts_content,
|
||||
message_assist_call_id,
|
||||
message_assist_call_idx,
|
||||
message_assist_call_python,
|
||||
@@ -1755,6 +1757,77 @@ static void test_template_output_parsers() {
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_SEED_OSS}));
|
||||
}
|
||||
|
||||
{
|
||||
auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-Nano-v2.jinja");
|
||||
std::vector<std::string> end_tokens{ "<SPECIAL_12>" };
|
||||
|
||||
assert_equals(COMMON_CHAT_FORMAT_NEMOTRON_V2, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
|
||||
assert_equals(COMMON_CHAT_FORMAT_NEMOTRON_V2, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
|
||||
|
||||
// Test parsing regular content
|
||||
assert_msg_equals(message_assist,
|
||||
common_chat_parse(
|
||||
"Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_NEMOTRON_V2}));
|
||||
|
||||
// Test parsing content with thinking
|
||||
assert_msg_equals(message_assist_thoughts,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_NEMOTRON_V2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
}));
|
||||
|
||||
// Test parsing tool calls
|
||||
assert_msg_equals(message_assist_call,
|
||||
common_chat_parse(
|
||||
"<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_NEMOTRON_V2}));
|
||||
|
||||
// Test parsing tool calls with thinking
|
||||
assert_msg_equals(message_assist_call_thoughts,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_NEMOTRON_V2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}));
|
||||
|
||||
// Test tool calls with extra content
|
||||
assert_msg_equals(message_assist_call_content,
|
||||
common_chat_parse(
|
||||
"<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_NEMOTRON_V2}
|
||||
));
|
||||
|
||||
// Test tool calls with extra content AND thinking
|
||||
assert_msg_equals(message_assist_call_thoughts_content,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_NEMOTRON_V2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}));
|
||||
|
||||
// Test template generation for regular content
|
||||
test_templates(tmpls.get(), end_tokens, message_assist, tools,
|
||||
"Hello, world!\nWhat's up?\n",
|
||||
/* expect_grammar_triggered= */ false);
|
||||
|
||||
// Test template generation for tool calls
|
||||
test_templates(tmpls.get(), end_tokens, message_assist_call, tools,
|
||||
"<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>",
|
||||
/* expect_grammar_triggered= */ true
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
static void test_msg_diffs_compute() {
|
||||
|
||||
@@ -512,6 +512,8 @@ These words will not be included in the completion, so make sure to add them to
|
||||
|
||||
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
|
||||
|
||||
`return_progress`: Include prompt processing progress in `stream` mode. The progress will be contained inside `prompt_progress` with 3 values: `total`, `cache` and `processed`. The overall progress is `processed/total`, while the actual timed progress is `(processed-cache)/(total-cache)`. Default: `false`
|
||||
|
||||
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
|
||||
|
||||
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name.
|
||||
@@ -1276,6 +1278,34 @@ curl http://localhost:8080/v1/chat/completions \
|
||||
|
||||
**See our [Function calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use.
|
||||
|
||||
*Timings and context usage*
|
||||
|
||||
The response contains a `timings` object, for example:
|
||||
|
||||
```js
|
||||
{
|
||||
"choices": [],
|
||||
"created": 1757141666,
|
||||
"id": "chatcmpl-ecQULm0WqPrftUqjPZO1CFYeDjGZNbDu",
|
||||
// ...
|
||||
"timings": {
|
||||
"cache_n": 236, // number of prompt tokens reused from cache
|
||||
"prompt_n": 1, // number of prompt tokens being processed
|
||||
"prompt_ms": 30.958,
|
||||
"prompt_per_token_ms": 30.958,
|
||||
"prompt_per_second": 32.301828283480845,
|
||||
"predicted_n": 35, // number of predicted tokens
|
||||
"predicted_ms": 661.064,
|
||||
"predicted_per_token_ms": 18.887542857142858,
|
||||
"predicted_per_second": 52.94494935437416
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This provides information on the performance of the server. It also allows calculating the current context usage.
|
||||
|
||||
The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`
|
||||
|
||||
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
|
||||
|
||||
This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
|
||||
|
||||
+194
-24
@@ -110,14 +110,15 @@ static bool server_task_type_need_logits(server_task_type task_type) {
|
||||
}
|
||||
|
||||
struct slot_params {
|
||||
bool stream = true;
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
bool return_tokens = false;
|
||||
bool stream = true;
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
bool return_tokens = false;
|
||||
bool return_progress = false;
|
||||
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters
|
||||
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
|
||||
|
||||
int64_t t_max_prompt_ms = -1; // TODO: implement
|
||||
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||
@@ -307,11 +308,11 @@ struct server_task {
|
||||
|
||||
// enabling this will output extra debug information in the HTTP responses from the server
|
||||
params.verbose = params_base.verbosity > 9;
|
||||
params.timings_per_token = json_value(data, "timings_per_token", false);
|
||||
|
||||
params.stream = json_value(data, "stream", false);
|
||||
params.cache_prompt = json_value(data, "cache_prompt", true);
|
||||
params.return_tokens = json_value(data, "return_tokens", false);
|
||||
params.return_progress = json_value(data, "return_progress", false);
|
||||
params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
|
||||
params.n_indent = json_value(data, "n_indent", defaults.n_indent);
|
||||
params.n_keep = json_value(data, "n_keep", defaults.n_keep);
|
||||
@@ -608,6 +609,8 @@ struct server_task {
|
||||
};
|
||||
|
||||
struct result_timings {
|
||||
int32_t cache_n = -1;
|
||||
|
||||
int32_t prompt_n = -1;
|
||||
double prompt_ms;
|
||||
double prompt_per_token_ms;
|
||||
@@ -624,6 +627,8 @@ struct result_timings {
|
||||
|
||||
json to_json() const {
|
||||
json base = {
|
||||
{"cache_n", cache_n},
|
||||
|
||||
{"prompt_n", prompt_n},
|
||||
{"prompt_ms", prompt_ms},
|
||||
{"prompt_per_token_ms", prompt_per_token_ms},
|
||||
@@ -644,6 +649,22 @@ struct result_timings {
|
||||
}
|
||||
};
|
||||
|
||||
struct result_prompt_progress {
|
||||
int32_t total = 0;
|
||||
int32_t cache = 0;
|
||||
int32_t processed = 0;
|
||||
int64_t time_ms = 0;
|
||||
|
||||
json to_json() const {
|
||||
return json {
|
||||
{"total", total},
|
||||
{"cache", cache},
|
||||
{"processed", processed},
|
||||
{"time_ms", time_ms},
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
struct server_task_result {
|
||||
int id = -1;
|
||||
int id_slot = -1;
|
||||
@@ -999,8 +1020,10 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
int32_t n_prompt_tokens;
|
||||
|
||||
bool post_sampling_probs;
|
||||
bool is_progress = false;
|
||||
completion_token_output prob_output;
|
||||
result_timings timings;
|
||||
result_prompt_progress progress;
|
||||
|
||||
// OAI-compat fields
|
||||
bool verbose = false;
|
||||
@@ -1045,6 +1068,9 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
if (timings.prompt_n > 0) {
|
||||
res.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
if (is_progress) {
|
||||
res.push_back({"prompt_progress", progress.to_json()});
|
||||
}
|
||||
if (!prob_output.probs.empty()) {
|
||||
res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
|
||||
}
|
||||
@@ -1082,6 +1108,9 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
if (timings.prompt_n >= 0) {
|
||||
res.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
if (is_progress) {
|
||||
res.push_back({"prompt_progress", progress.to_json()});
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
@@ -1109,7 +1138,7 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
});
|
||||
};
|
||||
// We have to send an initial update to conform to openai behavior
|
||||
if (first) {
|
||||
if (first || is_progress) {
|
||||
add_delta({
|
||||
{"role", "assistant"},
|
||||
{"content", nullptr},
|
||||
@@ -1121,16 +1150,20 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
}
|
||||
|
||||
if (!deltas.empty()) {
|
||||
GGML_ASSERT(deltas[deltas.size() - 1].at("choices").size() >= 1);
|
||||
auto & last_json = deltas[deltas.size() - 1];
|
||||
GGML_ASSERT(last_json.at("choices").size() >= 1);
|
||||
|
||||
if (prob_output.probs.size() > 0) {
|
||||
deltas[deltas.size() - 1].at("choices").at(0)["logprobs"] = json {
|
||||
last_json.at("choices").at(0)["logprobs"] = json {
|
||||
{"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
|
||||
};
|
||||
}
|
||||
|
||||
if (timings.prompt_n >= 0) {
|
||||
deltas[deltas.size() - 1].push_back({"timings", timings.to_json()});
|
||||
last_json.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
if (is_progress) {
|
||||
last_json.push_back({"prompt_progress", progress.to_json()});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1382,6 +1415,7 @@ struct server_slot {
|
||||
common_speculative * spec = nullptr;
|
||||
|
||||
std::vector<common_adapter_lora_info> lora;
|
||||
int32_t alora_invocation_start = -1;
|
||||
|
||||
// the index relative to completion multi-task request
|
||||
size_t index = 0;
|
||||
@@ -1403,6 +1437,7 @@ struct server_slot {
|
||||
|
||||
// n_prompt_tokens may not be equal to prompt_tokens.size(), because prompt maybe truncated
|
||||
int32_t n_prompt_tokens = 0;
|
||||
int32_t n_prompt_tokens_cache = 0;
|
||||
int32_t n_prompt_tokens_processed = 0;
|
||||
|
||||
// input prompt tokens
|
||||
@@ -1455,7 +1490,9 @@ struct server_slot {
|
||||
void reset() {
|
||||
SLT_DBG(*this, "%s", "\n");
|
||||
|
||||
n_prompt_tokens = 0;
|
||||
n_prompt_tokens = 0;
|
||||
n_prompt_tokens_cache = 0;
|
||||
|
||||
last_nl_pos = 0;
|
||||
generated_text = "";
|
||||
has_new_line = false;
|
||||
@@ -1476,6 +1513,9 @@ struct server_slot {
|
||||
// clear speculative decoding stats
|
||||
n_draft_total = 0;
|
||||
n_draft_accepted = 0;
|
||||
|
||||
// clear alora start
|
||||
alora_invocation_start = -1;
|
||||
}
|
||||
|
||||
bool need_embd() const {
|
||||
@@ -1543,6 +1583,8 @@ struct server_slot {
|
||||
|
||||
result_timings get_timings() const {
|
||||
result_timings timings;
|
||||
timings.cache_n = n_prompt_tokens_cache;
|
||||
|
||||
timings.prompt_n = n_prompt_tokens_processed;
|
||||
timings.prompt_ms = t_prompt_processing;
|
||||
timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
|
||||
@@ -2267,6 +2309,12 @@ struct server_context {
|
||||
|
||||
metrics.init();
|
||||
|
||||
// thinking is enabled if:
|
||||
// 1. It's not explicitly disabled (reasoning_budget == 0)
|
||||
// 2. The chat template supports it
|
||||
const bool enable_thinking = params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
|
||||
SRV_INF("Enable thinking? %d\n", enable_thinking);
|
||||
|
||||
oai_parser_opt = {
|
||||
/* use_jinja */ params_base.use_jinja,
|
||||
/* prefill_assistant */ params_base.prefill_assistant,
|
||||
@@ -2275,7 +2323,7 @@ struct server_context {
|
||||
/* common_chat_templates */ chat_templates.get(),
|
||||
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
|
||||
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
|
||||
/* enable_thinking */ params_base.reasoning_budget != 0,
|
||||
/* enable_thinking */ enable_thinking,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -2361,11 +2409,65 @@ struct server_context {
|
||||
slot.prompt_tokens = std::move(task.prompt_tokens);
|
||||
|
||||
if (!are_lora_equal(slot.params.lora, slot.lora)) {
|
||||
// if lora is changed, we cannot reuse cached tokens
|
||||
slot.cache_tokens.clear();
|
||||
// if lora has changed, check to see if the cache should be cleared
|
||||
if (lora_should_clear_cache(slot.lora, slot.params.lora)) {
|
||||
SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), slot.params.lora.size());
|
||||
slot.cache_tokens.clear();
|
||||
} else {
|
||||
SLT_INF(slot, "keeping cache for alora. %zu target loras\n", slot.params.lora.size());
|
||||
}
|
||||
slot.lora = slot.params.lora;
|
||||
}
|
||||
|
||||
// if using alora, make sure it's only a single one requested and active
|
||||
size_t alora_invocation_start = slot.prompt_tokens.size();
|
||||
if (lora_all_alora(slot.lora)) {
|
||||
|
||||
const auto & enabled_ids = lora_get_enabled_ids(slot.lora);
|
||||
// TODO: This will error out if a user requests two aloras, but only
|
||||
// provides the activation string for one. We could, instead search
|
||||
// for all requested alora activation strings and then either keep
|
||||
// only the last one, or reject if multiple are found.
|
||||
if (enabled_ids.size() != 1) {
|
||||
send_error(task, "Cannot run multiple aLoRAs in a single request", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
}
|
||||
const auto & lora = slot.lora[enabled_ids[0]].ptr;
|
||||
|
||||
// get the pointer and count for the invocation tokens
|
||||
const uint64_t n_invocation_tokens = llama_adapter_get_alora_n_invocation_tokens(lora);
|
||||
const llama_token * invocation_tokens = llama_adapter_get_alora_invocation_tokens (lora);
|
||||
|
||||
// scan backwards through the prompt tokens to find the last
|
||||
// occurrence of the invocation sequence
|
||||
int match_idx = static_cast<int>(n_invocation_tokens) - 1;
|
||||
for (int i = slot.prompt_tokens.size() - 1; i >= 0; --i) {
|
||||
// the token in this position matches the next token to find in
|
||||
// the invocation sequence
|
||||
if (slot.prompt_tokens[i] == invocation_tokens[match_idx]) {
|
||||
// if it's a full match, we've found the start
|
||||
if (match_idx == 0) {
|
||||
alora_invocation_start = i;
|
||||
break;
|
||||
}
|
||||
// otherwise, check the next token in the sequence
|
||||
--match_idx;
|
||||
} else {
|
||||
// no match in this position, so start looking over again
|
||||
match_idx = static_cast<int>(n_invocation_tokens) - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// if the activation string is not found, disable the alora
|
||||
if (alora_invocation_start == slot.prompt_tokens.size()) {
|
||||
SLT_DBG(slot, "alora %zu requested, but not found. deactivating\n", enabled_ids[0]);
|
||||
slot.lora[enabled_ids[0]].scale = 0.0f;
|
||||
} else {
|
||||
SLT_DBG(slot, "alora %zu activated starting at %zu\n", enabled_ids[0], alora_invocation_start);
|
||||
slot.alora_invocation_start = alora_invocation_start;
|
||||
}
|
||||
}
|
||||
|
||||
if (!slot.prompt_tokens.validate(ctx)) {
|
||||
send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
@@ -2456,7 +2558,7 @@ struct server_context {
|
||||
|
||||
slot.add_token(result);
|
||||
if (slot.params.stream) {
|
||||
send_partial_response(slot, result);
|
||||
send_partial_response(slot, result, false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2648,13 +2750,24 @@ struct server_context {
|
||||
return true;
|
||||
}
|
||||
|
||||
void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
|
||||
void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
|
||||
auto res = std::make_unique<server_task_result_cmpl_partial>();
|
||||
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
res->content = tkn.text_to_send;
|
||||
res->tokens = { tkn.tok };
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
|
||||
if (is_progress) {
|
||||
res->is_progress = true;
|
||||
res->progress.total = slot.n_prompt_tokens;
|
||||
res->progress.cache = slot.n_prompt_tokens_cache;
|
||||
res->progress.processed = slot.cache_tokens.size();
|
||||
res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt / 1000);
|
||||
} else {
|
||||
res->content = tkn.text_to_send;
|
||||
res->tokens = { tkn.tok };
|
||||
|
||||
slot.update_chat_msg(res->oaicompat_msg_diffs);
|
||||
}
|
||||
|
||||
res->n_decoded = slot.n_decoded;
|
||||
res->n_prompt_tokens = slot.n_prompt_tokens;
|
||||
@@ -2665,8 +2778,6 @@ struct server_context {
|
||||
res->oaicompat_model = slot.params.oaicompat_model;
|
||||
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
|
||||
|
||||
slot.update_chat_msg(res->oaicompat_msg_diffs);
|
||||
|
||||
// populate res.probs_output
|
||||
if (slot.params.sampling.n_probs > 0) {
|
||||
res->prob_output = tkn; // copy the token probs
|
||||
@@ -3241,6 +3352,8 @@ struct server_context {
|
||||
int32_t n_ubatch = llama_n_ubatch(ctx);
|
||||
|
||||
// next, batch any pending prompts without exceeding n_batch
|
||||
float alora_scale = -1.0f;
|
||||
size_t alora_disabled_id = 0;
|
||||
if (params_base.cont_batching || batch.n_tokens == 0) {
|
||||
for (auto & slot : slots) {
|
||||
// check if we can batch this slot with the previous one
|
||||
@@ -3361,6 +3474,12 @@ struct server_context {
|
||||
// reuse any previously computed tokens that are common with the new prompt
|
||||
slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens);
|
||||
|
||||
// if there is an alora invoked, don't cache after the invocation start
|
||||
if (slot.alora_invocation_start >= 0) {
|
||||
SLT_DBG(slot, "only caching to alora invocation start (n_past=%d, alora_invocation_start=%d)\n", slot.n_past, slot.alora_invocation_start);
|
||||
slot.n_past = std::min(slot.n_past, slot.alora_invocation_start - 1);
|
||||
}
|
||||
|
||||
// reuse chunks from the cached prompt by shifting their KV cache in the new position
|
||||
if (params_base.n_cache_reuse > 0) {
|
||||
size_t head_c = slot.n_past; // cache
|
||||
@@ -3485,6 +3604,7 @@ struct server_context {
|
||||
slot.n_past--;
|
||||
}
|
||||
|
||||
slot.n_prompt_tokens_cache = slot.n_past;
|
||||
slot.n_prompt_tokens_processed = 0;
|
||||
}
|
||||
|
||||
@@ -3501,7 +3621,8 @@ struct server_context {
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
|
||||
|
||||
// there is no common part left
|
||||
slot.n_past = 0;
|
||||
slot.n_past = 0;
|
||||
slot.n_prompt_tokens_cache = 0;
|
||||
}
|
||||
|
||||
SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
|
||||
@@ -3533,6 +3654,20 @@ struct server_context {
|
||||
slot.n_prompt_tokens_processed += n_pos;
|
||||
}
|
||||
|
||||
// If using an alora, there may be uncached tokens that come
|
||||
// before the invocation sequence. When this happens, the
|
||||
// tokens before the invocation sequence need to be
|
||||
// processed without the adpter in a separate batch, then
|
||||
// the adapter needs to be enabled for the remaining tokens.
|
||||
if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.n_past) {
|
||||
SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_past = %d, alora_invocation_start = %d)\n", slot.n_past, slot.alora_invocation_start);
|
||||
const auto & enabled_loras = lora_get_enabled_ids(slot.lora);
|
||||
GGML_ASSERT(enabled_loras.size() == 1);
|
||||
alora_scale = slot.lora[enabled_loras[0]].scale;
|
||||
slot.lora[enabled_loras[0]].scale = 0.0f;
|
||||
alora_disabled_id = enabled_loras[0];
|
||||
}
|
||||
|
||||
// add prompt tokens for processing in the current batch
|
||||
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
|
||||
// get next token to process
|
||||
@@ -3541,6 +3676,14 @@ struct server_context {
|
||||
break; // end of text chunk
|
||||
}
|
||||
|
||||
// if this is an alora request with pre-invocation
|
||||
// tokens that are not cached, we need to stop filling
|
||||
// this batch at those pre-invocation tokens.
|
||||
if (alora_scale > 0 && slot.n_past == slot.alora_invocation_start - 1) {
|
||||
SLT_DBG(slot, "stop prompt batch filling at (n_past = %d, alora_invocation_start = %d)\n", slot.n_past, slot.alora_invocation_start);
|
||||
break;
|
||||
}
|
||||
|
||||
// embedding requires all tokens in the batch to be output
|
||||
const bool need_embd = server_task_type_need_embd(slot.task_type);
|
||||
|
||||
@@ -3599,6 +3742,13 @@ struct server_context {
|
||||
// apply lora, only need to do it once per batch
|
||||
common_set_adapter_lora(ctx, slot_batched->lora);
|
||||
|
||||
// if the lora is temporarily disabled for an alora, re-enable it
|
||||
// for next time
|
||||
if (alora_scale > 0.0f) {
|
||||
SRV_DBG("re-enabling alora with scale %f\n", alora_scale);
|
||||
slot_batched->lora[alora_disabled_id].scale = alora_scale;
|
||||
}
|
||||
|
||||
llama_set_embeddings(ctx, slot_batched->need_embd());
|
||||
}
|
||||
|
||||
@@ -3666,6 +3816,13 @@ struct server_context {
|
||||
n_batch = llama_n_batch(ctx);
|
||||
|
||||
for (auto & slot : slots) {
|
||||
// optionally send prompt processing progress
|
||||
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
|
||||
if (slot.params.stream && slot.params.return_progress) {
|
||||
send_partial_response(slot, {}, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
@@ -4984,13 +5141,26 @@ int main(int argc, char ** argv) {
|
||||
const auto & loras = ctx_server.params_base.lora_adapters;
|
||||
for (size_t i = 0; i < loras.size(); ++i) {
|
||||
auto & lora = loras[i];
|
||||
result.push_back({
|
||||
json entry = {
|
||||
{"id", i},
|
||||
{"path", lora.path},
|
||||
{"scale", lora.scale},
|
||||
{"task_name", lora.task_name},
|
||||
{"prompt_prefix", lora.prompt_prefix},
|
||||
});
|
||||
};
|
||||
std::string alora_invocation_string = "";
|
||||
const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr);
|
||||
std::vector<llama_token> alora_invocation_tokens;
|
||||
if (n_alora_tokens) {
|
||||
const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr);
|
||||
for (uint64_t i = 0; i < n_alora_tokens; ++i) {
|
||||
alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]);
|
||||
alora_invocation_tokens.push_back(alora_tokens[i]);
|
||||
}
|
||||
entry["alora_invocation_string"] = alora_invocation_string;
|
||||
entry["alora_invocation_tokens"] = alora_invocation_tokens;
|
||||
}
|
||||
result.push_back(std::move(entry));
|
||||
}
|
||||
res_ok(res, result);
|
||||
res.status = 200; // HTTP OK
|
||||
|
||||
@@ -5,6 +5,12 @@ from utils import *
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def do_something():
|
||||
# this will be run once per test session, before any tests
|
||||
ServerPreset.load_all()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
|
||||
@@ -402,3 +402,51 @@ def test_context_size_exceeded():
|
||||
assert server.n_ctx is not None
|
||||
assert server.n_slots is not None
|
||||
assert res.body["error"]["n_ctx"] == server.n_ctx // server.n_slots
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"n_batch,batch_count,reuse_cache",
|
||||
[
|
||||
(64, 15, False),
|
||||
(64, 1, True),
|
||||
]
|
||||
)
|
||||
def test_return_progresssss(n_batch, batch_count, reuse_cache):
|
||||
global server
|
||||
server.n_batch = n_batch
|
||||
server.n_ctx = 2048
|
||||
server.n_slots = 1
|
||||
server.start()
|
||||
def make_cmpl_request():
|
||||
return server.make_stream_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 10,
|
||||
"messages": [
|
||||
{"role": "user", "content": "This is a test" * 100},
|
||||
],
|
||||
"stream": True,
|
||||
"return_progress": True,
|
||||
})
|
||||
if reuse_cache:
|
||||
# make a first request to populate the cache
|
||||
res0 = make_cmpl_request()
|
||||
for _ in res0:
|
||||
pass # discard the output
|
||||
|
||||
res = make_cmpl_request()
|
||||
last_progress = None
|
||||
total_batch_count = 0
|
||||
for data in res:
|
||||
cur_progress = data.get("prompt_progress", None)
|
||||
if cur_progress is None:
|
||||
continue
|
||||
if last_progress is not None:
|
||||
assert cur_progress["total"] == last_progress["total"]
|
||||
assert cur_progress["cache"] == last_progress["cache"]
|
||||
assert cur_progress["processed"] > last_progress["processed"]
|
||||
total_batch_count += 1
|
||||
last_progress = cur_progress
|
||||
|
||||
assert last_progress is not None
|
||||
assert last_progress["total"] > 0
|
||||
assert last_progress["processed"] == last_progress["total"]
|
||||
assert total_batch_count == batch_count
|
||||
|
||||
@@ -14,14 +14,11 @@ from utils import *
|
||||
|
||||
server: ServerProcess
|
||||
|
||||
TIMEOUT_SERVER_START = 15*60
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.model_alias = "tinyllama-2"
|
||||
server.server_port = 8081
|
||||
server.n_slots = 1
|
||||
|
||||
|
||||
@@ -45,7 +42,7 @@ def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expe
|
||||
server.jinja = True
|
||||
server.reasoning_budget = reasoning_budget
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
|
||||
res = server.make_request("POST", "/apply-template", data={
|
||||
"messages": [
|
||||
@@ -68,7 +65,7 @@ def test_date_inside_prompt(template_name: str, format: str, tools: list[dict]):
|
||||
global server
|
||||
server.jinja = True
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
|
||||
res = server.make_request("POST", "/apply-template", data={
|
||||
"messages": [
|
||||
@@ -91,7 +88,7 @@ def test_add_generation_prompt(template_name: str, expected_generation_prompt: s
|
||||
global server
|
||||
server.jinja = True
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
|
||||
res = server.make_request("POST", "/apply-template", data={
|
||||
"messages": [
|
||||
|
||||
@@ -12,7 +12,7 @@ from enum import Enum
|
||||
|
||||
server: ServerProcess
|
||||
|
||||
TIMEOUT_SERVER_START = 15*60
|
||||
TIMEOUT_START_SLOW = 15 * 60 # this is needed for real model tests
|
||||
TIMEOUT_HTTP_REQUEST = 60
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@@ -124,7 +124,7 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict,
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
|
||||
|
||||
|
||||
@@ -168,7 +168,7 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
|
||||
server.jinja = True
|
||||
server.n_predict = n_predict
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@@ -240,7 +240,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
body = server.make_any_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
@@ -295,7 +295,7 @@ def test_completion_without_tool_call_fast(template_name: str, n_predict: int, t
|
||||
server.n_predict = n_predict
|
||||
server.jinja = True
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
do_test_completion_without_tool_call(server, n_predict, tools, tool_choice, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@@ -317,7 +317,7 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
|
||||
server.n_predict = n_predict
|
||||
server.jinja = True
|
||||
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
do_test_completion_without_tool_call(server, n_predict, tools, tool_choice, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@@ -377,7 +377,7 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
do_test_weather(server, stream=stream == CompletionMode.STREAMED, max_tokens=n_predict)
|
||||
|
||||
|
||||
@@ -436,7 +436,7 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
do_test_calc_result(server, result_override, n_predict, stream=stream == CompletionMode.STREAMED)
|
||||
|
||||
|
||||
@@ -524,7 +524,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start()
|
||||
body = server.make_any_request("POST", "/v1/chat/completions", data={
|
||||
"max_tokens": n_predict,
|
||||
"messages": [
|
||||
@@ -597,7 +597,7 @@ def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | Non
|
||||
assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
|
||||
elif isinstance(template_override, str):
|
||||
server.chat_template = template_override
|
||||
server.start(timeout_seconds=TIMEOUT_SERVER_START)
|
||||
server.start(timeout_seconds=TIMEOUT_START_SLOW)
|
||||
|
||||
do_test_hello_world(server, stream=stream == CompletionMode.STREAMED, max_tokens=n_predict)
|
||||
|
||||
|
||||
@@ -5,18 +5,31 @@ import requests
|
||||
|
||||
server: ServerProcess
|
||||
|
||||
IMG_URL_0 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png"
|
||||
IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/91_cat.png"
|
||||
|
||||
response = requests.get(IMG_URL_0)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
IMG_BASE64_URI_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
|
||||
IMG_BASE64_0 = base64.b64encode(response.content).decode("utf-8")
|
||||
|
||||
response = requests.get(IMG_URL_1)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
IMG_BASE64_URI_1 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
|
||||
IMG_BASE64_1 = base64.b64encode(response.content).decode("utf-8")
|
||||
def get_img_url(id: str) -> str:
|
||||
IMG_URL_0 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png"
|
||||
IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/91_cat.png"
|
||||
if id == "IMG_URL_0":
|
||||
return IMG_URL_0
|
||||
elif id == "IMG_URL_1":
|
||||
return IMG_URL_1
|
||||
elif id == "IMG_BASE64_URI_0":
|
||||
response = requests.get(IMG_URL_0)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
return "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
|
||||
elif id == "IMG_BASE64_0":
|
||||
response = requests.get(IMG_URL_0)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
return base64.b64encode(response.content).decode("utf-8")
|
||||
elif id == "IMG_BASE64_URI_1":
|
||||
response = requests.get(IMG_URL_1)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
return "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
|
||||
elif id == "IMG_BASE64_1":
|
||||
response = requests.get(IMG_URL_1)
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
return base64.b64encode(response.content).decode("utf-8")
|
||||
else:
|
||||
return id
|
||||
|
||||
JSON_MULTIMODAL_KEY = "multimodal_data"
|
||||
JSON_PROMPT_STRING_KEY = "prompt_string"
|
||||
@@ -28,7 +41,7 @@ def create_server():
|
||||
|
||||
def test_models_supports_multimodal_capability():
|
||||
global server
|
||||
server.start() # vision model may take longer to load due to download size
|
||||
server.start()
|
||||
res = server.make_request("GET", "/models", data={})
|
||||
assert res.status_code == 200
|
||||
model_info = res.body["models"][0]
|
||||
@@ -38,7 +51,7 @@ def test_models_supports_multimodal_capability():
|
||||
|
||||
def test_v1_models_supports_multimodal_capability():
|
||||
global server
|
||||
server.start() # vision model may take longer to load due to download size
|
||||
server.start()
|
||||
res = server.make_request("GET", "/v1/models", data={})
|
||||
assert res.status_code == 200
|
||||
model_info = res.body["models"][0]
|
||||
@@ -50,10 +63,10 @@ def test_v1_models_supports_multimodal_capability():
|
||||
"prompt, image_url, success, re_content",
|
||||
[
|
||||
# test model is trained on CIFAR-10, but it's quite dumb due to small size
|
||||
("What is this:\n", IMG_URL_0, True, "(cat)+"),
|
||||
("What is this:\n", "IMG_BASE64_URI_0", True, "(cat)+"), # exceptional, so that we don't cog up the log
|
||||
("What is this:\n", IMG_URL_1, True, "(frog)+"),
|
||||
("Test test\n", IMG_URL_1, True, "(frog)+"), # test invalidate cache
|
||||
("What is this:\n", "IMG_URL_0", True, "(cat)+"),
|
||||
("What is this:\n", "IMG_BASE64_URI_0", True, "(cat)+"),
|
||||
("What is this:\n", "IMG_URL_1", True, "(frog)+"),
|
||||
("Test test\n", "IMG_URL_1", True, "(frog)+"), # test invalidate cache
|
||||
("What is this:\n", "malformed", False, None),
|
||||
("What is this:\n", "https://google.com/404", False, None), # non-existent image
|
||||
("What is this:\n", "https://ggml.ai", False, None), # non-image data
|
||||
@@ -62,9 +75,7 @@ def test_v1_models_supports_multimodal_capability():
|
||||
)
|
||||
def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||
global server
|
||||
server.start(timeout_seconds=60) # vision model may take longer to load due to download size
|
||||
if image_url == "IMG_BASE64_URI_0":
|
||||
image_url = IMG_BASE64_URI_0
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
@@ -72,7 +83,7 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {
|
||||
"url": image_url,
|
||||
"url": get_img_url(image_url),
|
||||
}},
|
||||
]},
|
||||
],
|
||||
@@ -90,19 +101,22 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||
"prompt, image_data, success, re_content",
|
||||
[
|
||||
# test model is trained on CIFAR-10, but it's quite dumb due to small size
|
||||
("What is this: <__media__>\n", IMG_BASE64_0, True, "(cat)+"),
|
||||
("What is this: <__media__>\n", IMG_BASE64_1, True, "(frog)+"),
|
||||
("What is this: <__media__>\n", "IMG_BASE64_0", True, "(cat)+"),
|
||||
("What is this: <__media__>\n", "IMG_BASE64_1", True, "(frog)+"),
|
||||
("What is this: <__media__>\n", "malformed", False, None), # non-image data
|
||||
("What is this:\n", "", False, None), # empty string
|
||||
]
|
||||
)
|
||||
def test_vision_completion(prompt, image_data, success, re_content):
|
||||
global server
|
||||
server.start() # vision model may take longer to load due to download size
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completions", data={
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"prompt": { JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] },
|
||||
"prompt": {
|
||||
JSON_PROMPT_STRING_KEY: prompt,
|
||||
JSON_MULTIMODAL_KEY: [ get_img_url(image_data) ],
|
||||
},
|
||||
})
|
||||
if success:
|
||||
assert res.status_code == 200
|
||||
@@ -116,17 +130,18 @@ def test_vision_completion(prompt, image_data, success, re_content):
|
||||
"prompt, image_data, success",
|
||||
[
|
||||
# test model is trained on CIFAR-10, but it's quite dumb due to small size
|
||||
("What is this: <__media__>\n", IMG_BASE64_0, True), # exceptional, so that we don't cog up the log
|
||||
("What is this: <__media__>\n", IMG_BASE64_1, True),
|
||||
("What is this: <__media__>\n", "IMG_BASE64_0", True),
|
||||
("What is this: <__media__>\n", "IMG_BASE64_1", True),
|
||||
("What is this: <__media__>\n", "malformed", False), # non-image data
|
||||
("What is this:\n", "base64", False), # non-image data
|
||||
]
|
||||
)
|
||||
def test_vision_embeddings(prompt, image_data, success):
|
||||
global server
|
||||
server.server_embeddings=True
|
||||
server.n_batch=512
|
||||
server.start() # vision model may take longer to load due to download size
|
||||
server.server_embeddings = True
|
||||
server.n_batch = 512
|
||||
server.start()
|
||||
image_data = get_img_url(image_data)
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"content": [
|
||||
{ JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] },
|
||||
|
||||
@@ -26,7 +26,7 @@ from re import RegexFlag
|
||||
import wget
|
||||
|
||||
|
||||
DEFAULT_HTTP_TIMEOUT = 30
|
||||
DEFAULT_HTTP_TIMEOUT = 60
|
||||
|
||||
|
||||
class ServerResponse:
|
||||
@@ -45,6 +45,7 @@ class ServerProcess:
|
||||
model_alias: str = "tinyllama-2"
|
||||
temperature: float = 0.8
|
||||
seed: int = 42
|
||||
offline: bool = False
|
||||
|
||||
# custom options
|
||||
model_alias: str | None = None
|
||||
@@ -118,6 +119,8 @@ class ServerProcess:
|
||||
"--seed",
|
||||
self.seed,
|
||||
]
|
||||
if self.offline:
|
||||
server_args.append("--offline")
|
||||
if self.model_file:
|
||||
server_args.extend(["--model", self.model_file])
|
||||
if self.model_url:
|
||||
@@ -392,6 +395,19 @@ server_instances: Set[ServerProcess] = set()
|
||||
|
||||
|
||||
class ServerPreset:
|
||||
@staticmethod
|
||||
def load_all() -> None:
|
||||
""" Load all server presets to ensure model files are cached. """
|
||||
servers: List[ServerProcess] = [
|
||||
method()
|
||||
for name, method in ServerPreset.__dict__.items()
|
||||
if callable(method) and name != "load_all"
|
||||
]
|
||||
for server in servers:
|
||||
server.offline = False
|
||||
server.start()
|
||||
server.stop()
|
||||
|
||||
@staticmethod
|
||||
def tinyllama2() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
@@ -408,6 +424,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def bert_bge_small() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
@@ -422,6 +439,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def bert_bge_small_with_fa() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
@@ -437,6 +455,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def tinyllama_infill() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
|
||||
server.model_alias = "tinyllama-infill"
|
||||
@@ -451,6 +470,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def stories15m_moe() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/stories15M_MOE"
|
||||
server.model_hf_file = "stories15M_MOE-F16.gguf"
|
||||
server.model_alias = "stories15m-moe"
|
||||
@@ -465,6 +485,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def jina_reranker_tiny() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
|
||||
server.model_alias = "jina-reranker"
|
||||
@@ -478,6 +499,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def tinygemma3() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
# mmproj is already provided by HF registry API
|
||||
server.model_hf_repo = "ggml-org/tinygemma3-GGUF"
|
||||
server.model_hf_file = "tinygemma3-Q8_0.gguf"
|
||||
|
||||
+54
-3
@@ -54,8 +54,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
||||
if (body.contains(key) && !body.at(key).is_null()) {
|
||||
try {
|
||||
return body.at(key);
|
||||
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
|
||||
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
|
||||
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
|
||||
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
|
||||
return default_value;
|
||||
}
|
||||
} else {
|
||||
@@ -708,6 +708,16 @@ static json oaicompat_chat_params_parse(
|
||||
inputs.chat_template_kwargs[item.key()] = item.value().dump();
|
||||
}
|
||||
|
||||
// parse the "enable_thinking" kwarg to override the default value
|
||||
auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string(""));
|
||||
if (enable_thinking_kwarg == "true") {
|
||||
inputs.enable_thinking = true;
|
||||
} else if (enable_thinking_kwarg == "false") {
|
||||
inputs.enable_thinking = false;
|
||||
} else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
|
||||
throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
|
||||
}
|
||||
|
||||
// if the assistant message appears at the end of list, we do not add end-of-turn token
|
||||
// for ex. this can be useful to modify the reasoning process in reasoning models
|
||||
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
|
||||
@@ -724,7 +734,7 @@ static json oaicompat_chat_params_parse(
|
||||
/* TODO: test this properly */
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
|
||||
if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
|
||||
if ( inputs.enable_thinking ) {
|
||||
throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
|
||||
}
|
||||
|
||||
@@ -992,6 +1002,47 @@ static bool are_lora_equal(
|
||||
return true;
|
||||
}
|
||||
|
||||
// get the ids of all enabled loras
|
||||
static std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras) {
|
||||
std::vector<size_t> enabled_ids;
|
||||
for (size_t i = 0; i < loras.size(); ++i) {
|
||||
if (loras[i].scale > 0) {
|
||||
enabled_ids.push_back(i);
|
||||
}
|
||||
}
|
||||
return enabled_ids;
|
||||
}
|
||||
|
||||
// check whether the given lora set has only aloras activated (empty => false)
|
||||
static bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras) {
|
||||
bool found_alora = false;
|
||||
for (const auto & lora : loras) {
|
||||
if (lora.scale != 0) {
|
||||
if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) {
|
||||
return false;
|
||||
}
|
||||
found_alora = true;
|
||||
}
|
||||
}
|
||||
return found_alora;
|
||||
}
|
||||
|
||||
// if the two sets of loras are different, they require a cache clear unless the
|
||||
// change is only from aloras to aloras.
|
||||
static bool lora_should_clear_cache(
|
||||
const std::vector<common_adapter_lora_info> & current,
|
||||
const std::vector<common_adapter_lora_info> & next) {
|
||||
|
||||
// This should always be called after determining that the two sets are
|
||||
// _not_ equal. This assert is therefore some slightly wasted work and
|
||||
// should be safe to remove as long as this method is called correctly.
|
||||
GGML_ASSERT(!are_lora_equal(current, next));
|
||||
|
||||
return (
|
||||
!(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) ||
|
||||
!lora_all_alora(next));
|
||||
}
|
||||
|
||||
// parse lora config from JSON request, returned a copy of lora_base with updated scale
|
||||
static std::vector<common_adapter_lora_info> parse_lora_request(
|
||||
const std::vector<common_adapter_lora_info> & lora_base,
|
||||
|
||||
Reference in New Issue
Block a user