mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-25 07:07:39 +02:00
Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fdb2c11c70 | |||
| 09cedfd699 | |||
| 8be759e6f7 | |||
| 894bb27af3 | |||
| fb401045cc | |||
| 51eae8cfca | |||
| 1191758c5d | |||
| 00139b660b | |||
| ef9c13d4c2 | |||
| 88636e178f | |||
| ac4105d68b | |||
| be4a6a63eb | |||
| 72a9269172 | |||
| 92e854ab83 | |||
| c5606364b2 | |||
| 0eb874d374 |
@@ -142,7 +142,9 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
|
||||
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
|
||||
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
|
||||
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
||||
- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2)
|
||||
- [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25)
|
||||
- [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos)
|
||||
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
|
||||
- [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
|
||||
- [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
|
||||
|
||||
@@ -80,8 +80,6 @@ add_library(${TARGET}
|
||||
http.h
|
||||
imatrix-loader.cpp
|
||||
imatrix-loader.h
|
||||
json-partial.cpp
|
||||
json-partial.h
|
||||
json-schema-to-grammar.cpp
|
||||
llguidance.cpp
|
||||
log.cpp
|
||||
|
||||
+3
-9
@@ -603,8 +603,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
|
||||
// model is required (except for server)
|
||||
// TODO @ngxson : maybe show a list of available models in CLI in this case
|
||||
bool can_skip_model = params.usage || params.completion || !params.server_base.empty();
|
||||
if (!can_skip_model && params.model.path.empty()) {
|
||||
if (params.model.path.empty()
|
||||
&& !params.usage
|
||||
&& !params.completion) {
|
||||
throw std::invalid_argument("error: --model is required\n");
|
||||
}
|
||||
}
|
||||
@@ -1118,13 +1119,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.completion = true;
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--server-base"}, "URL",
|
||||
string_format("connect to this server instead of starting a new one, example: 'http://localhost:8080' (default: none)"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.server_base = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"--verbose-prompt"},
|
||||
string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
||||
|
||||
@@ -2758,5 +2758,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
||||
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
|
||||
GGML_ASSERT(chat_templates != nullptr);
|
||||
GGML_ASSERT(chat_templates->template_default != nullptr);
|
||||
if (chat_templates->template_tool_use != nullptr) {
|
||||
// take the more expressive template when available
|
||||
return chat_templates->template_tool_use->caps.to_map();
|
||||
}
|
||||
return chat_templates->template_default->caps.to_map();
|
||||
}
|
||||
|
||||
@@ -631,9 +631,6 @@ struct common_params {
|
||||
|
||||
std::map<std::string, std::string> default_template_kwargs;
|
||||
|
||||
// CLI params
|
||||
std::string server_base; // if set, connect to this server instead of starting a new one
|
||||
|
||||
// UI configs
|
||||
bool ui = true;
|
||||
bool ui_mcp_proxy = false;
|
||||
|
||||
@@ -2,16 +2,6 @@
|
||||
|
||||
#include <cpp-httplib/httplib.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <winsock2.h>
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
struct common_http_url {
|
||||
std::string scheme;
|
||||
std::string user;
|
||||
@@ -107,63 +97,3 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
|
||||
static std::string common_http_show_masked_url(const common_http_url & parts) {
|
||||
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
|
||||
}
|
||||
|
||||
static int common_http_get_free_port() {
|
||||
#ifdef _WIN32
|
||||
WSADATA wsaData;
|
||||
if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
|
||||
return -1;
|
||||
}
|
||||
typedef SOCKET native_socket_t;
|
||||
#define INVALID_SOCKET_VAL INVALID_SOCKET
|
||||
#define CLOSE_SOCKET(s) closesocket(s)
|
||||
#else
|
||||
typedef int native_socket_t;
|
||||
#define INVALID_SOCKET_VAL -1
|
||||
#define CLOSE_SOCKET(s) close(s)
|
||||
#endif
|
||||
|
||||
native_socket_t sock = socket(AF_INET, SOCK_STREAM, 0);
|
||||
if (sock == INVALID_SOCKET_VAL) {
|
||||
#ifdef _WIN32
|
||||
WSACleanup();
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct sockaddr_in serv_addr;
|
||||
std::memset(&serv_addr, 0, sizeof(serv_addr));
|
||||
serv_addr.sin_family = AF_INET;
|
||||
serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
|
||||
serv_addr.sin_port = htons(0);
|
||||
|
||||
if (bind(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) != 0) {
|
||||
CLOSE_SOCKET(sock);
|
||||
#ifdef _WIN32
|
||||
WSACleanup();
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
int namelen = sizeof(serv_addr);
|
||||
#else
|
||||
socklen_t namelen = sizeof(serv_addr);
|
||||
#endif
|
||||
if (getsockname(sock, (struct sockaddr*)&serv_addr, &namelen) != 0) {
|
||||
CLOSE_SOCKET(sock);
|
||||
#ifdef _WIN32
|
||||
WSACleanup();
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
|
||||
int port = ntohs(serv_addr.sin_port);
|
||||
|
||||
CLOSE_SOCKET(sock);
|
||||
#ifdef _WIN32
|
||||
WSACleanup();
|
||||
#endif
|
||||
|
||||
return port;
|
||||
}
|
||||
|
||||
@@ -1,324 +0,0 @@
|
||||
#include "json-partial.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <string>
|
||||
#include <regex>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
enum common_json_stack_element_type {
|
||||
COMMON_JSON_STACK_ELEMENT_OBJECT,
|
||||
COMMON_JSON_STACK_ELEMENT_KEY,
|
||||
COMMON_JSON_STACK_ELEMENT_ARRAY,
|
||||
};
|
||||
|
||||
struct common_json_stack_element {
|
||||
common_json_stack_element_type type;
|
||||
std::string key;
|
||||
};
|
||||
|
||||
bool common_json_parse(
|
||||
const std::string & input,
|
||||
const std::string & healing_marker,
|
||||
common_json & out)
|
||||
{
|
||||
std::string::const_iterator it = input.begin();
|
||||
const auto end = input.end();
|
||||
return common_json_parse(it, end, healing_marker, out);
|
||||
}
|
||||
|
||||
bool common_json_parse(
|
||||
std::string::const_iterator & it,
|
||||
const std::string::const_iterator & end,
|
||||
const std::string & healing_marker,
|
||||
common_json & out)
|
||||
{
|
||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
||||
std::size_t position;
|
||||
bool found_error;
|
||||
std::string last_token;
|
||||
std::string exception_message;
|
||||
std::vector<common_json_stack_element> stack;
|
||||
|
||||
json_error_locator() : position(0), found_error(false) {}
|
||||
|
||||
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
|
||||
this->position = position - 1;
|
||||
this->found_error = true;
|
||||
this->last_token = last_token;
|
||||
this->exception_message = ex.what();
|
||||
return false;
|
||||
}
|
||||
void close_value() {
|
||||
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
|
||||
stack.pop_back();
|
||||
}
|
||||
}
|
||||
bool null() override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool boolean(bool) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_integer(number_integer_t) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_unsigned(number_unsigned_t) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_float(number_float_t, const string_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool string(string_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool binary(binary_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool start_object(std::size_t) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
|
||||
return true;
|
||||
}
|
||||
bool end_object() override {
|
||||
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
|
||||
stack.pop_back();
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool key(string_t & key) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
|
||||
return true;
|
||||
}
|
||||
bool start_array(std::size_t) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
|
||||
return true;
|
||||
}
|
||||
bool end_array() override {
|
||||
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
|
||||
stack.pop_back();
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
};
|
||||
json_error_locator err_loc;
|
||||
auto start = it;
|
||||
json::sax_parse(it, end, &err_loc);
|
||||
|
||||
if (err_loc.found_error) {
|
||||
it = start;
|
||||
auto temptative_end = it + err_loc.position;
|
||||
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
|
||||
|
||||
auto input = std::string(it, temptative_end);
|
||||
try {
|
||||
out.json = json::parse(input);
|
||||
// out.json = json::parse(it, temptative_end);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
} catch (const std::exception & ex) {
|
||||
// No, needs healing.
|
||||
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
|
||||
}
|
||||
auto can_parse = [](const std::string & str) {
|
||||
try {
|
||||
auto _ = json::parse(str); // NOLINT
|
||||
return true;
|
||||
} catch (const std::exception &) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
if (!healing_marker.empty() && !err_loc.stack.empty()) {
|
||||
std::string str(it, temptative_end);
|
||||
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
|
||||
if (last_non_sp_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||
}
|
||||
auto last_non_sp_char = str[last_non_sp_pos];
|
||||
// Used to detect stops on a number, which may not be complete.
|
||||
auto was_maybe_number = [&]() {
|
||||
if (!str.empty() && std::isspace(str.back())) {
|
||||
return false;
|
||||
}
|
||||
return std::isdigit(last_non_sp_char) ||
|
||||
last_non_sp_char == '.' ||
|
||||
last_non_sp_char == 'e' ||
|
||||
last_non_sp_char == 'E' ||
|
||||
last_non_sp_char == '-';
|
||||
};
|
||||
|
||||
std::string closing;
|
||||
for (size_t i = err_loc.stack.size(); i > 0; i--) {
|
||||
auto & el = err_loc.stack[i - 1];
|
||||
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||
closing += "}";
|
||||
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||
closing += "]";
|
||||
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||
throw std::runtime_error("Unexpected stack element type");
|
||||
}
|
||||
}
|
||||
|
||||
// Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
|
||||
static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
|
||||
|
||||
auto is_high_surrogate = [&](const std::string & s) {
|
||||
// Check if a partial of a high surrogate (U+D800-U+DBFF)
|
||||
return s.length() >= 4 &&
|
||||
s[0] == '\\' && s[1] == 'u' &&
|
||||
std::tolower(s[2]) == 'd' &&
|
||||
(s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
|
||||
};
|
||||
|
||||
// Initialize the unicode marker to a low surrogate to handle the edge case
|
||||
// where a high surrogate (U+D800-U+DBFF) is immediately followed by a
|
||||
// backslash (\)
|
||||
std::string unicode_marker_padding = "udc00";
|
||||
std::smatch last_unicode_seq;
|
||||
|
||||
if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
|
||||
std::smatch second_last_seq;
|
||||
std::string prelude = str.substr(0, last_unicode_seq.position());
|
||||
|
||||
// Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
|
||||
unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
|
||||
|
||||
if (is_high_surrogate(last_unicode_seq.str())) {
|
||||
// If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
|
||||
unicode_marker_padding += "\\udc00";
|
||||
} else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
|
||||
if (is_high_surrogate(second_last_seq.str())) {
|
||||
// If this follows a high surrogate, pad it to be a low surrogate
|
||||
if (last_unicode_seq.length() == 2) {
|
||||
unicode_marker_padding = "dc00";
|
||||
} else if (last_unicode_seq.length() == 3) {
|
||||
unicode_marker_padding = "c00";
|
||||
} else {
|
||||
// The original unicode_marker_padding is already padded with 0s
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
|
||||
|
||||
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||
// We're inside an object value
|
||||
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
|
||||
// Was about to create an object value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + ": 1" + closing)) {
|
||||
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
|
||||
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
|
||||
// Was about to create an object
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + "\"" + closing)) {
|
||||
// Was inside an object value string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||
// Was inside an object value string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
|
||||
// Was inside an object value string after a partial unicode escape
|
||||
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
|
||||
} else {
|
||||
// find last :
|
||||
auto last_pos = str.find_last_of(':');
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||
}
|
||||
// Cutting back to opening : for object value
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
|
||||
// Was about to create an array value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + "\"" + closing)) {
|
||||
// Was inside an array value string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||
// Was inside an array value string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
|
||||
// Was inside an array value string after a partial unicode escape
|
||||
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
|
||||
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
|
||||
// Had just finished a value
|
||||
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
|
||||
} else {
|
||||
auto last_pos = str.find_last_of("[,");
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
|
||||
}
|
||||
// Cutting back to last [ or , for array value
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
|
||||
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
|
||||
// Was about to create an object key+value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
|
||||
// Was about to create an object key+value
|
||||
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + "\": 1" + closing)) {
|
||||
// Was inside an object key string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
|
||||
// Was inside an object key string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
|
||||
// Was inside an object key string after a partial unicode escape
|
||||
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
|
||||
} else {
|
||||
auto last_pos = str.find_last_of(':');
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||
}
|
||||
// fprintf(stderr, "Cutting back to last : for object key+value\n");
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||
}
|
||||
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
|
||||
out.json = json::parse(str);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
}
|
||||
// handle unclosed top-level primitive
|
||||
if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
|
||||
std::string str(it, temptative_end);
|
||||
const auto & magic_seed = out.healing_marker.marker = healing_marker;
|
||||
if (can_parse(str + "\"")) {
|
||||
// Was inside an string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
|
||||
// Was inside an string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
|
||||
} else {
|
||||
// TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
|
||||
// fprintf(stderr, "Closing: TODO\n");
|
||||
return false;
|
||||
}
|
||||
out.json = json::parse(str);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
out.json = json::parse(it, end);
|
||||
it = end;
|
||||
return true;
|
||||
}
|
||||
@@ -1,39 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
// TODO: use json_fwd.hpp when possible
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
|
||||
struct common_healing_marker {
|
||||
// Raw marker.
|
||||
std::string marker;
|
||||
|
||||
// Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
|
||||
std::string json_dump_marker;
|
||||
};
|
||||
|
||||
// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
|
||||
struct common_json {
|
||||
nlohmann::ordered_json json;
|
||||
|
||||
common_healing_marker healing_marker;
|
||||
};
|
||||
|
||||
// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
|
||||
//
|
||||
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
|
||||
// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
|
||||
// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
|
||||
//
|
||||
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
|
||||
bool common_json_parse(
|
||||
const std::string & input,
|
||||
const std::string & healing_marker,
|
||||
common_json & out);
|
||||
|
||||
// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
|
||||
bool common_json_parse(
|
||||
std::string::const_iterator & it,
|
||||
const std::string::const_iterator & end,
|
||||
const std::string & healing_marker,
|
||||
common_json & out);
|
||||
@@ -46,6 +46,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"DbrxForCausalLM": "dbrx",
|
||||
"DeciLMForCausalLM": "deci",
|
||||
"DeepseekForCausalLM": "deepseek",
|
||||
"DeepseekOCRForCausalLM": "deepseek",
|
||||
"DeepseekV2ForCausalLM": "deepseek",
|
||||
"DeepseekV3ForCausalLM": "deepseek",
|
||||
"DeepseekV32ForCausalLM": "deepseek",
|
||||
@@ -124,6 +125,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"LLaDAModelLM": "llada",
|
||||
"LLaMAForCausalLM": "llama",
|
||||
"Lfm25AudioTokenizer": "lfm2",
|
||||
"Lfm2BidirectionalModel": "lfm2",
|
||||
"Lfm2ForCausalLM": "lfm2",
|
||||
"Lfm2Model": "lfm2",
|
||||
"Lfm2MoeForCausalLM": "lfm2",
|
||||
@@ -232,6 +234,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"UMT5ForConditionalGeneration": "t5",
|
||||
"UMT5Model": "t5",
|
||||
"UltravoxModel": "ultravox",
|
||||
"UnlimitedOCRForCausalLM": "deepseek",
|
||||
"VLlama3ForCausalLM": "llama",
|
||||
"VoxtralForConditionalGeneration": "llama",
|
||||
"WavTokenizerDec": "wavtokenizer",
|
||||
@@ -298,6 +301,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"StepVLForConditionalGeneration": "step3",
|
||||
"Step3p7ForConditionalGeneration": "step3",
|
||||
"UltravoxModel": "ultravox",
|
||||
"UnlimitedOCRForCausalLM": "deepseek",
|
||||
"VoxtralForConditionalGeneration": "ultravox",
|
||||
"YoutuVLForConditionalGeneration": "youtuvl",
|
||||
}
|
||||
|
||||
+10
-2
@@ -14,7 +14,7 @@ from .base import MmprojModel, ModelBase, TextModel, gguf, logger
|
||||
from .qwen import QwenModel
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekOCRForCausalLM")
|
||||
@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
|
||||
class DeepseekOCRVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -205,6 +205,8 @@ class DeepseekModel(TextModel):
|
||||
@ModelBase.register(
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekOCRForCausalLM",
|
||||
"UnlimitedOCRForCausalLM",
|
||||
"KimiVLForConditionalGeneration",
|
||||
"KimiK25ForConditionalGeneration",
|
||||
"YoutuForCausalLM",
|
||||
@@ -224,7 +226,7 @@ class DeepseekV2Model(TextModel):
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# special handling for Deepseek OCR
|
||||
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
|
||||
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
|
||||
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
@@ -350,6 +352,12 @@ class DeepseekV2Model(TextModel):
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
# Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
|
||||
if is_ocr:
|
||||
sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
|
||||
if sliding_window:
|
||||
self.gguf_writer.add_sliding_window(sliding_window)
|
||||
|
||||
if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
|
||||
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
|
||||
|
||||
+10
-3
@@ -64,11 +64,17 @@ class LFM2Model(TextModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Lfm2Model")
|
||||
@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel")
|
||||
class LFM2ColBertModel(LFM2Model):
|
||||
model_arch = gguf.MODEL_ARCH.LFM2
|
||||
dense_tensor_name = "dense_2"
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
if self.hf_arch == "Lfm2BidirectionalModel":
|
||||
self.gguf_writer.add_causal_attention(False)
|
||||
self._try_set_pooling_type()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if not name.startswith(self.dense_tensor_name):
|
||||
name = "model." + name
|
||||
@@ -76,10 +82,11 @@ class LFM2ColBertModel(LFM2Model):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# dense tensor is stored in a separate safetensors file
|
||||
# optional dense tensor is stored in a separate safetensors file
|
||||
from safetensors.torch import load_file
|
||||
tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
|
||||
assert tensors_file.is_file()
|
||||
if not tensors_file.is_file():
|
||||
return
|
||||
tensor = load_file(tensors_file)["linear.weight"]
|
||||
self.gguf_writer.add_embedding_length_out(tensor.shape[0])
|
||||
yield f"{self.dense_tensor_name}.weight", tensor.clone()
|
||||
|
||||
@@ -24,7 +24,6 @@
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "ON",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
@@ -47,7 +46,6 @@
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "ON",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
@@ -73,7 +71,6 @@
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "OFF",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
|
||||
@@ -266,7 +266,6 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
|
||||
"ggml: OpenCL API version to target")
|
||||
|
||||
option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
|
||||
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
|
||||
|
||||
# toolchain for vulkan-shaders-gen
|
||||
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
|
||||
|
||||
+50
-23
@@ -3688,8 +3688,6 @@ static void ggml_compute_forward_norm_f32(
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
@@ -3703,25 +3701,49 @@ static void ggml_compute_forward_norm_f32(
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
||||
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
|
||||
char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
|
||||
|
||||
float sum = 0.0;
|
||||
ggml_vec_sum_f32(ne00, &sum, x);
|
||||
float mean = sum/ne00;
|
||||
if (nb00 == sizeof(float) && nb0 == sizeof(float)) {
|
||||
const float * xf = (const float *) x;
|
||||
|
||||
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
float variance = 0;
|
||||
float sum = 0.0;
|
||||
ggml_vec_sum_f32(ne00, &sum, xf);
|
||||
float mean = sum/ne00;
|
||||
|
||||
float * yf = (float *) y;
|
||||
float variance = 0;
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
mean = -mean;
|
||||
vDSP_vsadd(x, 1, &mean, y, 1, ne00);
|
||||
vDSP_measqv(y, 1, &variance, ne00);
|
||||
mean = -mean;
|
||||
vDSP_vsadd(xf, 1, &mean, yf, 1, ne00);
|
||||
vDSP_measqv(yf, 1, &variance, ne00);
|
||||
#else
|
||||
variance = ggml_vec_cvar_f32(ne00, y, x, mean);
|
||||
variance = ggml_vec_cvar_f32(ne00, yf, xf, mean);
|
||||
#endif //GGML_USE_ACCELERATE
|
||||
|
||||
const float scale = 1.0f/sqrtf(variance + eps);
|
||||
ggml_vec_scale_f32(ne00, y, scale);
|
||||
const float scale = 1.0f/sqrtf(variance + eps);
|
||||
ggml_vec_scale_f32(ne00, yf, scale);
|
||||
} else {
|
||||
float sum = 0.0;
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
sum += *(const float *) (x + i00*nb00);
|
||||
}
|
||||
const float mean = sum/ne00;
|
||||
|
||||
float variance = 0.0f;
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
const float v = *(const float *) (x + i00*nb00) - mean;
|
||||
*(float *) (y + i00*nb0) = v;
|
||||
variance += v * v;
|
||||
}
|
||||
variance /= ne00;
|
||||
|
||||
const float scale = 1.0f/sqrtf(variance + eps);
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
*(float *) (y + i00*nb0) *= scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4142,8 +4164,6 @@ static void ggml_compute_forward_l2_norm_f32(
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
@@ -4158,20 +4178,27 @@ static void ggml_compute_forward_l2_norm_f32(
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
||||
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
|
||||
|
||||
ggml_float sum = 0.0;
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
sum += (ggml_float)(x[i00] * x[i00]);
|
||||
const float xi = *(const float *) (x + i00*nb00);
|
||||
sum += (ggml_float)(xi * xi);
|
||||
}
|
||||
|
||||
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
memcpy(y, x, ne00 * sizeof(float));
|
||||
|
||||
const float scale = 1.0f/fmaxf(sqrtf(sum), eps);
|
||||
|
||||
ggml_vec_scale_f32(ne00, y, scale);
|
||||
char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
|
||||
|
||||
if (nb00 == sizeof(float) && nb0 == sizeof(float)) {
|
||||
memcpy(y, x, ne00 * sizeof(float));
|
||||
ggml_vec_scale_f32(ne00, (float *) y, scale);
|
||||
} else {
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
const float xi = *(const float *) (x + i00*nb00);
|
||||
*(float *) (y + i00*nb0) = xi * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5334,7 +5334,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_L2_NORM:
|
||||
return true;
|
||||
return ggml_is_contiguous_rows(op->src[0]);
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
break;
|
||||
|
||||
@@ -25,7 +25,6 @@ include(ExternalProject)
|
||||
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
|
||||
option(GGML_HEXAGON_FA_EXP2_HF "ggml-hexagon: use FP16 exp2 polynomial in FA softmax instead of F32 exp round-trip" OFF)
|
||||
set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
|
||||
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")
|
||||
|
||||
add_library(htp_iface OBJECT
|
||||
${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)
|
||||
@@ -72,15 +71,12 @@ function(build_htp_skel V)
|
||||
-DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
|
||||
-DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
|
||||
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
|
||||
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
|
||||
-DDSP_VERSION=${V}
|
||||
-DPREBUILT_LIB_DIR="toolv19_${V}")
|
||||
list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
|
||||
set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
build_htp_skel(v68)
|
||||
build_htp_skel(v69)
|
||||
build_htp_skel(v73)
|
||||
build_htp_skel(v75)
|
||||
build_htp_skel(v79)
|
||||
|
||||
+1359
-1274
File diff suppressed because it is too large
Load Diff
@@ -5,10 +5,12 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdio.h>
|
||||
#include "htp-ops.h"
|
||||
#include "htp/matmul-ops.h"
|
||||
|
||||
struct htp_opnode {
|
||||
ggml_tensor * node = nullptr;
|
||||
@@ -17,6 +19,13 @@ struct htp_opnode {
|
||||
|
||||
htp_op_code opcode = HTP_OP_INVALID;
|
||||
|
||||
std::vector<ggml_tensor *> extra_dsts;
|
||||
|
||||
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS] = {0};
|
||||
|
||||
htp_opnode(ggml_tensor * node = nullptr, std::vector<ggml_tensor *> fused = {}, htp_op_code opcode = HTP_OP_INVALID, std::vector<ggml_tensor *> extra_dsts = {})
|
||||
: node(node), fused(std::move(fused)), opcode(opcode), extra_dsts(std::move(extra_dsts)) {}
|
||||
|
||||
ggml_op op() const {
|
||||
return node->op;
|
||||
}
|
||||
@@ -25,6 +34,26 @@ struct htp_opnode {
|
||||
return fused.empty() ? node : fused.back();
|
||||
}
|
||||
|
||||
void add_fused(ggml_tensor * t, bool extra_dst = false) {
|
||||
fused.push_back(t);
|
||||
if (extra_dst) {
|
||||
extra_dsts.push_back(t);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const ggml_tensor *> get_outputs() const {
|
||||
std::vector<const ggml_tensor *> res;
|
||||
if (extra_dsts.empty()) {
|
||||
res.push_back(dst());
|
||||
} else {
|
||||
res.push_back(node);
|
||||
for (const auto * x : extra_dsts) {
|
||||
res.push_back(x);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
const ggml_tensor * src0() const {
|
||||
return node->src[0];
|
||||
}
|
||||
@@ -37,10 +66,6 @@ struct htp_opnode {
|
||||
return ggml_op_is_empty(node->op);
|
||||
}
|
||||
|
||||
void add_fused(ggml_tensor * t) {
|
||||
fused.push_back(t);
|
||||
}
|
||||
|
||||
bool stackable() const {
|
||||
switch (this->op()) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
@@ -131,87 +156,117 @@ struct htp_opformat {
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
char kparams[128];
|
||||
|
||||
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
|
||||
int format_tensor_dims(char * str, size_t max_size, const struct ggml_tensor * t) {
|
||||
if (!t) {
|
||||
return sprintf(str, "NONE");
|
||||
return snprintf(str, max_size, "NONE");
|
||||
}
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
||||
return snprintf(str, max_size, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
|
||||
} else {
|
||||
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
||||
return snprintf(str, max_size, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_dims(char * str, const htp_opnode & node) {
|
||||
void format_op_dims(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += format_tensor_dims(p, inputs[0]);
|
||||
p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[0]), (size_t)(p_end - p));
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_dims(p, inputs[i]);
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[i]), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
char self[64];
|
||||
format_tensor_dims(self, node.dst());
|
||||
p += sprintf(p, "%s", self);
|
||||
format_tensor_dims(self, sizeof(self), node.dst());
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
|
||||
int format_tensor_strides(char * str, size_t max_size, const struct ggml_tensor * t) {
|
||||
if (!t) {
|
||||
return sprintf(str, "NONE");
|
||||
return snprintf(str, max_size, "NONE");
|
||||
}
|
||||
const char * c = ggml_is_contiguous(t) ? "" : "!";
|
||||
|
||||
if (t->ne[2] == 1 && t->ne[3] == 1) {
|
||||
return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
|
||||
return snprintf(str, max_size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
|
||||
} else {
|
||||
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
|
||||
return snprintf(str, max_size, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_strides(char * str, const htp_opnode & node) {
|
||||
void format_op_strides(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += format_tensor_strides(p, inputs[0]);
|
||||
p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[0]), (size_t)(p_end - p));
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += format_tensor_strides(p, inputs[i]);
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[i]), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
char self[64];
|
||||
format_tensor_strides(self, node.dst());
|
||||
p += sprintf(p, "%s", self);
|
||||
format_tensor_strides(self, sizeof(self), node.dst());
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_types(char * str, const htp_opnode & node) {
|
||||
void format_op_types(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE");
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE");
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE"), (size_t)(p_end - p));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE"), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", ggml_type_name(node.dst()->type));
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", ggml_type_name(node.dst()->type)), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
const char * tensor_buff_name(const struct ggml_tensor * t) {
|
||||
@@ -221,51 +276,102 @@ struct htp_opformat {
|
||||
return "NONE";
|
||||
}
|
||||
|
||||
void format_op_buffs(char * str, const htp_opnode & node) {
|
||||
void format_op_buffs(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", tensor_buff_name(inputs[0]));
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", tensor_buff_name(inputs[i]));
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[0])), (size_t)(p_end - p));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[i])), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", tensor_buff_name(node.dst()));
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(node.dst())), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
void format_op_names(char * str, const htp_opnode & node) {
|
||||
void format_op_names(char * str, size_t max_size, const htp_opnode & node) {
|
||||
char * p = str;
|
||||
char * p_end = str + max_size;
|
||||
auto inputs = node.get_inputs();
|
||||
|
||||
if (!inputs.empty()) {
|
||||
p += sprintf(p, "%s", inputs[0] ? inputs[0]->name : "NONE");
|
||||
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
p += sprintf(p, " x ");
|
||||
p += sprintf(p, "%s", inputs[i] ? inputs[i]->name : "NONE");
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? inputs[0]->name : "NONE"), (size_t)(p_end - p));
|
||||
}
|
||||
|
||||
p += sprintf(p, " -> ");
|
||||
for (size_t i = 1; i < inputs.size(); i++) {
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
|
||||
}
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? inputs[i]->name : "NONE"), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
|
||||
p += sprintf(p, "%s", node.dst()->name);
|
||||
if (p < p_end) {
|
||||
p += std::min((size_t)snprintf(p, p_end - p, "%s", node.dst()->name), (size_t)(p_end - p));
|
||||
}
|
||||
}
|
||||
void format_kernel_params(char * str, size_t max_size, const htp_opnode & node) {
|
||||
if (node.opcode == HTP_OP_MUL_MAT || node.opcode == HTP_OP_MUL_MAT_ID ||
|
||||
node.opcode == HTP_OP_MUL_MAT_QKV || node.opcode == HTP_OP_MUL_MAT_FFN) {
|
||||
const auto * kparams = (const struct htp_mm_kernel_params *) node.kernel_params;
|
||||
const char * path = "unknown";
|
||||
int32_t type = kparams->kernel_type;
|
||||
if (type == HTP_MM_KERNEL_HMX_2D || type == HTP_MM_KERNEL_HMX_F16_BATCHED) {
|
||||
path = "hmx-tiled";
|
||||
} else if (type == HTP_MM_KERNEL_HVX_F16_F16_VTCM || type == HTP_MM_KERNEL_HVX_F32_F32_VTCM ||
|
||||
type == HTP_MM_KERNEL_HVX_QUANT_ROW || type == HTP_MM_KERNEL_HVX_QUANT_BLOCK) {
|
||||
path = "hvx-tiled";
|
||||
} else if (type == HTP_MM_KERNEL_HVX_F16_F16_DDR || type == HTP_MM_KERNEL_HVX_F16_F32_DDR ||
|
||||
type == HTP_MM_KERNEL_HVX_F32_F32_DDR || type == HTP_MM_KERNEL_HVX_F32_F16_DDR ||
|
||||
type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) {
|
||||
path = "hvx-flat";
|
||||
}
|
||||
snprintf(str, max_size, "%s vtcm %d", path, (int) kparams->vtcm_size);
|
||||
} else {
|
||||
snprintf(str, max_size, "----");
|
||||
}
|
||||
}
|
||||
|
||||
void format(const htp_opnode & node) {
|
||||
format_op_dims(dims, node);
|
||||
format_op_strides(strides, node);
|
||||
format_op_types(types, node);
|
||||
format_op_buffs(buffs, node);
|
||||
format_op_names(names, node);
|
||||
format_op_dims(dims, sizeof(dims), node);
|
||||
format_op_strides(strides, sizeof(strides), node);
|
||||
format_op_types(types, sizeof(types), node);
|
||||
format_op_buffs(buffs, sizeof(buffs), node);
|
||||
format_op_names(names, sizeof(names), node);
|
||||
format_kernel_params(kparams, sizeof(kparams), node);
|
||||
}
|
||||
|
||||
htp_opformat() {}
|
||||
htp_opformat() {
|
||||
strides[0] = '\0';
|
||||
dims[0] = '\0';
|
||||
types[0] = '\0';
|
||||
buffs[0] = '\0';
|
||||
names[0] = '\0';
|
||||
kparams[0] = '\0';
|
||||
}
|
||||
htp_opformat(const htp_opnode & node) { format(node); }
|
||||
};
|
||||
|
||||
|
||||
@@ -19,43 +19,9 @@ add_library(${HTP_LIB} SHARED
|
||||
htp_iface_skel.c
|
||||
worker-pool.c
|
||||
hex-dma.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
|
||||
FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
|
||||
|
||||
if (GGML_HEXAGON_FA_EXP2_HF)
|
||||
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
|
||||
endif()
|
||||
|
||||
# HMX acceleration: available on v73+ architectures
|
||||
set(HTP_HMX_VERSIONS v73 v75 v79 v81)
|
||||
list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
|
||||
|
||||
if (_hmx_idx GREATER_EQUAL 0)
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
)
|
||||
|
||||
# -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
|
||||
set_source_files_properties(
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
PROPERTIES COMPILE_OPTIONS "-mhmx"
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1)
|
||||
endif()
|
||||
|
||||
build_idl(htp_iface.idl ${HTP_LIB})
|
||||
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
hmx-queue.c
|
||||
flash-attn-ops.c
|
||||
hmx-flash-attn-ops.c
|
||||
matmul-ops.c
|
||||
binary-ops.c
|
||||
unary-ops.c
|
||||
@@ -63,7 +29,6 @@ target_sources(${HTP_LIB} PRIVATE
|
||||
softmax-ops.c
|
||||
act-ops.c
|
||||
rope-ops.c
|
||||
flash-attn-ops.c
|
||||
set-rows-ops.c
|
||||
get-rows-ops.c
|
||||
cpy-ops.c
|
||||
@@ -79,6 +44,17 @@ target_sources(${HTP_LIB} PRIVATE
|
||||
pad-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
|
||||
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>)
|
||||
|
||||
if (GGML_HEXAGON_FA_EXP2_HF)
|
||||
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
|
||||
endif()
|
||||
|
||||
build_idl(htp_iface.idl ${HTP_LIB})
|
||||
|
||||
set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
install(TARGETS ${HTP_LIB})
|
||||
|
||||
@@ -3,7 +3,7 @@ if (HEXAGON_TOOLCHAIN_INCLUDED)
|
||||
endif()
|
||||
set(HEXAGON_TOOLCHAIN_INCLUDED true)
|
||||
|
||||
#Cross Compiling for Hexagon
|
||||
# Cross Compiling for Hexagon
|
||||
set(HEXAGON TRUE)
|
||||
set(CMAKE_SYSTEM_NAME QURT)
|
||||
set(CMAKE_SYSTEM_PROCESSOR Hexagon)
|
||||
@@ -14,7 +14,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
|
||||
set(CUSTOM_RUNELF_PATH "")
|
||||
|
||||
#To fix backward compatibility with EAI addon.
|
||||
if (NOT HEXAGON_SDK_ROOT)
|
||||
set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
|
||||
endif()
|
||||
@@ -31,7 +30,6 @@ endif()
|
||||
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
|
||||
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
|
||||
|
||||
#Get the Binary extension of the Hexagon Toolchain
|
||||
if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
|
||||
set(HEXAGON_TOOLCHAIN_SUFFIX .exe)
|
||||
endif()
|
||||
@@ -48,12 +46,12 @@ set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES
|
||||
HEXAGON_TOOLS_ROOT
|
||||
)
|
||||
|
||||
#QURT Related includes and linker flags
|
||||
# QURT Related includes and linker flags
|
||||
set(V_ARCH ${HEXAGON_ARCH})
|
||||
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}")
|
||||
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}")
|
||||
|
||||
if( ${TREE} MATCHES PAKMAN )
|
||||
if (${TREE} MATCHES PAKMAN)
|
||||
set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}")
|
||||
endif()
|
||||
message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}")
|
||||
@@ -83,11 +81,9 @@ set(QURT_START_LINK_LIBS
|
||||
)
|
||||
STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}")
|
||||
|
||||
set(QURT_END_LINK_LIBS
|
||||
${TARGET_DIR}/fini.o
|
||||
)
|
||||
set(QURT_END_LINK_LIBS ${TARGET_DIR}/fini.o)
|
||||
|
||||
#Non QURT related includes and linker flags
|
||||
# Non QURT related includes and linker flags
|
||||
|
||||
set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}")
|
||||
|
||||
@@ -99,8 +95,10 @@ if (NOT NO_WRAP_MEM_API)
|
||||
set(WRAP_MEMALIGN -Wl,--wrap=memalign)
|
||||
endif()
|
||||
|
||||
set(ARCH_FLAGS "-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -mhmx")
|
||||
|
||||
set(PIC_SHARED_LD_FLAGS
|
||||
-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH}
|
||||
${ARCH_FLAGS}
|
||||
-G0
|
||||
-fpic
|
||||
-Wl,-Bsymbolic
|
||||
@@ -120,13 +118,13 @@ STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}")
|
||||
|
||||
set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}")
|
||||
|
||||
#System include paths
|
||||
# System include paths
|
||||
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs)
|
||||
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef)
|
||||
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs)
|
||||
|
||||
#LLVM toolchain setup
|
||||
#Compiler paths, options and architecture
|
||||
# LLVM toolchain setup
|
||||
# Compiler paths, options and architecture
|
||||
set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX})
|
||||
set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
|
||||
set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX})
|
||||
@@ -137,8 +135,8 @@ set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon)
|
||||
set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,")
|
||||
set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")
|
||||
|
||||
#Compiler Options
|
||||
set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
|
||||
# Compiler Options
|
||||
set(COMMON_FLAGS "${ARCH_FLAGS} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
|
||||
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
|
||||
|
||||
@@ -18,7 +18,8 @@
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hmx-ops.h"
|
||||
|
||||
int hmx_flash_attn_ext(struct htp_ops_context * octx);
|
||||
|
||||
// Must be multiple of 32
|
||||
#define FLASH_ATTN_BLOCK_SIZE (32 * 2)
|
||||
@@ -633,7 +634,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
// HMX path: head_dim multiple of 64, F16 KV, and no sinks
|
||||
if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 64 == 0 && v->ne[0] % 64 == 0 && octx->src[4] == NULL) {
|
||||
int ret = hmx_flash_attn_ext(octx);
|
||||
@@ -642,7 +642,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
}
|
||||
// VTCM too small or other failure -> fall through to HVX path
|
||||
}
|
||||
#endif
|
||||
|
||||
struct htp_fa_context factx;
|
||||
factx.octx = octx;
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
#ifndef HEX_COMMON_H
|
||||
#define HEX_COMMON_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifndef SIZE_MAX
|
||||
#define SIZE_MAX ((size_t)-1)
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
static inline uint32_t hex_ceil_pow2(uint32_t x) {
|
||||
if (x <= 1) { return 1; }
|
||||
int p = 2;
|
||||
x--;
|
||||
while (x >>= 1) { p <<= 1; }
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline size_t hmx_ceil_div(size_t num, size_t den) {
|
||||
return (num + den - 1) / den;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
|
||||
return ((size_t) addr & (align - 1)) == 0;
|
||||
}
|
||||
|
||||
static inline size_t hex_align_up(size_t v, size_t align) {
|
||||
return hmx_ceil_div(v, align) * align;
|
||||
}
|
||||
|
||||
static inline size_t hex_align_down(size_t v, size_t align) {
|
||||
return (v / align) * align;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
|
||||
uint32_t left_off = (size_t) addr & (chunk_size - 1);
|
||||
uint32_t right_off = left_off + n;
|
||||
return right_off <= chunk_size;
|
||||
}
|
||||
|
||||
static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
|
||||
return m * ((n + m - 1) / m);
|
||||
}
|
||||
|
||||
static inline size_t hex_smin(size_t a, size_t b) {
|
||||
return a < b ? a : b;
|
||||
}
|
||||
|
||||
static inline size_t hex_smax(size_t a, size_t b) {
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
static inline void hex_swap_ptr(void ** p1, void ** p2) {
|
||||
void * t = *p1;
|
||||
*p1 = *p2;
|
||||
*p2 = t;
|
||||
}
|
||||
|
||||
static inline bool hex_mul_overflow(size_t a, size_t b, size_t *out) {
|
||||
if (a != 0 && b > SIZE_MAX / a) return true;
|
||||
*out = a * b;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool hex_add_overflow(size_t a, size_t b, size_t *out) {
|
||||
if (a > SIZE_MAX - b) return true;
|
||||
*out = a + b;
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif // HEX_COMMON_H
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <hexagon_types.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include "hex-utils.h"
|
||||
|
||||
#include "hex-profile.h"
|
||||
|
||||
@@ -127,13 +128,8 @@ static inline dma_ptr dma_make_ptr(void *dst, const void *src)
|
||||
return p;
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ < 73
|
||||
static const uint32_t dma_src_l2_bypass_on = 1;
|
||||
static const uint32_t dma_dst_l2_bypass_on = 0;
|
||||
#else
|
||||
static const uint32_t dma_src_l2_bypass_on = 1;
|
||||
static const uint32_t dma_dst_l2_bypass_on = 1;
|
||||
#endif
|
||||
|
||||
static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) {
|
||||
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
|
||||
|
||||
@@ -11,14 +11,7 @@
|
||||
|
||||
#include "hex-fastdiv.h"
|
||||
#include "hex-dump.h"
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
#include "hex-common.h"
|
||||
|
||||
static inline uint64_t hex_get_cycles() {
|
||||
uint64_t cycles = 0;
|
||||
@@ -32,54 +25,6 @@ static inline uint64_t hex_get_pktcnt() {
|
||||
return pktcnt;
|
||||
}
|
||||
|
||||
static inline uint32_t hex_ceil_pow2(uint32_t x) {
|
||||
if (x <= 1) { return 1; }
|
||||
int p = 2;
|
||||
x--;
|
||||
while (x >>= 1) { p <<= 1; }
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline size_t hmx_ceil_div(size_t num, size_t den) {
|
||||
return (num + den - 1) / den;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
|
||||
return ((size_t) addr & (align - 1)) == 0;
|
||||
}
|
||||
|
||||
static inline size_t hex_align_up(size_t v, size_t align) {
|
||||
return hmx_ceil_div(v, align) * align;
|
||||
}
|
||||
|
||||
static inline size_t hex_align_down(size_t v, size_t align) {
|
||||
return (v / align) * align;
|
||||
}
|
||||
|
||||
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
|
||||
uint32_t left_off = (size_t) addr & (chunk_size - 1);
|
||||
uint32_t right_off = left_off + n;
|
||||
return right_off <= chunk_size;
|
||||
}
|
||||
|
||||
static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
|
||||
return m * ((n + m - 1) / m);
|
||||
}
|
||||
|
||||
static inline size_t hex_smin(size_t a, size_t b) {
|
||||
return a < b ? a : b;
|
||||
}
|
||||
|
||||
static inline size_t hex_smax(size_t a, size_t b) {
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
static inline void hex_swap_ptr(void ** p1, void ** p2) {
|
||||
void * t = *p1;
|
||||
*p1 = *p2;
|
||||
*p2 = t;
|
||||
}
|
||||
|
||||
static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
|
||||
const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
|
||||
Q6_l2fetch_AP((void *) p, control);
|
||||
|
||||
@@ -49,7 +49,7 @@
|
||||
// g_br = hex_align_up(gqa_factor * Br, 32) replaces Br for all Q/O/S/P/D dimensions.
|
||||
// Layout: Q + O_ping + O_pong + K_dma*2 + V_dma*2 + K_tile + V_tile + S + P + D + vectors + scales
|
||||
// Mask is DMA'd into a VTCM buffer (Br rows per KV block) to avoid DDR reads in softmax.
|
||||
static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool use_pipeline) {
|
||||
static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool pipeline) {
|
||||
const size_t g_br = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
|
||||
const size_t q_tile_size = hex_align_up(g_br * DK * sizeof(__fp16), 4096); // Q: [g_br, DK]
|
||||
const size_t o_tile_size = hex_align_up(g_br * DV * sizeof(__fp16), 4096); // O: [g_br, DV] x2 ping-pong
|
||||
@@ -70,7 +70,7 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
|
||||
+ k_dma_size * 2 // K DMA x2
|
||||
+ v_dma_size * 2 // V DMA x2
|
||||
+ k_tile_size * 1 // K tiles
|
||||
+ v_tile_size * (use_pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
|
||||
+ v_tile_size * (pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
|
||||
+ s_tile_size * 2 // S + P
|
||||
+ d_tile_size * 1 // D (diagonal matrix)
|
||||
+ col_vec_size * 4 // m_vec, l_vec, s_rowmax, p_rowsum
|
||||
@@ -290,7 +290,7 @@ static const int16_t d_tile_scatter_offsets[64] __attribute__((aligned(128))) =
|
||||
|
||||
struct hmx_fa_context {
|
||||
const struct htp_ops_context * octx;
|
||||
bool use_pipeline; // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
|
||||
bool pipeline; // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
|
||||
uint32_t n_threads;
|
||||
|
||||
// Op parameters
|
||||
@@ -409,7 +409,7 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data)
|
||||
return;
|
||||
}
|
||||
|
||||
__fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
|
||||
__fp16 * v_tiles_dest = factx->pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
|
||||
|
||||
struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
|
||||
htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
|
||||
@@ -1312,13 +1312,13 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
const size_t g_br = hex_align_up(G * Br, HMX_FP16_TILE_N_ROWS);
|
||||
|
||||
const uint32_t n_kv_blocks = (nek1 + Bc - 1) / Bc;
|
||||
const bool use_pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
|
||||
const bool pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
|
||||
|
||||
// Bypass thread pool dispatch for small prompts/non-pipelined prefill by setting n_threads = 1
|
||||
const uint32_t n_threads = use_pipeline ? n_threads_init : 1;
|
||||
const uint32_t n_threads = pipeline ? n_threads_init : 1;
|
||||
|
||||
FARF(HIGH, "hmx-fa: neq1=%u nek1=%u DK=%u DV=%u G=%u Br=%zu Bc=%zu g_br=%zu n_kv_blocks=%u pipeline=%d vtcm=%zu",
|
||||
neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, use_pipeline, vtcm_budget);
|
||||
neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, pipeline, vtcm_budget);
|
||||
|
||||
// ======== Build context ========
|
||||
struct hmx_fa_context factx;
|
||||
@@ -1339,7 +1339,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.n_kv_blocks = n_kv_blocks;
|
||||
factx.is_q_fp32 = (q->type == HTP_TYPE_F32);
|
||||
factx.is_dst_fp32 = (dst->type == HTP_TYPE_F32);
|
||||
factx.use_pipeline = use_pipeline;
|
||||
factx.pipeline = pipeline;
|
||||
factx.mask_broadcast = (mask != NULL && mask->ne[2] == 1);
|
||||
|
||||
// Extract op parameters (mutable during softcap adjustment, then stored as const in factx)
|
||||
@@ -1405,7 +1405,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.vtcm_v_fp16[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_dma_bytes);
|
||||
factx.vtcm_k_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, k_tile_bytes);
|
||||
factx.vtcm_v_tiles[0] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
|
||||
if (use_pipeline) {
|
||||
if (pipeline) {
|
||||
factx.vtcm_v_tiles[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
|
||||
} else {
|
||||
factx.vtcm_v_tiles[1] = NULL;
|
||||
@@ -1456,7 +1456,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
// ======== HMX lock strategy ========
|
||||
// Pipeline: queue thread auto-acquires HMX lock on first push; released by suspend.
|
||||
// Fallback: main thread holds the lock (original behavior).
|
||||
if (!factx.use_pipeline) {
|
||||
if (!factx.pipeline) {
|
||||
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
|
||||
}
|
||||
|
||||
@@ -1550,7 +1550,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
const size_t k_src_stride = size_k_row_padded / sizeof(__fp16);
|
||||
const size_t v_src_stride = size_v_row_padded / sizeof(__fp16);
|
||||
|
||||
if (factx.use_pipeline) {
|
||||
if (factx.pipeline) {
|
||||
// ==================================================================
|
||||
// Pipeline path: HVX phases ‖ HMX queue worker
|
||||
// ==================================================================
|
||||
@@ -1780,7 +1780,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br);
|
||||
|
||||
// HMX: O_final = diag(1/l) @ O_prev
|
||||
if (factx.use_pipeline) {
|
||||
if (factx.pipeline) {
|
||||
on_job.o_curr = o_tile_curr;
|
||||
on_job.o_prev = o_tile_prev;
|
||||
on_job.d_tiles = factx.vtcm_d_tiles;
|
||||
@@ -1826,7 +1826,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
} // end KV head loop
|
||||
} // end batch loop
|
||||
|
||||
if (factx.use_pipeline) {
|
||||
if (factx.pipeline) {
|
||||
hmx_queue_suspend(ctx->hmx_queue);
|
||||
} else {
|
||||
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,6 +0,0 @@
|
||||
// HMX operations compiled as a single translation unit.
|
||||
// This allows interprocedural optimizations within HMX ops without requiring global HTP LTO.
|
||||
|
||||
#include "hmx-queue.c"
|
||||
#include "hmx-matmul-ops.c"
|
||||
#include "hmx-flash-attn-ops.c"
|
||||
@@ -1,88 +0,0 @@
|
||||
// HMX operation entry-point declarations.
|
||||
// Ported from htp-ops-lib/include/dsp/ops.h (renamed, benchmark kernels removed). (https://github.com/haozixu/htp-ops-lib)
|
||||
|
||||
#ifndef HMX_OPS_H
|
||||
#define HMX_OPS_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "htp-ops.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
float *dst;
|
||||
const float *activation;
|
||||
const __fp16 *permuted_weight;
|
||||
int m;
|
||||
int k;
|
||||
int n;
|
||||
int act_stride;
|
||||
int weight_stride;
|
||||
int dst_stride;
|
||||
int ne02;
|
||||
int ne03;
|
||||
int ne12;
|
||||
int ne13;
|
||||
size_t src0_nb2;
|
||||
size_t src0_nb3;
|
||||
size_t src1_nb2;
|
||||
size_t src1_nb3;
|
||||
size_t dst_nb2;
|
||||
size_t dst_nb3;
|
||||
} hmx_matmul_f16_f32_batched_params_t;
|
||||
|
||||
// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
|
||||
// act_stride: activation row stride in elements (= k for contiguous, or
|
||||
// nb[1]/sizeof(float) for permuted tensors like attention Q).
|
||||
// weight_stride: weight row stride in elements (= k for compact weights, or
|
||||
// nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
|
||||
int hmx_matmul_f16_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const __fp16 *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int act_stride,
|
||||
int weight_stride);
|
||||
|
||||
// Batched F16 wrapper over hmx_mat_mul_f16_f32.
|
||||
// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
|
||||
int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
|
||||
|
||||
// HMX matrix multiplication — all supported weight types (F16/F32/Q4_0/Q4_1/Q8_0/IQ4_NL/MXFP4)
|
||||
int hmx_matmul_2d_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const uint8_t *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int act_stride,
|
||||
int weight_stride,
|
||||
int weight_type);
|
||||
|
||||
struct mmid_row_mapping;
|
||||
|
||||
int hmx_matmul_id_2d_f32(struct htp_context *ctx,
|
||||
float *restrict dst,
|
||||
const float *activation,
|
||||
const uint8_t *permuted_weight,
|
||||
int m, int k, int n,
|
||||
int ne11,
|
||||
size_t act_nb1, size_t act_nb2,
|
||||
size_t dst_nb1, size_t dst_nb2,
|
||||
int weight_stride,
|
||||
int weight_type,
|
||||
const struct mmid_row_mapping *matrix_rows,
|
||||
int cur_a,
|
||||
int mapping_stride);
|
||||
|
||||
// HMX flash attention
|
||||
int hmx_flash_attn_ext(struct htp_ops_context * octx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // HMX_OPS_H
|
||||
@@ -13,7 +13,9 @@
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifndef HTP_MAX_NTHREADS
|
||||
#define HTP_MAX_NTHREADS 10
|
||||
#endif
|
||||
#define HTP_MAX_MMAPS 16
|
||||
|
||||
// Memory mapping
|
||||
@@ -42,9 +44,13 @@ struct htp_ops_context {
|
||||
|
||||
enum htp_op_code op; // FIXME: rename to opcode
|
||||
int32_t op_params[HTP_OP_MAX_PARAMS];
|
||||
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS];
|
||||
|
||||
const struct htp_tensor * src[HTP_OP_MAX_INPUTS];
|
||||
const struct htp_tensor * dst;
|
||||
union {
|
||||
const struct htp_tensor * dst;
|
||||
const struct htp_tensor * dsts[HTP_OP_MAX_OUTPUTS];
|
||||
};
|
||||
|
||||
// TODO convert these to an array
|
||||
struct htp_spad src0_spad;
|
||||
@@ -87,13 +93,13 @@ struct htp_context {
|
||||
|
||||
struct htp_ops_context octx;
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
struct hmx_queue * hmx_queue; // Async HMX queue for pipeline overlap
|
||||
#endif
|
||||
};
|
||||
|
||||
int op_matmul(struct htp_ops_context * octx);
|
||||
int op_matmul_id(struct htp_ops_context * octx);
|
||||
int op_matmul_qkv(struct htp_ops_context * octx);
|
||||
int op_matmul_ffn(struct htp_ops_context * octx);
|
||||
int op_binary(struct htp_ops_context * octx);
|
||||
int op_unary(struct htp_ops_context * octx);
|
||||
int op_sum_rows(struct htp_ops_context * octx);
|
||||
|
||||
@@ -28,18 +28,19 @@ enum htp_data_type {
|
||||
HTP_TYPE_MXFP4 = 39,
|
||||
|
||||
// types used internally for repack, dyn.quant, etc
|
||||
HTP_TYPE_Q4_0x4x2 = 200,
|
||||
HTP_TYPE_Q4_1x4x2,
|
||||
HTP_TYPE_Q8_0x4x2,
|
||||
HTP_TYPE_MXFP4x4x2,
|
||||
HTP_TYPE_Q4_0_TILED = 200,
|
||||
HTP_TYPE_Q4_1_TILED,
|
||||
HTP_TYPE_Q8_0_TILED,
|
||||
HTP_TYPE_MXFP4_TILED,
|
||||
|
||||
HTP_TYPE_INVALID
|
||||
};
|
||||
|
||||
// Constats for internal types
|
||||
#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
|
||||
#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
|
||||
#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
|
||||
#define QK_Q4_0_TILED 256 // 32x32 Q4_0 tiled layout
|
||||
#define QK_Q8_0_TILED 128 // 32x32 Q8_0 tiled layout
|
||||
#define QK_MXFP4_TILED 256 // 32x32 MXFP4 tiled layout
|
||||
|
||||
|
||||
|
||||
// Mask to enable various stages of the Ops.
|
||||
@@ -57,6 +58,8 @@ enum htp_op_code {
|
||||
HTP_OP_DIV = 3,
|
||||
HTP_OP_MUL_MAT,
|
||||
HTP_OP_MUL_MAT_ID,
|
||||
HTP_OP_MUL_MAT_QKV,
|
||||
HTP_OP_MUL_MAT_FFN,
|
||||
HTP_OP_RMS_NORM,
|
||||
HTP_OP_RMS_NORM_MUL,
|
||||
HTP_OP_UNARY_SILU,
|
||||
@@ -99,7 +102,9 @@ enum htp_op_code {
|
||||
|
||||
#define HTP_OP_MAX_DIMS 4 // aka GGML_MAX_DIMS
|
||||
#define HTP_OP_MAX_INPUTS 6 // aka GGML_MAX_SRCS
|
||||
#define HTP_OP_MAX_OUTPUTS 4
|
||||
#define HTP_OP_MAX_PARAMS 16 // aka GGML_MAX_OP_PARAMS
|
||||
#define HTP_OP_MAX_KERN_PARAMS 32
|
||||
|
||||
#define HTP_OP_MAX_BUFS 16
|
||||
#define HTP_OP_MAX_REQS 256
|
||||
@@ -142,8 +147,10 @@ struct htp_op_desc {
|
||||
uint32_t opcode; // GGML/HTP Op
|
||||
uint32_t flags; // Op flags
|
||||
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
|
||||
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS]; // generic blob for host-precomputed parameters
|
||||
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
|
||||
uint16_t dst; // Output tensor index
|
||||
uint16_t dst[HTP_OP_MAX_OUTPUTS]; // Output tensor indices
|
||||
uint16_t pad[2]; // padding to align to 64 bits
|
||||
};
|
||||
|
||||
#ifndef HTP_MAX_NTHREADS
|
||||
|
||||
@@ -11,12 +11,13 @@ struct htp_iface_pmu_conf {
|
||||
};
|
||||
|
||||
interface htp_iface : remote_handle64 {
|
||||
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx, in uint64 max_vmem);
|
||||
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 n_hmx, in uint64 max_vmem);
|
||||
AEEResult stop();
|
||||
AEEResult mmap(in uint32 fd, in uint32 size);
|
||||
AEEResult munmap(in uint32 fd);
|
||||
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
|
||||
AEEResult etm(in uint32 enable);
|
||||
AEEResult hwinfo(rout uint32 n_threads, rout uint32 n_hvx, rout uint32 n_hmx, rout uint64 vtcm_size);
|
||||
};
|
||||
|
||||
#endif /* HTP_IDL */
|
||||
|
||||
@@ -170,25 +170,7 @@ static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) {
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Q6_Vsf_equals_Vw is only available on v73+.*/
|
||||
#if __HVX_ARCH__ < 73
|
||||
static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in)
|
||||
{
|
||||
HVX_Vector const vzero = Q6_V_vzero();
|
||||
HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
|
||||
HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
|
||||
HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
|
||||
HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
|
||||
HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
|
||||
HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
|
||||
{
|
||||
return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in));
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
|
||||
// This looks complicated.
|
||||
@@ -305,4 +287,17 @@ static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
|
||||
#endif // __HVX_ARCH__ < 79
|
||||
|
||||
static inline HVX_Vector hvx_vec_load_act_tile(const uint8_t * y_q, uint32_t kt, HVX_Vector * v_act_all) {
|
||||
if (kt % 4 == 0) {
|
||||
*v_act_all = hvx_vmem(y_q + kt * 32);
|
||||
return *v_act_all;
|
||||
} else if (kt % 4 == 1) {
|
||||
return Q6_V_vror_VR(*v_act_all, 32);
|
||||
} else if (kt % 4 == 2) {
|
||||
return Q6_V_vror_VR(*v_act_all, 64);
|
||||
} else {
|
||||
return Q6_V_vror_VR(*v_act_all, 96);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* HVX_BASE_H */
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -361,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) {
|
||||
static void htp_packet_callback(dspqueue_t queue, int error, void * context);
|
||||
static void htp_error_callback(dspqueue_t queue, int error, void * context);
|
||||
|
||||
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
|
||||
AEEResult htp_iface_start(remote_handle64 handle, uint32_t sess_id, uint64_t dsp_queue_id, uint32_t n_hvx, uint32_t n_hmx, uint64_t max_vmem) {
|
||||
struct htp_context * ctx = (struct htp_context *) handle;
|
||||
|
||||
if (!ctx) {
|
||||
@@ -395,10 +395,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
return AEE_ENOMEMORY;
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
ctx->hmx_enabled = use_hmx;
|
||||
ctx->hmx_enabled = n_hmx;
|
||||
ctx->hmx_queue = NULL;
|
||||
if (use_hmx) {
|
||||
if (n_hmx) {
|
||||
ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
|
||||
if (ctx->hmx_queue) {
|
||||
ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS];
|
||||
@@ -407,8 +406,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
ctx->hmx_enabled = false;
|
||||
}
|
||||
}
|
||||
FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
|
||||
#endif
|
||||
FARF(HIGH, "HMX %s (n_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", n_hmx);
|
||||
|
||||
qurt_sysenv_max_hthreads_t hw_threads;
|
||||
qurt_sysenv_get_max_hw_threads(&hw_threads);
|
||||
@@ -481,13 +479,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
||||
dma_queue_delete(ctx->dma[i]);
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
if (ctx->hmx_queue) {
|
||||
hmx_queue_delete(ctx->hmx_queue);
|
||||
ctx->hmx_queue = NULL;
|
||||
}
|
||||
ctx->hmx_enabled = false;
|
||||
#endif
|
||||
|
||||
vtcm_free(ctx);
|
||||
|
||||
@@ -500,6 +496,36 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_hwinfo(remote_handle64 handle, uint32_t * n_threads, uint32_t * n_hvx, uint32_t * n_hmx, uint64_t * vtcm_size) {
|
||||
(void)handle;
|
||||
if (!n_threads || !n_hvx || !n_hmx || !vtcm_size) {
|
||||
return AEE_EBADPARM;
|
||||
}
|
||||
|
||||
qurt_sysenv_max_hthreads_t hw_threads;
|
||||
qurt_sysenv_get_max_hw_threads(&hw_threads);
|
||||
uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
|
||||
|
||||
uint32_t n_hvx_val = hw_nhvx;
|
||||
if (n_hvx_val > hw_threads.max_hthreads) {
|
||||
n_hvx_val = hw_threads.max_hthreads;
|
||||
}
|
||||
if (n_hvx_val > HTP_MAX_NTHREADS) {
|
||||
n_hvx_val = HTP_MAX_NTHREADS;
|
||||
}
|
||||
|
||||
// for now we force n_threads == n_hvx
|
||||
*n_threads = n_hvx_val;
|
||||
*n_hvx = n_hvx_val;
|
||||
*n_hmx = 1;
|
||||
|
||||
uint32_t vtcm_sz = 8 * 1024 * 1024; // 8MB default fallback
|
||||
HAP_compute_res_query_VTCM(0, (unsigned int *)&vtcm_sz, NULL, NULL, NULL);
|
||||
*vtcm_size = vtcm_sz;
|
||||
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
static void htp_error_callback(dspqueue_t queue, int error, void * context) {
|
||||
// No errors expected on the DSP.
|
||||
FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
|
||||
@@ -554,6 +580,12 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_MUL_MAT_ID:
|
||||
return op_matmul_id(octx);
|
||||
|
||||
case HTP_OP_MUL_MAT_QKV:
|
||||
return op_matmul_qkv(octx);
|
||||
|
||||
case HTP_OP_MUL_MAT_FFN:
|
||||
return op_matmul_ffn(octx);
|
||||
|
||||
case HTP_OP_MUL:
|
||||
case HTP_OP_ADD:
|
||||
case HTP_OP_SUB:
|
||||
@@ -762,8 +794,9 @@ static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, str
|
||||
}
|
||||
}
|
||||
|
||||
static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
|
||||
static int proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
|
||||
memcpy(octx->op_params, op->params, sizeof(octx->op_params));
|
||||
memcpy(octx->kernel_params, op->kernel_params, sizeof(octx->kernel_params));
|
||||
octx->flags = op->flags;
|
||||
octx->op = op->opcode;
|
||||
|
||||
@@ -785,22 +818,41 @@ static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens,
|
||||
src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
|
||||
}
|
||||
|
||||
// Prep output tensor
|
||||
struct htp_tensor *dst = tens + op->dst;
|
||||
// Prep output tensors
|
||||
for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
|
||||
uint16_t dst_idx = op->dst[i];
|
||||
if (dst_idx == 0xffff) {
|
||||
octx->dsts[i] = NULL;
|
||||
continue;
|
||||
}
|
||||
struct htp_tensor *dst = tens + dst_idx;
|
||||
octx->dsts[i] = dst;
|
||||
|
||||
octx->dst = dst;
|
||||
FARF(HIGH, "prep-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, dst_idx, (void*) dst->data, dst->size,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
|
||||
}
|
||||
|
||||
FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
|
||||
dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
|
||||
int status = execute_op(octx);
|
||||
|
||||
(void) execute_op(octx);
|
||||
octx->src0_spad.src = NULL;
|
||||
octx->src1_spad.src = NULL;
|
||||
octx->src2_spad.src = NULL;
|
||||
octx->src3_spad.src = NULL;
|
||||
octx->dst_spad.src = NULL;
|
||||
|
||||
// flush buffers on output
|
||||
hex_l2flush((void *) dst->data, dst->size);
|
||||
dst->flags |= HTP_TENSOR_FLUSHED;
|
||||
for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
|
||||
if (octx->dsts[i]) {
|
||||
struct htp_tensor *dst = (struct htp_tensor *)octx->dsts[i];
|
||||
hex_l2flush((void *) dst->data, dst->size);
|
||||
dst->flags |= HTP_TENSOR_FLUSHED;
|
||||
|
||||
FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
|
||||
dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
|
||||
FARF(HIGH, "post-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, op->dst[i], (void*) dst->data, dst->size,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
#define DSPQUEUE_POLL_TIMEOUT_USEC 100
|
||||
@@ -892,20 +944,26 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
}
|
||||
}
|
||||
|
||||
int op_status = HTP_STATUS_OK;
|
||||
uint32_t op_wakeup = n_ops / 2; // half-way throgh the batch
|
||||
|
||||
for (uint32_t i=0; i < n_ops; i++) {
|
||||
struct profile_data prof;
|
||||
|
||||
if (i == (n_ops-1)) {
|
||||
// wake up the host before starting the last op
|
||||
if (i == op_wakeup) {
|
||||
dspqueue_write_early_wakeup_noblock(queue, 0, 0);
|
||||
}
|
||||
|
||||
profile_start(ctx->profiler, &prof);
|
||||
|
||||
proc_op_req(octx, tens, i, &ops[i]);
|
||||
op_status = proc_op_req(octx, tens, i, &ops[i]);
|
||||
|
||||
profile_stop(ctx->profiler, &prof);
|
||||
|
||||
if (op_status != HTP_STATUS_OK) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ctx->profiler) {
|
||||
pds[i].opcode = ops[i].opcode;
|
||||
pds[i].usecs = prof.usecs;
|
||||
@@ -919,7 +977,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
|
||||
struct htp_opbatch_rsp rsp;
|
||||
rsp.id = req.id;
|
||||
rsp.status = HTP_STATUS_OK;
|
||||
rsp.status = op_status;
|
||||
rsp.n_bufs = n_bufs;
|
||||
rsp.n_tensors = n_tens;
|
||||
rsp.n_ops = n_ops;
|
||||
|
||||
+2729
-4117
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,508 @@
|
||||
#ifndef HTP_MATMUL_OPS_H
|
||||
#define HTP_MATMUL_OPS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include "htp-ops.h"
|
||||
#include "hex-fastdiv.h"
|
||||
#include "hex-common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// --- HMX Tile Constraints ---
|
||||
#define HTP_MM_HMX_TILE_N_COLS 32
|
||||
#define HTP_MM_HMX_TILE_N_ROWS 32
|
||||
#define HTP_MM_HMX_TILE_SIZE (32 * 32 * sizeof(__fp16)) // 2048 bytes
|
||||
#define HTP_MM_HMX_TILE_N_ELMS 1024
|
||||
#define HTP_MM_HMX_MIN_NROWS 4
|
||||
|
||||
// --- Weight Repacked Tile Sizes ---
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_Q4_0 576
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_Q4_1 640
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_Q8_0 1088
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_IQ4_NL 576
|
||||
#define HTP_MM_WEIGHT_TILE_SIZE_MXFP4 544
|
||||
|
||||
// --- Weight Repacked Aligned Tile Sizes ---
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0 640
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1 640
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0 1152
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_IQ4_NL 640
|
||||
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4 640
|
||||
|
||||
// --- Activation Tiled Block Sizes (including padding) ---
|
||||
#define HTP_MM_ACT_TILE_SIZE_Q8_0 1152
|
||||
#define HTP_MM_ACT_TILE_SIZE_Q8_1 1280
|
||||
|
||||
#define HTP_MM_MAX_PREFETCH 16
|
||||
|
||||
// --- Solver Cost Model Penalty Weights (HMX-specific) ---
|
||||
#define HTP_MM_HMX_COST_W_DEQUANT 3 // cost penalty for quantized weight loading/dequantization
|
||||
#define HTP_MM_HMX_COST_A_CONVERT 2 // cost penalty for activation loading/conversion
|
||||
|
||||
// --- DMA Activation Transfer Configuration ---
|
||||
#define HTP_MM_DMA_ACT_ROWS_PER_STEP 2
|
||||
#define HTP_MM_DMA_ACT_MULTIPLIER 4
|
||||
|
||||
enum htp_mm_kernel_type {
|
||||
HTP_MM_KERNEL_UNSUPPORTED = 0,
|
||||
|
||||
// HMX paths
|
||||
HTP_MM_KERNEL_HMX_2D,
|
||||
HTP_MM_KERNEL_HMX_F16_BATCHED,
|
||||
|
||||
// HVX floating-point paths
|
||||
HTP_MM_KERNEL_HVX_F16_F16_VTCM,
|
||||
HTP_MM_KERNEL_HVX_F16_F16_DDR,
|
||||
HTP_MM_KERNEL_HVX_F16_F32_DDR,
|
||||
|
||||
HTP_MM_KERNEL_HVX_F32_F32_VTCM,
|
||||
HTP_MM_KERNEL_HVX_F32_F32_DDR,
|
||||
HTP_MM_KERNEL_HVX_F32_F16_DDR,
|
||||
|
||||
// HVX quantized paths
|
||||
HTP_MM_KERNEL_HVX_QUANT_ROW, // standard row-wise parallel quantization
|
||||
HTP_MM_KERNEL_HVX_QUANT_BLOCK, // parallel block-wise quantization
|
||||
HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT, // row-wise fallback flat quantization
|
||||
};
|
||||
|
||||
// Op-specific struct for precomputed matmul params
|
||||
struct htp_mm_kernel_params {
|
||||
int32_t kernel_type; // enum htp_mm_kernel_type
|
||||
int32_t pipeline; // 1 = pipelined execution, 0 = standard
|
||||
int32_t m_chunk; // Row chunk size (M chunk)
|
||||
int32_t n_chunk; // Col chunk size (N chunk)
|
||||
int32_t n_threads; // Number of threads to spawn
|
||||
int32_t n_act_threads; // Number of threads for activation preparation
|
||||
int32_t n_hmx; // 1 = use HMX, 0 = use HVX
|
||||
int32_t n_prefetch; // Prefetch lookahead buffers/rows in VTCM
|
||||
int32_t tile_size; // Weight tile size
|
||||
int32_t aligned_tile_size; // Aligned weight tile size (padded to 128)
|
||||
int32_t src1_row_size; // Row size for quantized activation
|
||||
int32_t vtcm_size; // Total required scratchpad size in VTCM
|
||||
int32_t vtcm_src0_size; // src0 scratchpad size in VTCM
|
||||
int32_t vtcm_src1_size; // src1 scratchpad size in VTCM
|
||||
int32_t vtcm_src2_size; // src2 scratchpad size in VTCM (fused only)
|
||||
int32_t vtcm_src3_size; // src3 scratchpad size in VTCM (fused only)
|
||||
int32_t vtcm_dst_size; // dst scratchpad size in VTCM
|
||||
|
||||
// Precomputed division values
|
||||
struct fastdiv_values div_ne12_ne1;
|
||||
struct fastdiv_values div_ne1;
|
||||
struct fastdiv_values div_r2;
|
||||
struct fastdiv_values div_r3;
|
||||
struct fastdiv_values div_ne11;
|
||||
};
|
||||
|
||||
#if defined(__cplusplus)
|
||||
static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob");
|
||||
#else
|
||||
_Static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob");
|
||||
#endif
|
||||
|
||||
struct mmid_row_mapping {
|
||||
uint32_t i1;
|
||||
uint32_t i2;
|
||||
};
|
||||
|
||||
// Search for optimal (mc, nc) chunk sizes within VTCM budget.
|
||||
static inline int htp_mm_hmx_compute_chunks(size_t vtcm_total,
|
||||
size_t overhead,
|
||||
size_t per_n_cost,
|
||||
size_t per_m_cost,
|
||||
size_t per_mn_cost,
|
||||
size_t m,
|
||||
size_t n,
|
||||
size_t m_block_cost,
|
||||
size_t n_block_cost,
|
||||
size_t * m_chunk_out,
|
||||
size_t * n_chunk_out,
|
||||
size_t * total_out) {
|
||||
if (m == 0 || n == 0) return -1;
|
||||
if (vtcm_total <= overhead) return -1;
|
||||
if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;
|
||||
|
||||
const size_t usable = vtcm_total - overhead;
|
||||
|
||||
size_t best_cost = SIZE_MAX;
|
||||
size_t best_mn = 0;
|
||||
size_t best_m = 0, best_n = 0;
|
||||
|
||||
const size_t n_max = hex_align_down((size_t)n, HTP_MM_HMX_TILE_N_COLS);
|
||||
for (size_t nc = n_max; nc >= HTP_MM_HMX_TILE_N_COLS; nc -= HTP_MM_HMX_TILE_N_COLS) {
|
||||
size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
|
||||
if (hex_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
|
||||
if (n_fixed >= usable) goto next_nc;
|
||||
|
||||
if (hex_mul_overflow(nc, per_mn_cost, &ncmn)) goto next_nc;
|
||||
if (hex_add_overflow(per_m_cost, ncmn, &mc_denom) || mc_denom == 0) goto next_nc;
|
||||
|
||||
{
|
||||
size_t remain = usable - n_fixed;
|
||||
size_t mc = remain / mc_denom;
|
||||
mc = hex_align_down(mc, HTP_MM_HMX_TILE_N_ROWS);
|
||||
mc = hex_smin(mc, m);
|
||||
|
||||
if (mc == 0) {
|
||||
goto next_nc;
|
||||
}
|
||||
|
||||
size_t mblocks = ((size_t) m + mc - 1) / mc;
|
||||
size_t nblocks = ((size_t) n + nc - 1) / nc;
|
||||
size_t cost = mblocks * m_block_cost + nblocks * n_block_cost;
|
||||
size_t mn = mc * nc;
|
||||
if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
|
||||
best_cost = cost;
|
||||
best_mn = mn;
|
||||
best_m = mc;
|
||||
best_n = nc;
|
||||
}
|
||||
}
|
||||
|
||||
next_nc:
|
||||
if (nc == HTP_MM_HMX_TILE_N_COLS) break; // avoid size_t underflow
|
||||
}
|
||||
|
||||
if (best_m == 0 || best_n == 0) return -1;
|
||||
|
||||
// Compute exact total (with overflow checks)
|
||||
size_t t0 = 0, t1 = 0, t2 = 0, mn = 0, total = 0;
|
||||
if (hex_mul_overflow(best_n, per_n_cost, &t0)) return -1;
|
||||
if (hex_mul_overflow(best_m, per_m_cost, &t1)) return -1;
|
||||
if (hex_mul_overflow(best_m, best_n, &mn)) return -1;
|
||||
if (hex_mul_overflow(mn, per_mn_cost, &t2)) return -1;
|
||||
if (hex_add_overflow(t0, t1, &total)) return -1;
|
||||
if (hex_add_overflow(total, t2, &total)) return -1;
|
||||
if (hex_add_overflow(total, overhead, &total)) return -1;
|
||||
|
||||
*m_chunk_out = best_m;
|
||||
*n_chunk_out = best_n;
|
||||
*total_out = total;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// --- Tile Size Helpers ---
|
||||
static inline uint32_t htp_mm_get_weight_tile_size(int weight_type) {
|
||||
switch (weight_type) {
|
||||
case HTP_TYPE_Q4_0:
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
return HTP_MM_WEIGHT_TILE_SIZE_Q4_0;
|
||||
case HTP_TYPE_Q4_1:
|
||||
return HTP_MM_WEIGHT_TILE_SIZE_Q4_1;
|
||||
case HTP_TYPE_Q8_0:
|
||||
return HTP_MM_WEIGHT_TILE_SIZE_Q8_0;
|
||||
case HTP_TYPE_MXFP4:
|
||||
return HTP_MM_WEIGHT_TILE_SIZE_MXFP4;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint32_t htp_mm_get_weight_aligned_tile_size(int weight_type) {
|
||||
switch (weight_type) {
|
||||
case HTP_TYPE_Q4_0:
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0;
|
||||
case HTP_TYPE_Q4_1:
|
||||
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1;
|
||||
case HTP_TYPE_Q8_0:
|
||||
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0;
|
||||
case HTP_TYPE_MXFP4:
|
||||
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// --- Activation/Row Size Helpers ---
|
||||
static inline size_t htp_mm_q8_0_tiled_row_size(uint32_t ne) {
|
||||
const uint32_t ne_padded = ((ne + 127) / 128) * 128;
|
||||
const uint32_t nb_32 = ne_padded / 32;
|
||||
return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_0;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_q8_1_tiled_row_size(uint32_t ne) {
|
||||
const uint32_t ne_padded = ((ne + 127) / 128) * 128;
|
||||
const uint32_t nb_32 = ne_padded / 32;
|
||||
return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_1;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_q8_0_flat_row_size(uint32_t ne) {
|
||||
const uint32_t quants_size = hex_align_up(ne, 128);
|
||||
const uint32_t num_scales = (ne + 31) / 32;
|
||||
const uint32_t scales_size = hex_align_up(num_scales * 2, 128);
|
||||
return quants_size + scales_size;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_q8_1_flat_row_size(uint32_t ne) {
|
||||
const uint32_t quants_size = hex_align_up(ne, 128);
|
||||
const uint32_t num_scales = (ne + 31) / 32;
|
||||
const uint32_t scales_size = hex_align_up(num_scales * 4, 128);
|
||||
return quants_size + scales_size;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_get_tiled_row_stride(int weight_type, uint32_t k) {
|
||||
uint32_t nb = (k + QK_Q4_0_TILED - 1) / QK_Q4_0_TILED;
|
||||
switch (weight_type) {
|
||||
case HTP_TYPE_Q4_0:
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
case HTP_TYPE_Q4_1:
|
||||
case HTP_TYPE_Q8_0:
|
||||
case HTP_TYPE_MXFP4:
|
||||
return (size_t) nb * htp_mm_get_weight_tile_size(weight_type);
|
||||
case HTP_TYPE_F16:
|
||||
return (size_t) k * sizeof(__fp16);
|
||||
case HTP_TYPE_F32:
|
||||
return (size_t) k * sizeof(float);
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_round_up(size_t n, size_t m) {
|
||||
return ((n + m - 1) / m) * m;
|
||||
}
|
||||
|
||||
static inline bool htp_mm_hmx_pipeline(uint32_t m) {
|
||||
return m > 32;
|
||||
}
|
||||
|
||||
static inline void htp_mm_hmx_get_2d_chunk_costs(
|
||||
int wtype, uint32_t k, bool pipeline, uint32_t aligned_tile_size,
|
||||
size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
|
||||
) {
|
||||
const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
|
||||
const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
|
||||
const size_t vec_dot_size = k * sizeof(uint16_t);
|
||||
const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
|
||||
const size_t qweight_row_stride = is_quant ? (size_t)(n_k_tiles * aligned_tile_size) / 32 : 0;
|
||||
|
||||
*size_per_n_out = (pipeline ? 2 : 1) * (is_quant ? qweight_row_stride : row_stride) +
|
||||
(pipeline ? 2 * vec_dot_size : vec_dot_size);
|
||||
*size_per_m_out = vec_dot_size;
|
||||
*size_per_mn_out = (pipeline ? 2 : 1) * sizeof(uint16_t);
|
||||
}
|
||||
|
||||
static inline void htp_mm_hmx_get_batched_chunk_costs(
|
||||
uint32_t k, uint32_t group_size,
|
||||
size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
|
||||
) {
|
||||
const size_t vec_dot_size = k * sizeof(uint16_t);
|
||||
*size_per_n_out = 3 * vec_dot_size;
|
||||
*size_per_m_out = group_size * vec_dot_size;
|
||||
*size_per_mn_out = sizeof(uint16_t);
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_hmx_get_2d_vtcm_size(
|
||||
int wtype, uint32_t k, size_t mc, size_t nc, bool pipeline, uint32_t act_threads, uint32_t aligned_tile_size
|
||||
) {
|
||||
const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
|
||||
const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
|
||||
const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
|
||||
const size_t vec_dot_size = k * sizeof(uint16_t);
|
||||
|
||||
const size_t act_f32_size = htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE);
|
||||
size_t weight_area_size = is_quant
|
||||
? htp_mm_round_up((nc / 32) * n_k_tiles * aligned_tile_size, HTP_MM_HMX_TILE_SIZE)
|
||||
: htp_mm_round_up(nc * row_stride, HTP_MM_HMX_TILE_SIZE);
|
||||
if (pipeline) {
|
||||
weight_area_size *= 2;
|
||||
}
|
||||
const size_t act_area_size = htp_mm_round_up(mc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
|
||||
const size_t output_area_size = htp_mm_round_up(mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
|
||||
|
||||
size_t scratch0_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
|
||||
size_t scratch1_size = pipeline ? scratch0_size : 0;
|
||||
size_t scratch2_size = pipeline ? output_area_size : 0;
|
||||
|
||||
return weight_area_size + act_area_size + act_f32_size + output_area_size +
|
||||
scratch0_size + scratch1_size + scratch2_size + 256;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_hmx_get_batched_vtcm_size(
|
||||
int wtype, uint32_t k, size_t mc, size_t nc, uint32_t group_size, bool use_dma_activation, bool pipeline, uint32_t act_threads) {
|
||||
(void)wtype;
|
||||
(void)pipeline;
|
||||
const size_t vec_dot_size = k * sizeof(uint16_t);
|
||||
const size_t f32_scratch_size = use_dma_activation
|
||||
? htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE) : 0;
|
||||
|
||||
const size_t act_head_stride = mc * k;
|
||||
const size_t weight_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
|
||||
const size_t act_area_size = htp_mm_round_up(group_size * act_head_stride * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
|
||||
const size_t output_area_size = htp_mm_round_up(group_size * mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
|
||||
const size_t scratch_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
|
||||
|
||||
return weight_area_size + act_area_size + output_area_size +
|
||||
2 * scratch_area_size + 256 + f32_scratch_size;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_hvx_get_vtcm_sizes(
|
||||
int kernel_type,
|
||||
int wtype,
|
||||
uint32_t ne10, // k
|
||||
uint32_t src1_nrows, // m_total (or act_nrows)
|
||||
uint32_t n_threads,
|
||||
size_t dst_row_size,
|
||||
size_t src0_row_size,
|
||||
size_t src1_row_size,
|
||||
uint32_t n_prefetch,
|
||||
size_t * vtcm_src0_size_out,
|
||||
size_t * vtcm_src1_size_out,
|
||||
size_t * vtcm_dst_size_out
|
||||
) {
|
||||
size_t vtcm_src0_size = 0;
|
||||
size_t vtcm_src1_size = 0;
|
||||
size_t vtcm_dst_size = 0;
|
||||
|
||||
const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
|
||||
wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
|
||||
wtype == HTP_TYPE_MXFP4);
|
||||
|
||||
const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
|
||||
const size_t dst_nrows = (src1_nrows > 1) ? 0 : 1;
|
||||
|
||||
switch (kernel_type) {
|
||||
case HTP_MM_KERNEL_HVX_F16_F16_VTCM: {
|
||||
size_t f16_src1_row_size = htp_mm_round_up(ne10 * 2, 128);
|
||||
vtcm_src1_size = htp_mm_round_up(f16_src1_row_size * src1_nrows, 256);
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
|
||||
break;
|
||||
}
|
||||
case HTP_MM_KERNEL_HVX_F16_F32_DDR:
|
||||
case HTP_MM_KERNEL_HVX_F16_F16_DDR:
|
||||
case HTP_MM_KERNEL_HVX_F32_F32_DDR:
|
||||
case HTP_MM_KERNEL_HVX_F32_F16_DDR: {
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size, 256) * n_threads;
|
||||
vtcm_src1_size = htp_mm_round_up(n_prefetch * src1_row_size, 256) * n_threads;
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
|
||||
break;
|
||||
}
|
||||
case HTP_MM_KERNEL_HVX_F32_F32_VTCM: {
|
||||
size_t f32_src1_row_size = htp_mm_round_up(ne10 * 4, 128);
|
||||
vtcm_src1_size = htp_mm_round_up(f32_src1_row_size * src1_nrows, 256);
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
|
||||
break;
|
||||
}
|
||||
case HTP_MM_KERNEL_HVX_QUANT_BLOCK:
|
||||
case HTP_MM_KERNEL_HVX_QUANT_ROW: {
|
||||
size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10);
|
||||
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
|
||||
vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
|
||||
|
||||
// src0 spad is also used in dynamic quantizer to store padded src1 rows
|
||||
size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, QK_Q8_0_TILED * sizeof(float));
|
||||
if (vtcm_src0_size < src1_row_size_padded) {
|
||||
vtcm_src0_size = src1_row_size_padded;
|
||||
}
|
||||
|
||||
vtcm_src0_size = vtcm_src0_size * n_threads;
|
||||
vtcm_dst_size = vtcm_dst_size * n_threads;
|
||||
|
||||
if (is_repack) {
|
||||
uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
|
||||
uint32_t n_k_tiles = ne10 / 32;
|
||||
uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
|
||||
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
|
||||
if (repacked_vtcm_size < src1_row_size_padded) {
|
||||
repacked_vtcm_size = src1_row_size_padded;
|
||||
}
|
||||
vtcm_src0_size = repacked_vtcm_size * n_threads;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT: {
|
||||
size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10);
|
||||
|
||||
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
|
||||
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
|
||||
vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
|
||||
|
||||
size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, 256);
|
||||
if (vtcm_src0_size < src1_row_size_padded) {
|
||||
vtcm_src0_size = src1_row_size_padded;
|
||||
}
|
||||
|
||||
vtcm_src0_size = vtcm_src0_size * n_threads;
|
||||
vtcm_dst_size = vtcm_dst_size * n_threads;
|
||||
|
||||
if (is_repack) {
|
||||
uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
|
||||
uint32_t n_k_tiles = ne10 / 32;
|
||||
uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
|
||||
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
|
||||
if (repacked_vtcm_size < src1_row_size_padded) {
|
||||
repacked_vtcm_size = src1_row_size_padded;
|
||||
}
|
||||
vtcm_src0_size = repacked_vtcm_size * n_threads;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
*vtcm_src0_size_out = vtcm_src0_size;
|
||||
*vtcm_src1_size_out = vtcm_src1_size;
|
||||
*vtcm_dst_size_out = vtcm_dst_size;
|
||||
|
||||
return vtcm_src0_size + vtcm_src1_size + vtcm_dst_size;
|
||||
}
|
||||
|
||||
static inline size_t htp_mm_hvx_id_get_vtcm_sizes(
|
||||
int wtype,
|
||||
uint32_t ne10, // k
|
||||
uint32_t src1_nrows,
|
||||
uint32_t n_threads,
|
||||
size_t src0_row_size, // nb01
|
||||
uint32_t n_prefetch,
|
||||
size_t * vtcm_src0_size_out,
|
||||
size_t * vtcm_src1_size_out
|
||||
) {
|
||||
const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
|
||||
wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
|
||||
wtype == HTP_TYPE_MXFP4);
|
||||
|
||||
const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
|
||||
const size_t src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10)
|
||||
: htp_mm_q8_0_tiled_row_size(ne10);
|
||||
|
||||
size_t src0_sz_per_thread = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
|
||||
size_t src1_sz = htp_mm_round_up(src1_row_size * src1_nrows, 256);
|
||||
|
||||
// src0 spad also holds temporary transposed src1 columns during dynamic quantization.
|
||||
const size_t src1_row_size_padded = htp_mm_round_up(src1_row_size, QK_Q8_0_TILED * sizeof(float));
|
||||
if (src0_sz_per_thread < src1_row_size_padded) {
|
||||
src0_sz_per_thread = src1_row_size_padded;
|
||||
}
|
||||
|
||||
if (is_repack) {
|
||||
const uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
|
||||
const uint32_t n_k_tiles = ne10 / 32;
|
||||
const uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
|
||||
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
|
||||
if (repacked_vtcm_size < src1_row_size_padded) {
|
||||
repacked_vtcm_size = src1_row_size_padded;
|
||||
}
|
||||
src0_sz_per_thread = repacked_vtcm_size;
|
||||
}
|
||||
|
||||
const size_t vtcm_src0_size = src0_sz_per_thread * n_threads;
|
||||
|
||||
*vtcm_src0_size_out = vtcm_src0_size;
|
||||
*vtcm_src1_size_out = src1_sz;
|
||||
|
||||
return vtcm_src0_size + src1_sz;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // HTP_MATMUL_OPS_H
|
||||
@@ -14,8 +14,6 @@ Drivers_Dir = 13
|
||||
1 = %DiskId%
|
||||
|
||||
[SourceDisksFiles]
|
||||
libggml-htp-v68.so = 1
|
||||
libggml-htp-v69.so = 1
|
||||
libggml-htp-v73.so = 1
|
||||
libggml-htp-v75.so = 1
|
||||
libggml-htp-v79.so = 1
|
||||
@@ -28,8 +26,6 @@ ExcludeFromSelect = *
|
||||
CopyFiles=Drivers_Dir
|
||||
|
||||
[Drivers_Dir]
|
||||
libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
|
||||
|
||||
@@ -10152,14 +10152,8 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
GGML_TENSOR_LOCALS(int, ne0, src0, ne);
|
||||
GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
|
||||
|
||||
const int nth = MIN(64, ne00);
|
||||
|
||||
@@ -10173,11 +10167,12 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float)*nth, NULL));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
@@ -24,6 +24,7 @@ kernel void kernel_norm(
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
@@ -43,7 +44,8 @@ kernel void kernel_norm(
|
||||
// parallel sum
|
||||
sum[get_local_id(0)] = 0.0f;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
sum[get_local_id(0)] += x[i00];
|
||||
// this kernel handles float, nb00/4 translates byte offset to element offset
|
||||
sum[get_local_id(0)] += x[i00*nb00/4];
|
||||
}
|
||||
// reduce
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -60,7 +62,8 @@ kernel void kernel_norm(
|
||||
global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
sum[get_local_id(0)] = 0.0f;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
y[i00] = x[i00] - mean;
|
||||
// this kernel handles float, nb00/4 translates byte offset to element offset
|
||||
y[i00] = x[i00*nb00/4] - mean;
|
||||
sum[get_local_id(0)] += y[i00] * y[i00];
|
||||
}
|
||||
|
||||
|
||||
@@ -493,6 +493,20 @@ struct vk_conv2d_pipeline_state {
|
||||
}
|
||||
};
|
||||
|
||||
struct vk_conv3d_pipeline_state {
|
||||
vk_conv3d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t s2, uint32_t p0, uint32_t p1, uint32_t p2,
|
||||
uint32_t d0, uint32_t d1, uint32_t d2, uint32_t KW, uint32_t KH, uint32_t KD, uint32_t aligned)
|
||||
: s0(s0), s1(s1), s2(s2), p0(p0), p1(p1), p2(p2), d0(d0), d1(d1), d2(d2), KW(KW), KH(KH), KD(KD), aligned(aligned) {}
|
||||
|
||||
uint32_t s0, s1, s2, p0, p1, p2, d0, d1, d2, KW, KH, KD;
|
||||
uint32_t aligned;
|
||||
|
||||
bool operator<(const vk_conv3d_pipeline_state &b) const {
|
||||
return std::tie(s0, s1, s2, p0, p1, p2, d0, d1, d2, KW, KH, KD, aligned) <
|
||||
std::tie(b.s0, b.s1, b.s2, b.p0, b.p1, b.p2, b.d0, b.d1, b.d2, b.KW, b.KH, b.KD, b.aligned);
|
||||
}
|
||||
};
|
||||
|
||||
struct vk_solve_tri_pipeline_state {
|
||||
vk_solve_tri_pipeline_state(uint32_t N, uint32_t K)
|
||||
: N(N), K(K) {}
|
||||
@@ -685,6 +699,7 @@ struct vk_device_struct {
|
||||
|
||||
bool add_rms_fusion;
|
||||
uint32_t partials_binding_alignment;
|
||||
uint32_t max_nodes_per_submit;
|
||||
|
||||
bool shader_64b_indexing;
|
||||
|
||||
@@ -777,6 +792,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
|
||||
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_get_rows_back_f32;
|
||||
vk_pipeline pipeline_acc_f32;
|
||||
vk_pipeline pipeline_set_f32;
|
||||
|
||||
@@ -801,14 +817,10 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_concat_i8, pipeline_concat_i16, pipeline_concat_i32, pipeline_concat_i64;
|
||||
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bicubic_f32, pipeline_upscale_bilinear_antialias_f32;
|
||||
vk_pipeline pipeline_scale_f32;
|
||||
vk_pipeline pipeline_sqr_f32;
|
||||
vk_pipeline pipeline_sqrt_f32;
|
||||
vk_pipeline pipeline_sin_f32;
|
||||
vk_pipeline pipeline_cos_f32;
|
||||
vk_pipeline pipeline_log[2];
|
||||
vk_pipeline pipeline_tri[2];
|
||||
vk_pipeline pipeline_diag[2];
|
||||
vk_pipeline pipeline_clamp_f32;
|
||||
vk_pipeline pipeline_clamp[2];
|
||||
vk_pipeline pipeline_pad_f32;
|
||||
vk_pipeline pipeline_roll_f32;
|
||||
vk_pipeline pipeline_repeat_i32, pipeline_repeat_back_f32;
|
||||
@@ -840,6 +852,10 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_gelu_quick[2];
|
||||
vk_pipeline pipeline_silu[2];
|
||||
vk_pipeline pipeline_relu[2];
|
||||
vk_pipeline pipeline_sqr[2];
|
||||
vk_pipeline pipeline_sqrt[2];
|
||||
vk_pipeline pipeline_sin[2];
|
||||
vk_pipeline pipeline_cos[2];
|
||||
vk_pipeline pipeline_xielu[2];
|
||||
vk_pipeline pipeline_neg[2];
|
||||
vk_pipeline pipeline_tanh[2];
|
||||
@@ -871,7 +887,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_geglu_erf[2];
|
||||
vk_pipeline pipeline_geglu_quick[2];
|
||||
|
||||
vk_pipeline pipeline_leaky_relu_f32;
|
||||
vk_pipeline pipeline_leaky_relu[2];
|
||||
vk_pipeline pipeline_silu_back_f32;
|
||||
vk_pipeline pipeline_diag_mask_inf_f32;
|
||||
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
|
||||
@@ -924,6 +940,8 @@ struct vk_device_struct {
|
||||
std::map<vk_conv2d_pipeline_state, vk_pipeline> pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
|
||||
std::map<vk_conv2d_pipeline_state, vk_pipeline> pipeline_conv_transpose_2d_f32[CONV_SHAPE_COUNT];
|
||||
std::map<vk_conv2d_pipeline_state, vk_pipeline> pipeline_conv_transpose_2d_f16_f32[CONV_SHAPE_COUNT];
|
||||
std::map<vk_conv3d_pipeline_state, vk_pipeline> pipeline_conv3d_f32[CONV_SHAPE_COUNT];
|
||||
std::map<vk_conv3d_pipeline_state, vk_pipeline> pipeline_conv3d_f16_f32[CONV_SHAPE_COUNT];
|
||||
vk_pipeline pipeline_conv2d_dw_whcn_f32, pipeline_conv2d_dw_whcn_f16_f32;
|
||||
vk_pipeline pipeline_conv2d_dw_cwhn_f32, pipeline_conv2d_dw_cwhn_f16_f32;
|
||||
|
||||
@@ -1669,6 +1687,41 @@ template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
|
||||
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
|
||||
}
|
||||
|
||||
struct vk_op_conv3d_push_constants {
|
||||
uint32_t OC;
|
||||
uint32_t IC;
|
||||
uint32_t N;
|
||||
|
||||
uint32_t IW;
|
||||
uint32_t IH;
|
||||
uint32_t ID;
|
||||
uint32_t OW;
|
||||
uint32_t OH;
|
||||
uint32_t OD;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
|
||||
uint32_t nb1;
|
||||
uint32_t nb2;
|
||||
uint32_t nb3;
|
||||
|
||||
uint32_t OWmp; uint32_t OWL;
|
||||
uint32_t OWOHmp; uint32_t OWOHL;
|
||||
uint32_t OWOHODmp; uint32_t OWOHODL;
|
||||
};
|
||||
|
||||
template <> void init_pushconst_fastdiv(vk_op_conv3d_push_constants &p) {
|
||||
init_fastdiv_values(p.OW, p.OWmp, p.OWL);
|
||||
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
|
||||
init_fastdiv_values(p.OW*p.OH*p.OD, p.OWOHODmp, p.OWOHODL);
|
||||
}
|
||||
|
||||
struct vk_op_conv2d_dw_push_constants {
|
||||
uint32_t ne;
|
||||
uint32_t batches;
|
||||
@@ -4074,19 +4127,35 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
}
|
||||
#endif
|
||||
|
||||
auto const &ggml_vk_mul_mm_spec = [](std::vector<uint32_t> spec, bool aligned) {
|
||||
spec.push_back(aligned ? 1u : 0u);
|
||||
return spec;
|
||||
};
|
||||
|
||||
const int mul_mat_id_param_count = 5;
|
||||
|
||||
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
if (device->coopmat2) {
|
||||
auto const &ggml_vk_mul_mm_cm2_spec = [](std::vector<uint32_t> spec, bool aligned, bool mul_mat_id) {
|
||||
if (mul_mat_id && spec.size() > 5) {
|
||||
spec.insert(spec.begin() + 5, aligned ? 1u : 0u);
|
||||
} else {
|
||||
spec.push_back(aligned ? 1u : 0u);
|
||||
}
|
||||
if (mul_mat_id && spec.size() == 6) {
|
||||
spec.push_back(32);
|
||||
}
|
||||
return spec;
|
||||
};
|
||||
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(l_ ## WARPTILE, false, PARAMCOUNT == mul_mat_id_param_count), 1, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(m_ ## WARPTILE, false, PARAMCOUNT == mul_mat_id_param_count), 1, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(s_ ## WARPTILE, false, PARAMCOUNT == mul_mat_id_param_count), 1, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(l_ ## WARPTILE, true, PARAMCOUNT == mul_mat_id_param_count), l_align, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(m_ ## WARPTILE, true, PARAMCOUNT == mul_mat_id_param_count), m_align, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(s_ ## WARPTILE, true, PARAMCOUNT == mul_mat_id_param_count), s_align, true); \
|
||||
|
||||
// Create 2 variants, {f16,f32} accumulator
|
||||
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
|
||||
@@ -4161,17 +4230,17 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, false), 1, false, true); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, false), 1, false, true); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, false), 1, false, true); \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, true), l_align, false, true); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, true), m_align, false, true); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, true), s_align, false, true); \
|
||||
|
||||
// Create 2 variants, {f16,f32} accumulator
|
||||
#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||
@@ -4284,32 +4353,32 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
// Selects dot2 SPIR-V variant at runtime when device->dot2_f16 is true
|
||||
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, true), l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, true), m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, true), s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
|
||||
// bf16 scalar path promotes to f32, no dot2 variant
|
||||
#define CREATE_MM_NODOT2(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, true), l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, true), m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, true), s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
|
||||
#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \
|
||||
if (device->mul_mat ## ID ## _l_int[TYPE]) { \
|
||||
@@ -4474,17 +4543,17 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _l[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, true), l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _m[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, true), m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
if (device->mul_mat ## ID ## _s[TYPE]) \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, true), s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \
|
||||
|
||||
#define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||
if (device->mul_mat ## ID ## _l_int[TYPE]) \
|
||||
@@ -4879,6 +4948,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_NVFP4], "get_rows_nvfp4_f32", get_rows_nvfp4_f32_len, get_rows_nvfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_back_f32, "get_rows_back_f32", get_rows_back_f32_len, get_rows_back_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {256, 1, 1}, {}, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||
@@ -4903,7 +4973,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
}
|
||||
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_nc_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true);
|
||||
@@ -5023,11 +5093,6 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sqrt_f32, "sqrt_f32", sqrt_f32_len, sqrt_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_log[0], "log_f32", log_f32_len, log_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_log[1], "log_f16", log_f16_len, log_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -5037,8 +5102,6 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -5058,6 +5121,12 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
CREATE_UNARY(gelu_quick)
|
||||
CREATE_UNARY(silu)
|
||||
CREATE_UNARY(relu)
|
||||
CREATE_UNARY(sqr)
|
||||
CREATE_UNARY(sqrt)
|
||||
CREATE_UNARY(sin)
|
||||
CREATE_UNARY(cos)
|
||||
CREATE_UNARY(clamp)
|
||||
CREATE_UNARY(leaky_relu)
|
||||
CREATE_UNARY(xielu)
|
||||
CREATE_UNARY(neg)
|
||||
CREATE_UNARY(tanh)
|
||||
@@ -5097,7 +5166,6 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
CREATE_GLU(geglu_quick)
|
||||
#undef CREATE_GLU
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true);
|
||||
@@ -5314,7 +5382,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_sgd_f32, "opt_step_sgd_f32", opt_step_sgd_f32_len, opt_step_sgd_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
// conv2d, conv_transpose_2d
|
||||
// conv2d, conv_transpose_2d, conv3d
|
||||
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
|
||||
// smaller WG for the small-tile fallback gives more concurrent WGs per SM
|
||||
uint32_t conv2d_WG_SIZE = (s == CONV_SHAPE_64x32) ? 128 : 256;
|
||||
@@ -5377,8 +5445,8 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size;
|
||||
};
|
||||
|
||||
// coopmat1 needs to store the output through shared memory, so check up front
|
||||
// whether it'll fit and disable it before applying coopmat1 parameters.
|
||||
// 2D, transpose-2D, and 3D conv use the same KxCRS @ CRSxNPQ shmem
|
||||
// layout. cm1 needs Csh for output, so check before applying cm1 params.
|
||||
if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) {
|
||||
conv2d_use_cm1 = false;
|
||||
}
|
||||
@@ -5470,6 +5538,53 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
}
|
||||
#undef CREATE_CONV
|
||||
#undef CREATE_CONVS
|
||||
|
||||
std::vector<uint32_t> conv3d_spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, conv2d_SHMEM_PAD };
|
||||
#define CREATE_CONV3D(type_suffix, spv_suffix) \
|
||||
for (auto &c : device->pipeline_conv3d##type_suffix[s]) { \
|
||||
const vk_conv3d_pipeline_state &state = c.first; \
|
||||
std::vector<uint32_t> spec_constants_cpy = conv3d_spec_constants; \
|
||||
spec_constants_cpy.push_back(state.s0); \
|
||||
spec_constants_cpy.push_back(state.s1); \
|
||||
spec_constants_cpy.push_back(state.s2); \
|
||||
spec_constants_cpy.push_back(state.p0); \
|
||||
spec_constants_cpy.push_back(state.p1); \
|
||||
spec_constants_cpy.push_back(state.p2); \
|
||||
spec_constants_cpy.push_back(state.d0); \
|
||||
spec_constants_cpy.push_back(state.d1); \
|
||||
spec_constants_cpy.push_back(state.d2); \
|
||||
spec_constants_cpy.push_back(state.KW); \
|
||||
spec_constants_cpy.push_back(state.KH); \
|
||||
spec_constants_cpy.push_back(state.KD); \
|
||||
spec_constants_cpy.push_back(state.aligned); \
|
||||
spec_constants_cpy.push_back(conv2d_csh_store); \
|
||||
spec_constants_cpy.push_back(conv2d_WM); \
|
||||
spec_constants_cpy.push_back(conv2d_WN); \
|
||||
ggml_vk_create_pipeline( \
|
||||
device, c.second, "conv3d" #type_suffix, \
|
||||
conv3d##type_suffix##spv_suffix##_len, conv3d##type_suffix##spv_suffix##_data, "main", 3, \
|
||||
sizeof(vk_op_conv3d_push_constants), wg_denoms, spec_constants_cpy, 1, true, conv2d_required_subgroup_size != 0, conv2d_required_subgroup_size); \
|
||||
}
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
if (device->coopmat2) {
|
||||
CREATE_CONV3D(_f32, _cm2)
|
||||
CREATE_CONV3D(_f16_f32, _cm2)
|
||||
} else
|
||||
#endif
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (conv2d_use_cm1) {
|
||||
CREATE_CONV3D(_f32, _cm1)
|
||||
CREATE_CONV3D(_f16_f32, _cm1)
|
||||
} else
|
||||
#endif
|
||||
if (conv2d_UNROLL) {
|
||||
CREATE_CONV3D(_f32, _unroll)
|
||||
CREATE_CONV3D(_f16_f32, _unroll)
|
||||
} else {
|
||||
CREATE_CONV3D(_f32, )
|
||||
CREATE_CONV3D(_f16_f32, )
|
||||
}
|
||||
#undef CREATE_CONV3D
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -5764,6 +5879,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device->subgroup_vote = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
|
||||
(vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eVote);
|
||||
|
||||
// Submit at least every 100 nodes, in case there are workloads without as much matmul.
|
||||
device->max_nodes_per_submit = 100;
|
||||
const char* GGML_VK_MAX_NODES_PER_SUBMIT = getenv("GGML_VK_MAX_NODES_PER_SUBMIT");
|
||||
if (GGML_VK_MAX_NODES_PER_SUBMIT != nullptr) {
|
||||
uint32_t max_nodes_per_submit = std::stoul(GGML_VK_MAX_NODES_PER_SUBMIT);
|
||||
device->max_nodes_per_submit = std::max(max_nodes_per_submit, 1u);
|
||||
}
|
||||
|
||||
const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;
|
||||
|
||||
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
|
||||
@@ -10294,6 +10417,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_get_rows_f32[src0->type];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_get_rows_back_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_ACC:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_acc_f32;
|
||||
@@ -10400,23 +10528,27 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_SQR:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_sqr_f32;
|
||||
if (src0->type == dst->type &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
||||
return ctx->device->pipeline_sqr[dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_SQRT:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_sqrt_f32;
|
||||
if (src0->type == dst->type &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
||||
return ctx->device->pipeline_sqrt[dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_SIN:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_sin_f32;
|
||||
if (src0->type == dst->type &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
||||
return ctx->device->pipeline_sin[dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_COS:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_cos_f32;
|
||||
if (src0->type == dst->type &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
||||
return ctx->device->pipeline_cos[dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_LOG:
|
||||
@@ -10438,8 +10570,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CLAMP:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_clamp_f32;
|
||||
if (src0->type == dst->type &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
||||
return ctx->device->pipeline_clamp[dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_PAD:
|
||||
@@ -10807,8 +10940,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_leaky_relu_f32;
|
||||
if (src0->type == dst->type &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
||||
return ctx->device->pipeline_leaky_relu[dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONV_2D:
|
||||
@@ -10885,6 +11019,61 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONV_3D:
|
||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
const uint32_t OC = (uint32_t)ggml_get_op_params_i32(dst, 11);
|
||||
const uint32_t IC = (uint32_t)ggml_get_op_params_i32(dst, 9);
|
||||
const uint32_t N = (uint32_t)ggml_get_op_params_i32(dst, 10);
|
||||
const uint32_t NPQ = N * dst->ne[2] * dst->ne[1] * dst->ne[0];
|
||||
const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, OC, NPQ);
|
||||
|
||||
const uint32_t KW = (uint32_t)src0->ne[0];
|
||||
const uint32_t KH = (uint32_t)src0->ne[1];
|
||||
const uint32_t KD = (uint32_t)src0->ne[2];
|
||||
const uint32_t s0 = (uint32_t)ggml_get_op_params_i32(dst, 0);
|
||||
const uint32_t s1 = (uint32_t)ggml_get_op_params_i32(dst, 1);
|
||||
const uint32_t s2 = (uint32_t)ggml_get_op_params_i32(dst, 2);
|
||||
const uint32_t p0 = (uint32_t)ggml_get_op_params_i32(dst, 3);
|
||||
const uint32_t p1 = (uint32_t)ggml_get_op_params_i32(dst, 4);
|
||||
const uint32_t p2 = (uint32_t)ggml_get_op_params_i32(dst, 5);
|
||||
const uint32_t d0 = (uint32_t)ggml_get_op_params_i32(dst, 6);
|
||||
const uint32_t d1 = (uint32_t)ggml_get_op_params_i32(dst, 7);
|
||||
const uint32_t d2 = (uint32_t)ggml_get_op_params_i32(dst, 8);
|
||||
|
||||
const uint32_t CRS = IC * KW * KH * KD;
|
||||
const uint32_t BS_K = vk_conv_block_sizes[shape].K;
|
||||
const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS;
|
||||
const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ;
|
||||
const uint32_t aligned = ((OC % BS_K == 0) &&
|
||||
(CRS % BS_CRS == 0) &&
|
||||
(NPQ % BS_NPQ == 0)) ? 1u : 0u;
|
||||
|
||||
vk_conv3d_pipeline_state conv3d_pipeline_state(s0, s1, s2, p0, p1, p2, d0, d1, d2, KW, KH, KD, aligned);
|
||||
|
||||
std::map<vk_conv3d_pipeline_state, vk_pipeline> *pipelines = nullptr;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
pipelines = &ctx->device->pipeline_conv3d_f32[shape];
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
pipelines = &ctx->device->pipeline_conv3d_f16_f32[shape];
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
vk_pipeline pipeline = nullptr;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
|
||||
auto it = pipelines->find(conv3d_pipeline_state);
|
||||
if (it != pipelines->end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
(*pipelines)[conv3d_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
|
||||
}
|
||||
}
|
||||
|
||||
return pipeline;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_ADD1:
|
||||
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_add1_f16_f16;
|
||||
@@ -11135,6 +11324,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
||||
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
||||
break;
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
elements = { (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], 1 };
|
||||
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
||||
break;
|
||||
case GGML_OP_ARGSORT:
|
||||
GGML_ASSERT(0);
|
||||
break;
|
||||
@@ -11220,6 +11413,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
GGML_ABORT("invalid push constant type for CONV_2D");
|
||||
}
|
||||
break;
|
||||
case GGML_OP_CONV_3D:
|
||||
if constexpr (std::is_same_v<PC, vk_op_conv3d_push_constants>) {
|
||||
const uint32_t NPQ = pc.N * pc.OD * pc.OH * pc.OW;
|
||||
const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, pc.OC, NPQ);
|
||||
const uint32_t NPQ_blocks = CEIL_DIV(NPQ, vk_conv_block_sizes[shape].NPQ);
|
||||
|
||||
elements = { pc.OC, NPQ_blocks, 1 };
|
||||
if (elements[1] > 512) {
|
||||
elements[2] = CEIL_DIV(elements[1], 512);
|
||||
elements[1] = 512;
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("invalid push constant type for CONV_3D");
|
||||
}
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_DIV:
|
||||
@@ -11236,6 +11444,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
case GGML_OP_TRI:
|
||||
case GGML_OP_DIAG:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ROLL:
|
||||
case GGML_OP_REPEAT:
|
||||
@@ -11380,6 +11589,21 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_vk_get_rows_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GET_ROWS_BACK, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2], (uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
0,
|
||||
0.0f, 0.0f, 0,
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
@@ -12087,8 +12311,10 @@ static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
|
||||
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
||||
p.param1 = op_params[0];
|
||||
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f });
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
@@ -13118,6 +13344,51 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_conv_3d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
|
||||
const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
|
||||
vk_op_conv3d_push_constants p{};
|
||||
p.IC = static_cast<uint32_t>(ggml_get_op_params_i32(dst, 9));
|
||||
p.N = static_cast<uint32_t>(ggml_get_op_params_i32(dst, 10));
|
||||
p.OC = static_cast<uint32_t>(ggml_get_op_params_i32(dst, 11));
|
||||
GGML_ASSERT(src0->ne[3] == (int64_t)p.IC * p.OC);
|
||||
GGML_ASSERT(src1->ne[3] == (int64_t)p.IC * p.N);
|
||||
GGML_ASSERT(dst->ne[3] == (int64_t)p.OC * p.N);
|
||||
|
||||
p.IW = static_cast<uint32_t>(ne10);
|
||||
p.IH = static_cast<uint32_t>(ne11);
|
||||
p.ID = static_cast<uint32_t>(ne12);
|
||||
p.OW = static_cast<uint32_t>(ne0);
|
||||
p.OH = static_cast<uint32_t>(ne1);
|
||||
p.OD = static_cast<uint32_t>(ne2);
|
||||
|
||||
// the shader clamps src addresses to p.IC * p.N * p.IW * p.IH * p.ID - 1 in uint32, so the
|
||||
// total input element count must fit in a uint32.
|
||||
GGML_ASSERT((uint64_t)p.IC * p.N * p.IW * p.IH * p.ID <= 0xFFFFFFFFull);
|
||||
|
||||
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
|
||||
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
|
||||
p.nb03 = static_cast<uint32_t>(nb03 / nb00);
|
||||
|
||||
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
|
||||
p.nb12 = static_cast<uint32_t>(nb12 / nb10);
|
||||
p.nb13 = static_cast<uint32_t>(nb13 / nb10);
|
||||
|
||||
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
|
||||
p.nb2 = static_cast<uint32_t>(nb2 / nb0);
|
||||
p.nb3 = static_cast<uint32_t>(nb3 / nb0);
|
||||
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_3D, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
vk_op_conv2d_dw_push_constants p{};
|
||||
p.ne = ggml_nelements(dst);
|
||||
@@ -13144,7 +13415,10 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
|
||||
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
const float * op_params = (const float *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f, 0.0f, 0.0f });
|
||||
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
||||
p.param1 = op_params[0];
|
||||
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, std::move(p));
|
||||
}
|
||||
|
||||
#ifdef GGML_VULKAN_RUN_TESTS
|
||||
@@ -14247,6 +14521,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_GET_ROWS:
|
||||
ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
ggml_vk_get_rows_back(ctx, compute_ctx, src0, src1, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
if (ctx->num_additional_fused_ops) {
|
||||
@@ -14515,6 +14793,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_CONV_3D:
|
||||
ggml_vk_conv_3d(ctx, compute_ctx, src0, src1, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node);
|
||||
@@ -15900,8 +16182,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
|
||||
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
|
||||
// (and scaled down based on model size, so smaller models submit earlier).
|
||||
// Also submit at least every 100 nodes, in case there are workloads without as much matmul.
|
||||
int nodes_per_submit = 100;
|
||||
int submitted_nodes = 0;
|
||||
int submit_count = 0;
|
||||
uint64_t mul_mat_bytes = 0;
|
||||
@@ -16127,7 +16407,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
|
||||
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
|
||||
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
|
||||
bool submit = (submitted_nodes >= nodes_per_submit) ||
|
||||
bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) ||
|
||||
(mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
|
||||
(i + ctx->num_additional_fused_ops >= last_node) ||
|
||||
(almost_ready && !ctx->almost_ready_fence_pending);
|
||||
@@ -16964,6 +17244,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
switch (op->type) {
|
||||
@@ -17060,12 +17342,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
return true;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_L2_NORM:
|
||||
return ggml_is_contiguous_rows(op->src[0]) &&
|
||||
op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
@@ -17084,8 +17365,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
op->type == op->src[0]->type;
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
@@ -17285,6 +17567,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
ggml_is_contiguous(op->src[1]) &&
|
||||
ggml_is_contiguous(op));
|
||||
}
|
||||
case GGML_OP_CONV_3D:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
op->src[1]->type == GGML_TYPE_F32 &&
|
||||
op->type == GGML_TYPE_F32 &&
|
||||
ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op->src[1]) &&
|
||||
ggml_is_contiguous(op);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -18128,6 +18417,20 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
const int32_t d0 = tensor->op_params[4];
|
||||
const int32_t d1 = tensor->op_params[5];
|
||||
tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
|
||||
} else if (tensor->op == GGML_OP_CONV_3D) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t s2 = tensor->op_params[2];
|
||||
const int32_t p0 = tensor->op_params[3];
|
||||
const int32_t p1 = tensor->op_params[4];
|
||||
const int32_t p2 = tensor->op_params[5];
|
||||
const int32_t d0 = tensor->op_params[6];
|
||||
const int32_t d1 = tensor->op_params[7];
|
||||
const int32_t d2 = tensor->op_params[8];
|
||||
const int32_t IC = tensor->op_params[9];
|
||||
const int32_t N = tensor->op_params[10];
|
||||
const int32_t OC = tensor->op_params[11];
|
||||
tensor_clone = ggml_conv_3d_direct(ggml_ctx, src_clone[0], src_clone[1], s0, s1, s2, p0, p1, p2, d0, d1, d2, IC, N, OC);
|
||||
} else if (tensor->op == GGML_OP_CONV_2D_DW) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
||||
}
|
||||
@@ -0,0 +1,431 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#ifdef COOPMAT2
|
||||
#extension GL_NV_cooperative_matrix2 : enable
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#endif
|
||||
|
||||
#ifdef COOPMAT
|
||||
#extension GL_KHR_cooperative_matrix : enable
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#endif
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
|
||||
layout(binding = 0) readonly buffer A {
|
||||
A_TYPE knl_data[];
|
||||
}; // src0 - kernel: [KW, KH, KD, IC*OC]
|
||||
|
||||
layout(binding = 1) readonly buffer B {
|
||||
B_TYPE src_data[];
|
||||
}; // src1 - input: [IW, IH, ID, IC*N] -- channel_first format
|
||||
|
||||
layout(binding = 2) writeonly buffer D {
|
||||
D_TYPE dst_data[];
|
||||
}; // dst - result: [OW, OH, OD, OC*N]
|
||||
|
||||
layout(push_constant) uniform parameter {
|
||||
// I/O channels, batch size
|
||||
uint32_t OC;
|
||||
uint32_t IC;
|
||||
uint32_t N;
|
||||
|
||||
// Tensor spatial sizes: input, output
|
||||
uint32_t IW;
|
||||
uint32_t IH;
|
||||
uint32_t ID;
|
||||
uint32_t OW;
|
||||
uint32_t OH;
|
||||
uint32_t OD;
|
||||
|
||||
// Strides in elements
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
|
||||
uint32_t nb1;
|
||||
uint32_t nb2;
|
||||
uint32_t nb3;
|
||||
|
||||
// fastdiv helper values
|
||||
uint32_t OWmp; uint32_t OWL;
|
||||
uint32_t OWOHmp; uint32_t OWOHL;
|
||||
uint32_t OWOHODmp; uint32_t OWOHODL;
|
||||
}
|
||||
|
||||
p;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
// Blocktile sizes
|
||||
layout(constant_id = 1) const uint BS_K = 128;
|
||||
layout(constant_id = 2) const uint BS_CRS = 16;
|
||||
layout(constant_id = 3) const uint BS_NPQ = 128;
|
||||
// Thread-tile sizes
|
||||
layout(constant_id = 4) const uint TS_K = 8;
|
||||
layout(constant_id = 5) const uint SHMEM_PAD = 4;
|
||||
// Stride, padding, dilation
|
||||
layout(constant_id = 6) const uint s0 = 1;
|
||||
layout(constant_id = 7) const uint s1 = 1;
|
||||
layout(constant_id = 8) const uint s2 = 1;
|
||||
layout(constant_id = 9) const uint p0 = 0;
|
||||
layout(constant_id = 10) const uint p1 = 0;
|
||||
layout(constant_id = 11) const uint p2 = 0;
|
||||
layout(constant_id = 12) const uint d0 = 1;
|
||||
layout(constant_id = 13) const uint d1 = 1;
|
||||
layout(constant_id = 14) const uint d2 = 1;
|
||||
// Kernel spatial sizes
|
||||
layout(constant_id = 15) const uint KW = 1;
|
||||
layout(constant_id = 16) const uint KH = 1;
|
||||
layout(constant_id = 17) const uint KD = 1;
|
||||
// when set, skip bounds checks and address clamps (K/CRS/NPQ are tile-aligned)
|
||||
layout(constant_id = 18) const uint aligned = 0;
|
||||
// stage cm2 result through shmem (Csh) for coalesced stores. cm1 always does this.
|
||||
layout(constant_id = 19) const uint csh_store = 0;
|
||||
|
||||
#ifdef COOPMAT
|
||||
// cm1 subgroup tile: each subgroup computes a WM x WN region as a grid of
|
||||
// TM x TN x TK fragments. Requires WM%TM == WN%TN == BS_K%WM == BS_NPQ%WN ==
|
||||
// BS_CRS%TK == 0, and WG_SIZE == (BS_K/WM) * (BS_NPQ/WN) * subgroup_size.
|
||||
layout(constant_id = 20) const uint WM = 32;
|
||||
layout(constant_id = 21) const uint WN = 32;
|
||||
const uint TM = 16;
|
||||
const uint TN = 16;
|
||||
const uint TK = 16;
|
||||
const uint cms_per_row = WM / TM;
|
||||
const uint cms_per_col = WN / TN;
|
||||
const uint warps_M = BS_K / WM;
|
||||
const uint warps_N = BS_NPQ / WN;
|
||||
#endif
|
||||
|
||||
// without padding, ID_idx/IH_idx/IW_idx are in bounds by construction
|
||||
const bool dhw_in_bounds = (p0 == 0) && (p1 == 0) && (p2 == 0);
|
||||
|
||||
uint32_t tid = gl_LocalInvocationID.x;
|
||||
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
|
||||
|
||||
uint splitWork(uint work_size, uint block_size) {
|
||||
return (block_size + work_size - 1) / block_size;
|
||||
}
|
||||
|
||||
uint32_t K = p.OC;
|
||||
uint32_t CRS = p.IC * KD * KH * KW;
|
||||
uint32_t NPQ = p.N * p.OD * p.OH * p.OW;
|
||||
|
||||
// Number of blocktiles per input
|
||||
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
|
||||
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
#define SHMEM_TYPE float16_t
|
||||
#else
|
||||
#define SHMEM_TYPE float
|
||||
#endif
|
||||
|
||||
const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
|
||||
const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
|
||||
|
||||
const uint32_t Ash_len = BS_K * Ash_stride;
|
||||
const uint32_t Bsh_len = BS_CRS * Bsh_stride;
|
||||
|
||||
shared SHMEM_TYPE Ash[Ash_len]; // K x CRS
|
||||
shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ
|
||||
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
// stage matC through shmem so global stores are row-major (NPQ-contiguous)
|
||||
const uint32_t Csh_stride = BS_NPQ;
|
||||
#ifdef COOPMAT
|
||||
const uint32_t Csh_len = BS_K * Csh_stride;
|
||||
#else
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1;
|
||||
#endif
|
||||
shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ
|
||||
#endif
|
||||
|
||||
// Threadtile sizes
|
||||
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
||||
|
||||
// Number of threadtiles per blocktile
|
||||
const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
|
||||
|
||||
/*
|
||||
Compute
|
||||
KxCRS @ CRSxNPQ = K x NPQ
|
||||
K=OC
|
||||
C=IC
|
||||
D,R,S=KD,KH,KW
|
||||
Z,P,Q=OD,OH,OW
|
||||
*/
|
||||
|
||||
uint32_t B_idx_K = gl_WorkGroupID.x;
|
||||
uint32_t B_idx_NPQ = gl_WorkGroupID.y + gl_WorkGroupID.z * 512;
|
||||
|
||||
uint32_t T_y = tid / NT_NPQ;
|
||||
uint32_t T_x = tid % NT_NPQ;
|
||||
|
||||
uint32_t Ar = tid / BS_CRS;
|
||||
uint32_t Ac = tid % BS_CRS;
|
||||
const uint32_t ArpWg = WG_SIZE / BS_CRS;
|
||||
|
||||
uint32_t Br = tid / BS_NPQ;
|
||||
uint32_t Bc = tid % BS_NPQ;
|
||||
const uint32_t BrpWg = WG_SIZE / BS_NPQ;
|
||||
|
||||
// see init_fastdiv_values in ggml-vulkan.cpp
|
||||
uint fastdiv(uint n, uint mp, uint L) {
|
||||
uint msbs, lsbs;
|
||||
// msbs = mulhi(n, mp)
|
||||
umulExtended(n, mp, msbs, lsbs);
|
||||
return (msbs + n) >> L;
|
||||
}
|
||||
|
||||
void split_crs(uint32_t crs_idx, out uint32_t ic, out uint32_t kd, out uint32_t kh, out uint32_t kw) {
|
||||
const uint32_t KHKW = KH * KW;
|
||||
const uint32_t KDKHKW = KD * KHKW;
|
||||
ic = crs_idx / KDKHKW;
|
||||
uint32_t rem = crs_idx - ic * KDKHKW;
|
||||
kd = rem / KHKW;
|
||||
rem = rem - kd * KHKW;
|
||||
kh = rem / KW;
|
||||
kw = rem - kh * KW;
|
||||
}
|
||||
|
||||
void split_npq(uint32_t npq_idx, out uint32_t n, out uint32_t od, out uint32_t oh, out uint32_t ow) {
|
||||
const uint32_t OWOH = p.OW * p.OH;
|
||||
n = fastdiv(npq_idx, p.OWOHODmp, p.OWOHODL);
|
||||
uint32_t rem = npq_idx - n * p.OD * OWOH;
|
||||
od = fastdiv(rem, p.OWOHmp, p.OWOHL);
|
||||
rem = rem - od * OWOH;
|
||||
oh = fastdiv(rem, p.OWmp, p.OWL);
|
||||
ow = rem - oh * p.OW;
|
||||
}
|
||||
|
||||
#ifdef COOPMAT2
|
||||
#define ACC_TYPE float16_t
|
||||
|
||||
ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem)
|
||||
{
|
||||
uint32_t K_idx = B_idx_K * BS_K + r;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + c;
|
||||
uint32_t N_idx;
|
||||
uint32_t OD_idx;
|
||||
uint32_t OH_idx;
|
||||
uint32_t OW_idx;
|
||||
split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx);
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3;
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = D_TYPE(elem);
|
||||
}
|
||||
return elem;
|
||||
}
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
if (B_idx_NPQ * BS_NPQ >= NPQ) {
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef COOPMAT2
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
|
||||
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
|
||||
#elif defined(COOPMAT)
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
|
||||
[[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
|
||||
sums[i] = coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0);
|
||||
}
|
||||
const uint warp_r = gl_SubgroupID / warps_N;
|
||||
const uint warp_c = gl_SubgroupID % warps_N;
|
||||
#else
|
||||
float regC[TS_K][TS_NPQ];
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regC[T_ly][T_lx] = 0.0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* Advance block in CRS dim */
|
||||
[[dont_unroll]] for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
|
||||
uint32_t CRS_idx_a = B_idx_CRS * BS_CRS + Ac;
|
||||
uint32_t IC_idx_a;
|
||||
uint32_t KD_idx_a;
|
||||
uint32_t KH_idx_a;
|
||||
uint32_t KW_idx_a;
|
||||
split_crs(CRS_idx_a, IC_idx_a, KD_idx_a, KH_idx_a, KW_idx_a);
|
||||
|
||||
/* Load kernel to A_block: (BS_K x BS_CRS)*/
|
||||
UNROLL for (uint32_t r_offset = 0; r_offset < BS_K; r_offset += ArpWg) {
|
||||
uint32_t B_ly = r_offset + Ar;
|
||||
uint32_t B_lx = Ac;
|
||||
uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/
|
||||
uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + KD_idx_a * p.nb02 + (K_idx * p.IC + IC_idx_a) * p.nb03;
|
||||
if (aligned == 0) {
|
||||
knl_idx = min(knl_idx, K * CRS - 1);
|
||||
}
|
||||
float val = knl_data[knl_idx];
|
||||
if (aligned == 0 && (K_idx >= K || CRS_idx_a >= CRS)) {
|
||||
val = 0.0;
|
||||
}
|
||||
Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
|
||||
}
|
||||
/* Load input to B_block: (BS_CRS x BS_NPQ) */
|
||||
UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
|
||||
uint32_t B_ly = r_offset + Br; /* Row index of B block */
|
||||
uint32_t B_lx = Bc;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
|
||||
uint32_t N_idx;
|
||||
uint32_t OD_idx;
|
||||
uint32_t OH_idx;
|
||||
uint32_t OW_idx;
|
||||
split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx);
|
||||
|
||||
uint32_t CRS_idx_b = B_idx_CRS * BS_CRS + B_ly;
|
||||
uint32_t IC_idx_b;
|
||||
uint32_t KD_idx_b;
|
||||
uint32_t KH_idx_b;
|
||||
uint32_t KW_idx_b;
|
||||
split_crs(CRS_idx_b, IC_idx_b, KD_idx_b, KH_idx_b, KW_idx_b);
|
||||
|
||||
uint32_t ID_idx = OD_idx * s2 + KD_idx_b * d2 - p2;
|
||||
uint32_t IH_idx = OH_idx * s1 + KH_idx_b * d1 - p1;
|
||||
uint32_t IW_idx = OW_idx * s0 + KW_idx_b * d0 - p0;
|
||||
|
||||
uint32_t src_idx = IW_idx + IH_idx * p.nb11 + ID_idx * p.nb12 + (N_idx * p.IC + IC_idx_b) * p.nb13;
|
||||
// skip clamp when address can't go OOB
|
||||
if (aligned == 0 || !dhw_in_bounds) {
|
||||
src_idx = min(src_idx, p.IC * p.N * p.IW * p.IH * p.ID - 1);
|
||||
}
|
||||
float val = src_data[src_idx];
|
||||
bool oob = false;
|
||||
if (aligned == 0 && (CRS_idx_b >= CRS || NPQ_idx >= NPQ)) {
|
||||
oob = true;
|
||||
}
|
||||
// also catches lower-bound underflow (idx wraps to 0x80000000+)
|
||||
if (!dhw_in_bounds && (ID_idx >= p.ID || IH_idx >= p.IH || IW_idx >= p.IW)) {
|
||||
oob = true;
|
||||
}
|
||||
if (oob) {
|
||||
val = 0.0;
|
||||
}
|
||||
Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
|
||||
}
|
||||
barrier();
|
||||
#ifdef COOPMAT2
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, BS_K, BS_CRS, gl_MatrixUseA> matA;
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, BS_CRS, BS_NPQ, gl_MatrixUseB> matB;
|
||||
|
||||
coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
matC = coopMatMulAdd(matA, matB, matC);
|
||||
#elif defined(COOPMAT)
|
||||
// each subgroup multiplies its grid of fragments per TK-sized CRS chunk
|
||||
[[unroll]] for (uint k_step = 0; k_step < BS_CRS / TK; k_step++) {
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a[cms_per_row];
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
const uint a_off = (warp_r * WM + cm_row * TM) * Ash_stride + k_step * TK;
|
||||
coopMatLoad(cache_a[cm_row], Ash, a_off, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
}
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
|
||||
const uint b_off = k_step * TK * Bsh_stride + warp_c * WN + cm_col * TN;
|
||||
coopMatLoad(cache_b, Bsh, b_off, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a[cm_row], cache_b, sums[cm_col * cms_per_row + cm_row]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (T_y * TS_K < K) {
|
||||
UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
|
||||
float regA[TS_K];
|
||||
float regB[TS_NPQ];
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
|
||||
}
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
|
||||
}
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
barrier();
|
||||
}
|
||||
/* Save C* */
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
// stage matC into Csh, then write to dst with coalesced NPQ-contiguous stores
|
||||
#ifdef COOPMAT
|
||||
const bool use_staged_store = true;
|
||||
#else
|
||||
const bool use_staged_store = (csh_store != 0);
|
||||
#endif
|
||||
if (use_staged_store) {
|
||||
#ifdef COOPMAT
|
||||
// cm1: each subgroup stores its fragment grid into its Csh slot
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
const uint csh_off = (warp_r * WM + cm_row * TM) * Csh_stride + warp_c * WN + cm_col * TN;
|
||||
coopMatStore(sums[cm_col * cms_per_row + cm_row], Csh, csh_off, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
}
|
||||
}
|
||||
#else
|
||||
coopMatStore(matC, Csh, 0, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
#endif
|
||||
barrier();
|
||||
|
||||
// cooperative shmem->global: WG threads spread across BS_NPQ (the
|
||||
// contiguous direction of dst), each iter covers store_rows_per_iter K-rows
|
||||
const uint32_t store_rows_per_iter = WG_SIZE / BS_NPQ;
|
||||
const uint32_t store_iters = BS_K / store_rows_per_iter;
|
||||
const uint32_t k_thread_offset = tid / BS_NPQ;
|
||||
const uint32_t npq_thread = tid % BS_NPQ;
|
||||
[[unroll]] for (uint32_t i = 0; i < store_iters; i++) {
|
||||
uint32_t k_local = i * store_rows_per_iter + k_thread_offset;
|
||||
uint32_t K_idx = B_idx_K * BS_K + k_local;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + npq_thread;
|
||||
uint32_t N_idx;
|
||||
uint32_t OD_idx;
|
||||
uint32_t OH_idx;
|
||||
uint32_t OW_idx;
|
||||
split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx);
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3;
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = D_TYPE(Csh[k_local * Csh_stride + npq_thread]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef COOPMAT2
|
||||
else {
|
||||
coopMatPerElementNV(matC, matC, perElemOpStore);
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
if (T_y * TS_K < K) {
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
|
||||
uint32_t N_idx;
|
||||
uint32_t OD_idx;
|
||||
uint32_t OH_idx;
|
||||
uint32_t OW_idx;
|
||||
split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx);
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3;
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = D_TYPE(regC[T_ly][T_lx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
|
||||
}
|
||||
@@ -463,6 +463,7 @@ void main() {
|
||||
}
|
||||
rowmaxf = max(rowmaxf, float(Sf[r][c]));
|
||||
}
|
||||
rowmaxf += FATTN_KQ_MAX_OFFSET;
|
||||
float Moldf = Mf[r];
|
||||
|
||||
// M = max(rowmax, Mold)
|
||||
|
||||
@@ -352,6 +352,7 @@ void main() {
|
||||
}
|
||||
rowmaxf = max(rowmaxf, float(sfsh[r_vec + (c * cols_per_iter + col_tid) * sfshstride][r_comp]));
|
||||
}
|
||||
rowmaxf += FATTN_KQ_MAX_OFFSET;
|
||||
float Moldf = Mf[r];
|
||||
|
||||
// Compute max across the row
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
#include "generic_binary_head.glsl"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint col = gl_GlobalInvocationID.x;
|
||||
|
||||
if (col >= p.ne20) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint row = gl_GlobalInvocationID.y; row < p.ne21; row += gl_WorkGroupSize.y * gl_NumWorkGroups.y) {
|
||||
float sum = 0.0f;
|
||||
for (uint i = 0; i < p.ne10; ++i) {
|
||||
if (data_b[get_boffset() + i*p.nb10] == int(row)) {
|
||||
sum += data_a[get_aoffset() + i*p.nb01 + col*p.nb00];
|
||||
}
|
||||
}
|
||||
|
||||
data_d[get_doffset() + row*p.nb21 + col*p.nb20] = sum;
|
||||
}
|
||||
}
|
||||
@@ -14,16 +14,13 @@ void main() {
|
||||
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
const uint i3 = row / (p.ne11 * p.ne12);
|
||||
const uint i3_offset = i3 * p.ne12 * p.ne11;
|
||||
const uint i2 = (row - i3_offset) / p.ne11;
|
||||
const uint i2_offset = i2 * p.ne11;
|
||||
const uint i1 = row - i3_offset - i2_offset;
|
||||
const uint a_base = get_aoffset() + src0_idx(row * p.ne00);
|
||||
const uint d_base = get_doffset() + dst_idx(row * p.ne10);
|
||||
|
||||
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
|
||||
|
||||
[[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
|
||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0]);
|
||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[a_base + i0*p.nb00]);
|
||||
sum[tid] += xi * xi;
|
||||
}
|
||||
|
||||
@@ -39,6 +36,6 @@ void main() {
|
||||
const FLOAT_TYPE scale = 1.0f / max(sqrt(sum[0]), FLOAT_TYPE(p.param1));
|
||||
|
||||
[[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
|
||||
data_d[i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0] = D_TYPE(scale * FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0]));
|
||||
data_d[d_base + i0*p.nb10] = D_TYPE(scale * FLOAT_TYPE(data_a[a_base + i0*p.nb00]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float val = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
|
||||
}
|
||||
@@ -38,17 +38,7 @@
|
||||
#define LOAD_VEC_B 1
|
||||
#endif
|
||||
|
||||
// Load 2 values at once without affecting index calculations through LOAD_VEC
|
||||
#if (defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)) && !defined(ALIGNED)
|
||||
#define LOAD_VEC_BATCH_A 2
|
||||
#else
|
||||
#define LOAD_VEC_BATCH_A 1
|
||||
#endif
|
||||
#if !defined(ALIGNED)
|
||||
#define LOAD_VEC_BATCH_B 2
|
||||
#else
|
||||
#define LOAD_VEC_BATCH_B 1
|
||||
#endif
|
||||
layout (constant_id = 11) const uint ALIGNED = 0;
|
||||
|
||||
#if !defined(TO_FLOAT_TYPE)
|
||||
#define TO_FLOAT_TYPE FLOAT_TYPE
|
||||
@@ -57,6 +47,13 @@
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
#if defined(DATA_A_F32)
|
||||
layout (binding = 0) readonly buffer A_SCALAR {float data_a_scalar[];};
|
||||
#elif defined(DATA_A_F16)
|
||||
layout (binding = 0) readonly buffer A_SCALAR {float16_t data_a_scalar[];};
|
||||
#elif defined(DATA_A_BF16)
|
||||
layout (binding = 0) readonly buffer A_SCALAR {uint16_t data_a_scalar[];};
|
||||
#endif
|
||||
#if defined(A_TYPE_PACKED16)
|
||||
layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
|
||||
#endif
|
||||
@@ -65,6 +62,7 @@ layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32
|
||||
#endif
|
||||
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 1) readonly buffer B_SCALAR {B_TYPE_SCALAR data_b_scalar[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
@@ -194,13 +192,23 @@ void main() {
|
||||
const uint warp_r = warp_i % (BM / WM);
|
||||
const uint warp_c = warp_i / (BM / WM);
|
||||
|
||||
const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A);
|
||||
const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A);
|
||||
const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B);
|
||||
const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B);
|
||||
#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)
|
||||
const uint LOAD_VEC_A_EFF = (ALIGNED != 0) ? LOAD_VEC_A : 1;
|
||||
const uint LOAD_VEC_BATCH_A = (ALIGNED != 0) ? 1 : 2;
|
||||
#else
|
||||
const uint LOAD_VEC_A_EFF = LOAD_VEC_A;
|
||||
const uint LOAD_VEC_BATCH_A = 1;
|
||||
#endif
|
||||
const uint LOAD_VEC_B_EFF = (ALIGNED != 0) ? LOAD_VEC_B : 1;
|
||||
const uint LOAD_VEC_BATCH_B = (ALIGNED != 0) ? 1 : 2;
|
||||
|
||||
const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A * LOAD_VEC_BATCH_A / BK;
|
||||
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B * LOAD_VEC_BATCH_B / BK;
|
||||
const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A_EFF / LOAD_VEC_BATCH_A);
|
||||
const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A_EFF / LOAD_VEC_BATCH_A);
|
||||
const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B_EFF / LOAD_VEC_BATCH_B);
|
||||
const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B_EFF / LOAD_VEC_BATCH_B);
|
||||
|
||||
const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A_EFF * LOAD_VEC_BATCH_A / BK;
|
||||
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B_EFF * LOAD_VEC_BATCH_B / BK;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
#ifdef MUL_MAT_ID_USE_SUBGROUPS
|
||||
@@ -239,15 +247,15 @@ void main() {
|
||||
|
||||
uint pos_a =
|
||||
#ifdef MUL_MAT_ID
|
||||
expert_idx * (p.batch_stride_a / LOAD_VEC_A) +
|
||||
expert_idx * (p.batch_stride_a / LOAD_VEC_A_EFF) +
|
||||
#else
|
||||
batch_idx_a * (p.batch_stride_a / LOAD_VEC_A) +
|
||||
batch_idx_a * (p.batch_stride_a / LOAD_VEC_A_EFF) +
|
||||
#endif
|
||||
(ir * BM * p.stride_a + start_k) / LOAD_VEC_A;
|
||||
(ir * BM * p.stride_a + start_k) / LOAD_VEC_A_EFF;
|
||||
#ifdef MUL_MAT_ID
|
||||
uint pos_b = 0;
|
||||
#else
|
||||
uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B;
|
||||
uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B_EFF;
|
||||
#endif
|
||||
|
||||
#ifdef COOPMAT
|
||||
@@ -287,8 +295,8 @@ void main() {
|
||||
|
||||
barrier();
|
||||
|
||||
pos_a += BK / LOAD_VEC_A;
|
||||
pos_b += BK / LOAD_VEC_B;
|
||||
pos_a += BK / LOAD_VEC_A_EFF;
|
||||
pos_b += BK / LOAD_VEC_B_EFF;
|
||||
|
||||
#ifdef COOPMAT
|
||||
[[unroll]] for (uint i = 0; i < BK; i += TK) {
|
||||
|
||||
@@ -36,6 +36,7 @@ layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working wit
|
||||
layout (constant_id = 4) const bool enable_smaller_matrices = false;
|
||||
const uint BNover2 = enable_smaller_matrices ? (BN / 2) : BN;
|
||||
const uint BNover4 = enable_smaller_matrices ? (BN / 4) : BN;
|
||||
layout (constant_id = 5) const uint ALIGNED = 0;
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
@@ -111,7 +112,7 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
|
||||
};
|
||||
|
||||
uint _ne1;
|
||||
layout (constant_id = 5) const uint subgroup_size = 32;
|
||||
layout (constant_id = 6) const uint subgroup_size = 32;
|
||||
shared uvec4 ballots_sh[BLOCK_SIZE / subgroup_size];
|
||||
|
||||
B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
@@ -297,12 +298,12 @@ void main() {
|
||||
|
||||
// Hint to the compiler that values are aligned (want 16B alignment).
|
||||
// Quants are always block-aligned, no alignment needed.
|
||||
#if ALIGNED
|
||||
if (ALIGNED != 0) {
|
||||
#if QUANT_K == 1
|
||||
stride_a &= ~7;
|
||||
#endif
|
||||
stride_b &= ~7;
|
||||
stride_a &= ~7;
|
||||
#endif
|
||||
stride_b &= ~7;
|
||||
}
|
||||
|
||||
// Create layouts for both clamped and unclamped accesses
|
||||
tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2);
|
||||
|
||||
@@ -1,50 +1,57 @@
|
||||
void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uint idx_m, const uint block, const uint end_k) {
|
||||
#if defined(DATA_A_F32) || defined(DATA_A_F16)
|
||||
#if LOAD_VEC_A == 8
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
FLOAT_TYPEV8 aa = FLOAT_TYPEV8(data_a[idx]);
|
||||
buf_a[buf_idx ] = aa[0].xy;
|
||||
buf_a[buf_idx + 1] = aa[0].zw;
|
||||
buf_a[buf_idx + 2] = aa[1].xy;
|
||||
buf_a[buf_idx + 3] = aa[1].zw;
|
||||
if (ALIGNED != 0) {
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
FLOAT_TYPEV8 aa = FLOAT_TYPEV8(data_a[idx]);
|
||||
buf_a[buf_idx ] = aa[0].xy;
|
||||
buf_a[buf_idx + 1] = aa[0].zw;
|
||||
buf_a[buf_idx + 2] = aa[1].xy;
|
||||
buf_a[buf_idx + 3] = aa[1].zw;
|
||||
return;
|
||||
}
|
||||
#elif LOAD_VEC_A == 4
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(data_a[idx]);
|
||||
buf_a[buf_idx ] = aa.xy;
|
||||
buf_a[buf_idx + 1] = aa.zw;
|
||||
#else // LOAD_VEC_BATCH_A == 2
|
||||
if (ALIGNED != 0) {
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(data_a[idx]);
|
||||
buf_a[buf_idx ] = aa.xy;
|
||||
buf_a[buf_idx + 1] = aa.zw;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
const uint idx = pos_a + col * p.stride_a + row * 2;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row;
|
||||
if (idx_m < p.M && block + row * 2 + 1 < end_k) {
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(data_a[idx],
|
||||
data_a[idx + 1]);
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(data_a_scalar[idx],
|
||||
data_a_scalar[idx + 1]);
|
||||
} else if (idx_m < p.M && block + row * 2 < end_k) {
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(data_a[idx], 0.0f);
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(data_a_scalar[idx], 0.0f);
|
||||
} else {
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(0.0f);
|
||||
}
|
||||
#endif
|
||||
#elif defined(DATA_A_BF16)
|
||||
#if LOAD_VEC_A == 4
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_a[idx]));
|
||||
buf_a[buf_idx ] = aa.xy;
|
||||
buf_a[buf_idx + 1] = aa.zw;
|
||||
#else // LOAD_VEC_BATCH_A == 2
|
||||
if (ALIGNED != 0) {
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_a[idx]));
|
||||
buf_a[buf_idx ] = aa.xy;
|
||||
buf_a[buf_idx + 1] = aa.zw;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
const uint idx = pos_a + col * p.stride_a + row * 2;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row;
|
||||
if (idx_m < p.M && block + row * 2 + 1 < end_k) {
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a[idx]),
|
||||
TO_FLOAT_TYPE(data_a[idx + 1]));
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a_scalar[idx]),
|
||||
TO_FLOAT_TYPE(data_a_scalar[idx + 1]));
|
||||
} else if (idx_m < p.M && block + row * 2 < end_k) {
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a[idx]), 0.0f);
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a_scalar[idx]), 0.0f);
|
||||
} else {
|
||||
buf_a[buf_idx] = FLOAT_TYPEV2(0.0f);
|
||||
}
|
||||
#endif
|
||||
#elif defined(DATA_A_Q4_0)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
@@ -526,75 +533,85 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
#if !defined(MUL_MAT_ID)
|
||||
void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint idx_n, const uint block, const uint end_k) {
|
||||
#if LOAD_VEC_B == 8
|
||||
// Not supported for b_type bf16 because bf16mat2x4 does not exist
|
||||
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
|
||||
FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
|
||||
buf_b[buf_idx + 0] = bb[0].xy;
|
||||
buf_b[buf_idx + 1] = bb[0].zw;
|
||||
buf_b[buf_idx + 2] = bb[1].xy;
|
||||
buf_b[buf_idx + 3] = bb[1].zw;
|
||||
if (ALIGNED != 0) {
|
||||
// Not supported for b_type bf16 because bf16mat2x4 does not exist
|
||||
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
|
||||
FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
|
||||
buf_b[buf_idx + 0] = bb[0].xy;
|
||||
buf_b[buf_idx + 1] = bb[0].zw;
|
||||
buf_b[buf_idx + 2] = bb[1].xy;
|
||||
buf_b[buf_idx + 3] = bb[1].zw;
|
||||
return;
|
||||
}
|
||||
#elif LOAD_VEC_B == 4
|
||||
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
|
||||
if (ALIGNED != 0) {
|
||||
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
|
||||
#if defined(DATA_B_BF16)
|
||||
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
|
||||
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
|
||||
#else
|
||||
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
|
||||
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = bb.xy;
|
||||
buf_b[buf_idx + 1] = bb.zw;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = bb.xy;
|
||||
buf_b[buf_idx + 1] = bb.zw;
|
||||
#else // LOAD_VEC_BATCH_B == 2
|
||||
const uint idx = pos_b + col * p.stride_b + row * 2;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row;
|
||||
if (idx_n < p.N && block + row * 2 + 1 < end_k) {
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]),
|
||||
TO_FLOAT_TYPE(data_b[idx + 1]));
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]),
|
||||
TO_FLOAT_TYPE(data_b_scalar[idx + 1]));
|
||||
} else if (idx_n < p.N && block + row * 2 < end_k) {
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]), 0.0f);
|
||||
} else {
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(0.0f);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint ic, const uint _ne1, const uint block, const uint end_k) {
|
||||
#if LOAD_VEC_B == 8
|
||||
// Not supported for b_type bf16 because bf16mat2x4 does not exist
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
|
||||
FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
|
||||
buf_b[buf_idx + 0] = bb[0].xy;
|
||||
buf_b[buf_idx + 1] = bb[0].zw;
|
||||
buf_b[buf_idx + 2] = bb[1].xy;
|
||||
buf_b[buf_idx + 3] = bb[1].zw;
|
||||
if (ALIGNED != 0) {
|
||||
// Not supported for b_type bf16 because bf16mat2x4 does not exist
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
|
||||
FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
|
||||
buf_b[buf_idx + 0] = bb[0].xy;
|
||||
buf_b[buf_idx + 1] = bb[0].zw;
|
||||
buf_b[buf_idx + 2] = bb[1].xy;
|
||||
buf_b[buf_idx + 3] = bb[1].zw;
|
||||
return;
|
||||
}
|
||||
#elif LOAD_VEC_B == 4
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
|
||||
if (ALIGNED != 0) {
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
|
||||
#if defined(DATA_B_BF16)
|
||||
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
|
||||
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
|
||||
#else
|
||||
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
|
||||
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = bb.xy;
|
||||
buf_b[buf_idx + 1] = bb.zw;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
buf_b[buf_idx + 0] = bb.xy;
|
||||
buf_b[buf_idx + 1] = bb.zw;
|
||||
#else // LOAD_VEC_BATCH_B == 2
|
||||
const uint row_i = ic * BN + col;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row;
|
||||
if (row_i < _ne1 && block + row * 2 + 1 < end_k) {
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]),
|
||||
TO_FLOAT_TYPE(data_b[idx + 1]));
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]),
|
||||
TO_FLOAT_TYPE(data_b_scalar[idx + 1]));
|
||||
} else if (row_i < _ne1 && block + row * 2 < end_k) {
|
||||
const u16vec2 row_idx = row_ids[col];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]), 0.0f);
|
||||
} else {
|
||||
buf_b[buf_idx] = FLOAT_TYPEV2(0.0f);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,26 +1,26 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.glsl"
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#define BLOCK_SIZE 512
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
shared vec2 sum[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
const uint a_base = get_aoffset() + src0_idx(row * p.ne00);
|
||||
const uint d_base = get_doffset() + dst_idx(row * p.ne10);
|
||||
|
||||
sum[tid] = vec2(0.0f, 0.0f);
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
const float xi = float(data_a[row*p.KX + col]);
|
||||
[[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
|
||||
const float xi = float(data_a[a_base + i0*p.nb00]);
|
||||
sum[tid].x += xi;
|
||||
sum[tid].y += xi * xi;
|
||||
}
|
||||
@@ -34,11 +34,11 @@ void main() {
|
||||
barrier();
|
||||
}
|
||||
|
||||
const float mean = sum[0].x / p.KX;
|
||||
const float var = sum[0].y / p.KX - mean * mean;
|
||||
const float mean = sum[0].x / p.ne00;
|
||||
const float var = sum[0].y / p.ne00 - mean * mean;
|
||||
const float inv_std = inversesqrt(var + p.param1);
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std);
|
||||
[[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
|
||||
data_d[d_base + i0*p.nb10] = D_TYPE((float(data_a[a_base + i0*p.nb00]) - mean) * inv_std);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sqrt(val));
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
|
||||
}
|
||||
@@ -17,6 +17,30 @@ float op_neg(float x) {
|
||||
return -x;
|
||||
}
|
||||
|
||||
float op_sqr(float x) {
|
||||
return x * x;
|
||||
}
|
||||
|
||||
float op_sqrt(float x) {
|
||||
return sqrt(x);
|
||||
}
|
||||
|
||||
float op_sin(float x) {
|
||||
return sin(x);
|
||||
}
|
||||
|
||||
float op_cos(float x) {
|
||||
return cos(x);
|
||||
}
|
||||
|
||||
float op_clamp(float x) {
|
||||
return clamp(x, p.param1, p.param2);
|
||||
}
|
||||
|
||||
float op_leaky_relu(float x) {
|
||||
return max(x, 0.0f) + min(x, 0.0f) * p.param1;
|
||||
}
|
||||
|
||||
float op_step(float x) {
|
||||
return x >= 0.0f ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <future>
|
||||
#include <queue>
|
||||
#include <condition_variable>
|
||||
#include <atomic>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
@@ -34,6 +35,9 @@
|
||||
|
||||
std::mutex lock;
|
||||
std::vector<std::pair<std::string, std::string>> shader_fnames;
|
||||
// Set when any shader subprocess fails (non-zero exit / stderr / launch failure) so the
|
||||
// build is stopped instead of silently producing a broken libggml-vulkan. (issue #24393)
|
||||
static std::atomic<bool> compile_failed{false};
|
||||
std::locale c_locale("C");
|
||||
|
||||
std::string GLSLC = "glslc";
|
||||
@@ -78,7 +82,7 @@ enum MatMulIdType {
|
||||
|
||||
namespace {
|
||||
|
||||
void execute_command(std::vector<std::string>& command, std::string& stdout_str, std::string& stderr_str) {
|
||||
int execute_command(std::vector<std::string>& command, std::string& stdout_str, std::string& stderr_str) {
|
||||
#ifdef _WIN32
|
||||
HANDLE stdout_read, stdout_write;
|
||||
HANDLE stderr_read, stderr_write;
|
||||
@@ -127,8 +131,11 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,
|
||||
CloseHandle(stdout_read);
|
||||
CloseHandle(stderr_read);
|
||||
WaitForSingleObject(pi.hProcess, INFINITE);
|
||||
DWORD exit_code = 1;
|
||||
GetExitCodeProcess(pi.hProcess, &exit_code);
|
||||
CloseHandle(pi.hProcess);
|
||||
CloseHandle(pi.hThread);
|
||||
return (int)exit_code;
|
||||
#else
|
||||
int stdout_pipe[2];
|
||||
int stderr_pipe[2];
|
||||
@@ -175,7 +182,9 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,
|
||||
|
||||
close(stdout_pipe[0]);
|
||||
close(stderr_pipe[0]);
|
||||
waitpid(pid, nullptr, 0);
|
||||
int status = 0;
|
||||
waitpid(pid, &status, 0);
|
||||
return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -372,13 +381,14 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
|
||||
// }
|
||||
// std::cout << std::endl;
|
||||
|
||||
execute_command(cmd, stdout_str, stderr_str);
|
||||
if (!stderr_str.empty()) {
|
||||
std::cerr << "cannot compile " << name << "\n\n";
|
||||
int exit_code = execute_command(cmd, stdout_str, stderr_str);
|
||||
if (exit_code != 0 || !stderr_str.empty()) {
|
||||
std::cerr << "cannot compile " << name << " (exit code " << exit_code << ")\n\n";
|
||||
for (const auto& part : cmd) {
|
||||
std::cerr << part << " ";
|
||||
}
|
||||
std::cerr << "\n\n" << stderr_str << std::endl;
|
||||
compile_failed = true;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -398,6 +408,7 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
|
||||
shader_fnames.push_back(std::make_pair(name, out_path));
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
|
||||
compile_failed = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -539,11 +550,9 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
};
|
||||
|
||||
// Shaders with f16 B_TYPE
|
||||
string_to_spv(shader_name + "_f32_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16" + dot2_sfx + "_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE_SCALAR", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
string_to_spv(shader_name + "_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16" + dot2_sfx + "_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE_SCALAR", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
// bf16
|
||||
{
|
||||
@@ -565,8 +574,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
#endif
|
||||
{
|
||||
if (!dot2) {
|
||||
string_to_spv(shader_name + "_bf16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"B_TYPEV4", "bf16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"B_TYPEV4", "bf16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_bf16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"B_TYPE_SCALAR", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"B_TYPEV4", "bf16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -583,8 +591,6 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
}
|
||||
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
// For unaligned, load one at a time for f32/f16, or two at a time for quants
|
||||
std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? "1" : load_vec_quant;
|
||||
// For aligned matmul loads
|
||||
std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? load_vec : load_vec_quant;
|
||||
|
||||
@@ -597,13 +603,11 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
|
||||
// don't generate f32 variants for coopmat2
|
||||
if (!coopmat2) {
|
||||
string_to_spv(shader_name + "_" + tname + "_f32" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32" + dot2_sfx + "_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPE_SCALAR", "float"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
|
||||
if (tname != "f16" && tname != "f32") {
|
||||
string_to_spv(shader_name + "_" + tname + "_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16" + dot2_sfx + "_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE_SCALAR", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
@@ -850,21 +854,12 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("repeat_i32", "repeat.comp", {{"A_TYPE", "int32_t"}, {"D_TYPE", "int32_t"}});
|
||||
string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("get_rows_back_f32", "get_rows_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("repeat_i16", "repeat.comp", {{"A_TYPE", "int16_t"}, {"D_TYPE", "int16_t"}});
|
||||
|
||||
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("sqrt_f32", "sqrt.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("concat_i8", "concat.comp", {{"A_TYPE", "uint8_t"}, {"B_TYPE", "uint8_t"}, {"D_TYPE", "uint8_t"}});
|
||||
@@ -891,6 +886,18 @@ void process_shaders() {
|
||||
string_to_spv("silu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_silu"}});
|
||||
string_to_spv("relu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_relu"}});
|
||||
string_to_spv("relu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_relu"}});
|
||||
string_to_spv("sqr_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sqr"}});
|
||||
string_to_spv("sqr_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sqr"}});
|
||||
string_to_spv("sqrt_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sqrt"}});
|
||||
string_to_spv("sqrt_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sqrt"}});
|
||||
string_to_spv("sin_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sin"}});
|
||||
string_to_spv("sin_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sin"}});
|
||||
string_to_spv("cos_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_cos"}});
|
||||
string_to_spv("cos_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_cos"}});
|
||||
string_to_spv("clamp_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_clamp"}});
|
||||
string_to_spv("clamp_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_clamp"}});
|
||||
string_to_spv("leaky_relu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_leaky_relu"}});
|
||||
string_to_spv("leaky_relu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_leaky_relu"}});
|
||||
string_to_spv("neg_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_neg"}});
|
||||
string_to_spv("neg_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_neg"}});
|
||||
string_to_spv("tanh_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_tanh"}});
|
||||
@@ -948,7 +955,6 @@ void process_shaders() {
|
||||
string_to_spv("geglu_quick_f16","geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("geglu_quick_f32","geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
@@ -1060,6 +1066,31 @@ void process_shaders() {
|
||||
}
|
||||
}
|
||||
|
||||
for (auto unroll : {false, true}) {
|
||||
for (auto a_f16 : {false, true}) {
|
||||
std::map<std::string, std::string> defines = {
|
||||
{"A_TYPE", a_f16 ? "float16_t" : "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"},
|
||||
{"UNROLL", unroll ? "[[unroll]]" : ""},
|
||||
};
|
||||
std::string name = std::string("conv3d") + (a_f16 ? "_f16" : "") + "_f32";
|
||||
string_to_spv(name + (unroll ? "_unroll" : ""), "conv3d_mm.comp", defines);
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
if (unroll) {
|
||||
auto cm2_defines = defines;
|
||||
cm2_defines["COOPMAT2"] = "1";
|
||||
string_to_spv(name, "conv3d_mm.comp", cm2_defines, true, false, true);
|
||||
}
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (unroll) {
|
||||
auto cm1_defines = defines;
|
||||
cm1_defines["COOPMAT"] = "1";
|
||||
string_to_spv(name, "conv3d_mm.comp", cm1_defines, true, true, false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
||||
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
||||
string_to_spv("conv2d_dw_whcn_f16_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
||||
@@ -1251,6 +1282,11 @@ int main(int argc, char** argv) {
|
||||
|
||||
process_shaders();
|
||||
|
||||
if (compile_failed) {
|
||||
std::cerr << "vulkan-shaders-gen: one or more shaders failed to compile" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
write_output_files();
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
|
||||
@@ -4270,7 +4270,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_L2_NORM:
|
||||
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
|
||||
supports_op = (op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32) && ggml_is_contiguous_rows(src0);
|
||||
break;
|
||||
case GGML_OP_ROPE:
|
||||
supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
|
||||
|
||||
@@ -57,19 +57,25 @@ oppoll=
|
||||
opflt=
|
||||
[ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF"
|
||||
|
||||
opfuse=
|
||||
[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC"
|
||||
|
||||
vmem=
|
||||
[ "$VM" != "" ] && vmem="GGML_HEXAGON_VMEM=$VM"
|
||||
|
||||
mbuf=
|
||||
[ "$MB" != "" ] && mbuf="GGML_HEXAGON_MBUF=$MB"
|
||||
|
||||
mmsel=
|
||||
[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM"
|
||||
|
||||
set -x
|
||||
|
||||
adb $adbserial $adbhost shell " \
|
||||
cd $basedir; ulimit -c unlimited; \
|
||||
LD_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $vmem $mbuf \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $opfuse $vmem $mbuf $mmsel \
|
||||
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
|
||||
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
|
||||
--ctx-size 8192 --ubatch-size 1024 -fa on \
|
||||
|
||||
@@ -51,6 +51,12 @@ opqueue=
|
||||
oppoll=
|
||||
[ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP"
|
||||
|
||||
opfuse=
|
||||
[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC"
|
||||
|
||||
mmsel=
|
||||
[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM"
|
||||
|
||||
set -x
|
||||
|
||||
tool=$1; shift
|
||||
@@ -59,5 +65,5 @@ adb $adbserial $adbhost shell " \
|
||||
cd $basedir; ulimit -c unlimited; \
|
||||
LD_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll ./$branch/bin/$tool $@ \
|
||||
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opfuse $mmsel ./$branch/bin/$tool $@ \
|
||||
"
|
||||
|
||||
@@ -26,7 +26,7 @@ COL_MAP = {
|
||||
}
|
||||
|
||||
op_pattern = re.compile(
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+:\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
|
||||
)
|
||||
|
||||
trace_pattern = re.compile(
|
||||
@@ -93,9 +93,40 @@ def parse_log(file_path, pmu_index=None):
|
||||
+ int(ts_match.group('us'))
|
||||
)
|
||||
|
||||
op_match = op_pattern.search(line)
|
||||
if "|" in line and "profile-op" in line:
|
||||
parts = [p.strip() for p in line.split("|")]
|
||||
prefix = parts[0]
|
||||
prefix_match = re.search(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)", prefix)
|
||||
if not prefix_match:
|
||||
continue
|
||||
|
||||
if len(parts) == 7:
|
||||
dims, types, timings = parts[2], parts[3], parts[6]
|
||||
elif len(parts) == 6:
|
||||
dims, types, timings = parts[2], parts[3], parts[5]
|
||||
else:
|
||||
continue
|
||||
|
||||
timing_match = re.search(
|
||||
r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?",
|
||||
timings
|
||||
)
|
||||
if not timing_match:
|
||||
continue
|
||||
|
||||
op_match = timing_match
|
||||
op_name = prefix_match.group("op_name")
|
||||
else:
|
||||
op_match = op_pattern.search(line)
|
||||
if op_match:
|
||||
op_name = op_match.group('op_name')
|
||||
dims = op_match.group('dims').strip()
|
||||
types = op_match.group('types').strip()
|
||||
else:
|
||||
op_match = None
|
||||
|
||||
if op_match:
|
||||
pmu_raw = op_match.group('pmu')
|
||||
pmu_raw = op_match.group('pmu') if 'pmu' in op_match.groupdict() else None
|
||||
pmu_val = None
|
||||
if pmu_raw and pmu_index is not None:
|
||||
try:
|
||||
@@ -105,7 +136,7 @@ def parse_log(file_path, pmu_index=None):
|
||||
except (ValueError, IndexError):
|
||||
pmu_val = None
|
||||
|
||||
evt_raw = op_match.group('evt')
|
||||
evt_raw = op_match.group('evt') if 'evt' in op_match.groupdict() else None
|
||||
evt_val = None
|
||||
if evt_raw:
|
||||
try:
|
||||
@@ -122,9 +153,9 @@ def parse_log(file_path, pmu_index=None):
|
||||
op_text = line[idx + 11:].strip() if idx != -1 else line.strip()
|
||||
|
||||
current_op = {
|
||||
'name': op_match.group('op_name'),
|
||||
'dims': op_match.group('dims').strip(),
|
||||
'types': op_match.group('types').strip(),
|
||||
'name': op_name,
|
||||
'dims': dims,
|
||||
'types': types,
|
||||
'op_text': op_text,
|
||||
'usec': int(op_match.group('usec')),
|
||||
'cycles': int(op_match.group('cycles')),
|
||||
|
||||
@@ -12,7 +12,7 @@ from collections import defaultdict
|
||||
logger = logging.getLogger("ggml-hexagon-trace")
|
||||
|
||||
op_pattern = re.compile(
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+(?P<strides>[\d:x\s\->!]+)\s+:\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+(?P<strides>[\d:x\s\->!]+?)\s+:\s+(?:(?P<params>.*?)\s+:\s+)?(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
|
||||
)
|
||||
|
||||
trace_pattern = re.compile(
|
||||
@@ -66,7 +66,40 @@ def parse_log(file_path):
|
||||
|
||||
for line in f:
|
||||
line_idx += 1
|
||||
op_match = op_pattern.search(line)
|
||||
if "|" in line and "profile-op" in line:
|
||||
parts = [p.strip() for p in line.split("|")]
|
||||
prefix = parts[0]
|
||||
prefix_match = re.search(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)", prefix)
|
||||
if not prefix_match:
|
||||
continue
|
||||
|
||||
if len(parts) == 7:
|
||||
dims, types, strides, params, timings = parts[2], parts[3], parts[4], parts[5], parts[6]
|
||||
elif len(parts) == 6:
|
||||
dims, types, strides, params, timings = parts[2], parts[3], parts[4], "", parts[5]
|
||||
else:
|
||||
continue
|
||||
|
||||
timing_match = re.search(
|
||||
r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?",
|
||||
timings
|
||||
)
|
||||
if not timing_match:
|
||||
continue
|
||||
|
||||
op_match = timing_match
|
||||
op_name = prefix_match.group("op_name")
|
||||
else:
|
||||
op_match = op_pattern.search(line)
|
||||
if op_match:
|
||||
op_name = op_match.group('op_name')
|
||||
dims = op_match.group('dims').strip() if op_match.group('dims') else ''
|
||||
types = op_match.group('types').strip() if op_match.group('types') else ''
|
||||
strides = op_match.group('strides').strip() if op_match.group('strides') else ''
|
||||
params = op_match.group('params').strip() if ('params' in op_match.groupdict() and op_match.group('params')) else ''
|
||||
else:
|
||||
op_match = None
|
||||
|
||||
if op_match:
|
||||
cycles_start_raw = op_match.group('start')
|
||||
unwrapped_cycles_start = None
|
||||
@@ -77,10 +110,11 @@ def parse_log(file_path):
|
||||
op_text = line[idx + 11:].strip() if idx != -1 else line.strip()
|
||||
|
||||
current_op = {
|
||||
'name': op_match.group('op_name'),
|
||||
'dims': op_match.group('dims').strip() if op_match.group('dims') else '',
|
||||
'types': op_match.group('types').strip() if op_match.group('types') else '',
|
||||
'strides': op_match.group('strides').strip() if op_match.group('strides') else '',
|
||||
'name': op_name,
|
||||
'dims': dims,
|
||||
'types': types,
|
||||
'strides': strides,
|
||||
'params': params,
|
||||
'op_text': op_text,
|
||||
'usec': int(op_match.group('usec')),
|
||||
'cycles': int(op_match.group('cycles')),
|
||||
@@ -397,6 +431,8 @@ def generate_perfetto_trace(filtered_ops, output_path):
|
||||
debug_annots.append(make_debug_annotation("line", int_val=op['line_num']))
|
||||
if 'strides' in op and op['strides']:
|
||||
debug_annots.append(make_debug_annotation("strides", string_val=op['strides']))
|
||||
if 'params' in op and op['params'] and op['params'] != '----':
|
||||
debug_annots.append(make_debug_annotation("params", string_val=op['params']))
|
||||
|
||||
# Slice Begin
|
||||
evt_begin = make_track_event(1, 2, name=f"{op['name']} ({op['dims']})", category="operator", debug_annotations=debug_annots)
|
||||
|
||||
+14
-4
@@ -190,7 +190,15 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
|
||||
auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
|
||||
auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
|
||||
|
||||
bx = ggml_concat(ctx0, conv, bx, 0);
|
||||
// causal prepends the state, non-causal pads symmetrically for a centered window
|
||||
if (hparams.causal_attn) {
|
||||
bx = ggml_concat(ctx0, conv, bx, 0);
|
||||
} else {
|
||||
const int64_t pad = (hparams.n_shortconv_l_cache - 1) / 2;
|
||||
auto * left = ggml_cont(ctx0,
|
||||
ggml_view_3d(ctx0, conv, pad, hparams.n_embd, n_seqs, conv->nb[1], conv->nb[2], (d_conv - pad) * conv->nb[0]));
|
||||
bx = ggml_pad_ext(ctx0, ggml_concat(ctx0, left, bx, 0), 0, pad, 0, 0, 0, 0, 0, 0);
|
||||
}
|
||||
GGML_ASSERT(bx->ne[0] > conv->ne[0]);
|
||||
|
||||
// last d_conv columns is a new conv state
|
||||
@@ -266,10 +274,12 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
if (!cparams.embeddings) {
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
res->t_logits = cur;
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
@@ -199,7 +199,6 @@ llama_build_and_test(test-jinja.cpp)
|
||||
llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
|
||||
llama_build_and_test(test-chat-auto-parser.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
|
||||
llama_build_and_test(test-chat-template.cpp)
|
||||
llama_build_and_test(test-json-partial.cpp)
|
||||
llama_build_and_test(test-log.cpp)
|
||||
llama_build_and_test(
|
||||
test-peg-parser.cpp
|
||||
|
||||
@@ -3298,21 +3298,29 @@ struct test_norm : public test_case {
|
||||
const std::array<int64_t, 4> ne;
|
||||
const bool v; // whether a is a non-contiguous view
|
||||
const float eps;
|
||||
const bool noncontig_rows;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR4(type, ne, v, eps);
|
||||
return VARS_TO_STR5(type, ne, v, eps, noncontig_rows);
|
||||
}
|
||||
|
||||
test_norm(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {64, 5, 4, 3},
|
||||
bool v = false,
|
||||
float eps = 1e-6f)
|
||||
: type(type), ne(ne), v(v), eps(eps) {}
|
||||
float eps = 1e-6f,
|
||||
bool noncontig_rows = false)
|
||||
: type(type), ne(ne), v(v), eps(eps), noncontig_rows(noncontig_rows) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
const std::array<int64_t, 4> ne_a = noncontig_rows ?
|
||||
std::array<int64_t, 4>{ ne[1], ne[0], ne[2], ne[3] } : ne;
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
if (noncontig_rows) {
|
||||
a = ggml_permute(ctx, a, 1, 0, 2, 3);
|
||||
ggml_set_name(a, "permuted a");
|
||||
}
|
||||
if (v) {
|
||||
a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
|
||||
ggml_set_name(a, "view of a");
|
||||
@@ -6193,21 +6201,29 @@ struct test_l2_norm : public test_case {
|
||||
const std::array<int64_t, 4> ne;
|
||||
const float eps;
|
||||
bool v;
|
||||
bool noncontig_rows;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR4(type, ne, eps, v);
|
||||
return VARS_TO_STR5(type, ne, eps, v, noncontig_rows);
|
||||
}
|
||||
|
||||
test_l2_norm(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {64, 64, 320, 1},
|
||||
float eps = 1e-12f,
|
||||
bool v = false)
|
||||
: type(type), ne(ne), eps(eps), v(v) {}
|
||||
bool v = false,
|
||||
bool noncontig_rows = false)
|
||||
: type(type), ne(ne), eps(eps), v(v), noncontig_rows(noncontig_rows) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
const std::array<int64_t, 4> ne_a = noncontig_rows ?
|
||||
std::array<int64_t, 4>{ ne[1], ne[0], ne[2], ne[3] } : ne;
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
if (noncontig_rows) {
|
||||
a = ggml_permute(ctx, a, 1, 0, 2, 3);
|
||||
ggml_set_name(a, "permuted a");
|
||||
}
|
||||
if (v) {
|
||||
a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
|
||||
ggml_set_name(a, "view of a");
|
||||
@@ -8282,9 +8298,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
|
||||
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
|
||||
}
|
||||
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, false, eps, true));
|
||||
test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { n, 5, 4, 3 }, eps));
|
||||
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
|
||||
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
|
||||
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false, true));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8402,6 +8420,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_MXFP4, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
|
||||
|
||||
|
||||
#if 0
|
||||
{
|
||||
// Test paths in OpenCL
|
||||
@@ -8576,6 +8599,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
|
||||
// gpt-oss issue with Vulkan mmq_id
|
||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
|
||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_Q4_0, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
|
||||
|
||||
for (ggml_type type_a : all_types) {
|
||||
test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 4, 2, false, 64, 16, 3*ggml_blck_size(type_a)));
|
||||
@@ -9272,6 +9296,34 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
}
|
||||
}
|
||||
|
||||
struct conv3d_perf_case {
|
||||
int N, IC, ID, IH, IW, OC, KD, KH, KW, s0, s1, s2, p0, p1, p2, d0, d1, d2;
|
||||
};
|
||||
|
||||
const std::vector<conv3d_perf_case> conv3d_cases = {
|
||||
{1, 320, 8, 38, 26, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 1280, 8, 38, 26, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 320, 8, 76, 52, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 1280, 8, 76, 52, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 320, 8, 152, 104, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||
#if 0
|
||||
// too slow on some devices
|
||||
{1, 1280, 8, 152, 104, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 320, 4, 304, 208, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 640, 4, 304, 208, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
|
||||
#endif
|
||||
};
|
||||
|
||||
for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
for (const conv3d_perf_case & c : conv3d_cases) {
|
||||
test_cases.emplace_back(new test_conv_3d(
|
||||
c.N, c.IC, c.ID, c.IH, c.IW,
|
||||
c.OC, c.KD, c.KH, c.KW,
|
||||
c.s0, c.s1, c.s2, c.p0, c.p1, c.p2, c.d0, c.d1, c.d2,
|
||||
kernel_type));
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
||||
|
||||
|
||||
@@ -1,287 +0,0 @@
|
||||
#include "common.h"
|
||||
#include "json-partial.h"
|
||||
#include <exception>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
template <class T> static void assert_equals(const T & expected, const T & actual) {
|
||||
if (expected != actual) {
|
||||
std::cerr << "Expected: " << expected << std::endl;
|
||||
std::cerr << "Actual: " << actual << std::endl;
|
||||
std::cerr << std::flush;
|
||||
throw std::runtime_error("Test failed");
|
||||
}
|
||||
}
|
||||
|
||||
static void test_json_healing() {
|
||||
auto parse = [](const std::string & str) {
|
||||
std::cerr << "# Parsing: " << str << '\n';
|
||||
std::string::const_iterator it = str.begin();
|
||||
const auto end = str.end();
|
||||
common_json out;
|
||||
std::string healing_marker = "$llama.cpp.json$";
|
||||
if (common_json_parse(it, end, healing_marker, out)) {
|
||||
auto dump = out.json.dump();
|
||||
std::cerr << "Parsed: " << dump << '\n';
|
||||
std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n';
|
||||
std::string result;
|
||||
if (!out.healing_marker.json_dump_marker.empty()) {
|
||||
auto i = dump.find(out.healing_marker.json_dump_marker);
|
||||
if (i == std::string::npos) {
|
||||
throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")");
|
||||
}
|
||||
result = dump.substr(0, i);
|
||||
} else {
|
||||
result = dump;
|
||||
}
|
||||
std::cerr << "Result: " << result << '\n';
|
||||
if (string_starts_with(str, result)) {
|
||||
std::cerr << "Failure!\n";
|
||||
}
|
||||
// return dump;
|
||||
} else {
|
||||
throw std::runtime_error("Failed to parse: " + str);
|
||||
}
|
||||
|
||||
};
|
||||
auto parse_all = [&](const std::string & str) {
|
||||
for (size_t i = 1; i < str.size(); i++) {
|
||||
parse(str.substr(0, i));
|
||||
}
|
||||
};
|
||||
parse_all("{\"a\": \"b\"}");
|
||||
parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}");
|
||||
|
||||
parse_all("[{\"a\": \"b\"}]");
|
||||
|
||||
auto test = [&](const std::vector<std::string> & inputs, const std::string & expected, const std::string & expected_marker) {
|
||||
for (const auto & input : inputs) {
|
||||
common_json out;
|
||||
assert_equals(true, common_json_parse(input, "$foo", out));
|
||||
assert_equals<std::string>(expected, out.json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true));
|
||||
assert_equals<std::string>(expected_marker, out.healing_marker.json_dump_marker);
|
||||
}
|
||||
};
|
||||
// No healing needed:
|
||||
test(
|
||||
{
|
||||
R"([{"a":"b"}, "y"])",
|
||||
},
|
||||
R"([{"a":"b"},"y"])",
|
||||
""
|
||||
);
|
||||
// Partial literals can't be healed:
|
||||
test(
|
||||
{
|
||||
R"([1)",
|
||||
R"([tru)",
|
||||
R"([n)",
|
||||
R"([nul)",
|
||||
R"([23.2)",
|
||||
},
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a": 1)",
|
||||
R"({"a": tru)",
|
||||
R"({"a": n)",
|
||||
R"({"a": nul)",
|
||||
R"({"a": 23.2)",
|
||||
},
|
||||
R"({"a":"$foo"})",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({)",
|
||||
},
|
||||
R"({"$foo":1})",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([)",
|
||||
},
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
// Healing right after a full literal
|
||||
test(
|
||||
{
|
||||
R"(1 )",
|
||||
},
|
||||
R"(1)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"(true)",
|
||||
R"(true )",
|
||||
},
|
||||
R"(true)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"(null)",
|
||||
R"(null )",
|
||||
},
|
||||
R"(null)",
|
||||
""
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([1 )",
|
||||
},
|
||||
R"([1,"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{})",
|
||||
R"([{} )",
|
||||
},
|
||||
R"([{},"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true)",
|
||||
},
|
||||
// TODO: detect the true/false/null literal was complete
|
||||
R"(["$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true )",
|
||||
},
|
||||
R"([true,"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([true,)",
|
||||
},
|
||||
R"([true,"$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
// Test nesting
|
||||
test(
|
||||
{
|
||||
R"([{"a": [{"b": [{)",
|
||||
},
|
||||
R"([{"a":[{"b":[{"$foo":1}]}]}])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{"a": [{"b": [)",
|
||||
},
|
||||
R"([{"a":[{"b":["$foo"]}]}])",
|
||||
R"("$foo)"
|
||||
);
|
||||
|
||||
test(
|
||||
{
|
||||
R"([{"a": "b"})",
|
||||
R"([{"a": "b"} )",
|
||||
},
|
||||
R"([{"a":"b"},"$foo"])",
|
||||
R"(,"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"([{"a": "b"},)",
|
||||
R"([{"a": "b"}, )",
|
||||
},
|
||||
R"([{"a":"b"},"$foo"])",
|
||||
R"("$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code)",
|
||||
},
|
||||
R"({"code$foo":1})",
|
||||
R"($foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code\)",
|
||||
},
|
||||
R"({"code\\$foo":1})",
|
||||
R"(\$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "code")",
|
||||
},
|
||||
R"({"code":"$foo"})",
|
||||
R"(:"$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({ "key")",
|
||||
},
|
||||
R"({"key":"$foo"})",
|
||||
R"(:"$foo)"
|
||||
);
|
||||
// Test unicode escape sequences
|
||||
test(
|
||||
{
|
||||
R"({"a":"\u)",
|
||||
},
|
||||
R"({"a":"\u0000$foo"})",
|
||||
R"(0000$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\u00)",
|
||||
},
|
||||
R"({"a":"\u0000$foo"})",
|
||||
R"(00$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud300)",
|
||||
},
|
||||
R"({"a":"\ud300$foo"})",
|
||||
R"($foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud800)",
|
||||
},
|
||||
R"({"a":"\ud800\udc00$foo"})",
|
||||
R"(\udc00$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud800\)",
|
||||
},
|
||||
R"({"a":"\ud800\udc00$foo"})",
|
||||
R"(udc00$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud800\u)",
|
||||
},
|
||||
R"({"a":"\ud800\udc00$foo"})",
|
||||
R"(dc00$foo)"
|
||||
);
|
||||
test(
|
||||
{
|
||||
R"({"a":"\ud800\udc00)",
|
||||
},
|
||||
R"({"a":"\ud800\udc00$foo"})",
|
||||
R"($foo)"
|
||||
);
|
||||
}
|
||||
|
||||
int main() {
|
||||
test_json_healing();
|
||||
std::cerr << "All tests passed.\n";
|
||||
return 0;
|
||||
}
|
||||
@@ -2,13 +2,11 @@
|
||||
|
||||
set(TARGET llama-cli-impl)
|
||||
|
||||
add_library(${TARGET} cli.cpp
|
||||
cli-client.cpp
|
||||
cli-context.cpp)
|
||||
add_library(${TARGET} cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server)
|
||||
target_link_libraries(${TARGET} PUBLIC llama-server-impl llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
|
||||
@@ -1,164 +0,0 @@
|
||||
#include "cli-client.h"
|
||||
|
||||
#include "http.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
|
||||
// generation can stall for a long time during prompt processing, so the
|
||||
// read timeout must be generous
|
||||
static constexpr time_t CLI_HTTP_READ_TIMEOUT_SEC = 3600;
|
||||
|
||||
// upper bound for the accumulated response body kept for error reporting
|
||||
static constexpr size_t CLI_HTTP_MAX_ERROR_BODY = 1024 * 1024;
|
||||
|
||||
// returns the path with the base url's path prefix prepended (if any)
|
||||
static std::string join_path(const common_http_url & parts, const std::string & path) {
|
||||
if (parts.path.empty() || parts.path == "/") {
|
||||
return path;
|
||||
}
|
||||
std::string prefix = parts.path;
|
||||
if (prefix.back() == '/') {
|
||||
prefix.pop_back();
|
||||
}
|
||||
return prefix + path;
|
||||
}
|
||||
|
||||
json cli_client::get(const std::string & path) {
|
||||
auto [cli, parts] = common_http_client(server_base);
|
||||
cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
|
||||
auto path_with_model = path + (model.empty() ? "" : ("?model=" + model));
|
||||
auto res = cli.Get(join_path(parts, path_with_model));
|
||||
if (!res) {
|
||||
throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(res.error()));
|
||||
}
|
||||
if (res->status < 200 || res->status >= 300) {
|
||||
throw std::runtime_error("GET " + path + " failed with status " + std::to_string(res->status) + ": " + res->body);
|
||||
}
|
||||
json result = json::parse(res->body, nullptr, false);
|
||||
if (result.is_discarded()) {
|
||||
throw std::runtime_error("GET " + path + " returned invalid JSON");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
json cli_client::post(const std::string & path, const json & body) {
|
||||
auto [cli, parts] = common_http_client(server_base);
|
||||
cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
|
||||
auto body_with_model = body;
|
||||
if (!model.empty()) {
|
||||
body_with_model["model"] = model;
|
||||
}
|
||||
auto res = cli.Post(join_path(parts, path), body_with_model.dump(), "application/json");
|
||||
if (!res) {
|
||||
throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(res.error()));
|
||||
}
|
||||
if (res->status < 200 || res->status >= 300) {
|
||||
throw std::runtime_error("POST " + path + " failed with status " + std::to_string(res->status) + ": " + res->body);
|
||||
}
|
||||
json result = json::parse(res->body, nullptr, false);
|
||||
if (result.is_discarded()) {
|
||||
throw std::runtime_error("POST " + path + " returned invalid JSON");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
json cli_client::post_sse(const std::string & path,
|
||||
const json & body,
|
||||
const std::function<bool()> & should_stop,
|
||||
const std::function<void(const json &)> & on_data) {
|
||||
auto [cli, parts] = common_http_client(server_base);
|
||||
cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
|
||||
|
||||
std::string pending; // buffer for incomplete SSE lines
|
||||
std::string raw_body; // accumulated body, used only for error reporting
|
||||
|
||||
auto receiver = [&](const char * data, size_t len) -> bool {
|
||||
if (should_stop()) {
|
||||
return false; // aborts the request
|
||||
}
|
||||
if (raw_body.size() < CLI_HTTP_MAX_ERROR_BODY) {
|
||||
raw_body.append(data, std::min(len, CLI_HTTP_MAX_ERROR_BODY - raw_body.size()));
|
||||
}
|
||||
pending.append(data, len);
|
||||
size_t pos;
|
||||
while ((pos = pending.find('\n')) != std::string::npos) {
|
||||
std::string line = pending.substr(0, pos);
|
||||
pending.erase(0, pos + 1);
|
||||
if (!line.empty() && line.back() == '\r') {
|
||||
line.pop_back();
|
||||
}
|
||||
if (line.rfind("data: ", 0) != 0) {
|
||||
continue;
|
||||
}
|
||||
std::string payload = line.substr(6);
|
||||
if (payload == "[DONE]") {
|
||||
continue;
|
||||
}
|
||||
json event = json::parse(payload, nullptr, false);
|
||||
if (!event.is_discarded()) {
|
||||
on_data(event);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
httplib::Headers headers = {{"Accept", "text/event-stream"}};
|
||||
auto body_with_model = body;
|
||||
if (!model.empty()) {
|
||||
body_with_model["model"] = model;
|
||||
}
|
||||
auto res = cli.Post(join_path(parts, path), headers, body_with_model.dump(), "application/json", receiver);
|
||||
|
||||
if (!res) {
|
||||
if (res.error() == httplib::Error::Canceled && should_stop()) {
|
||||
return json(); // cancelled by the user
|
||||
}
|
||||
return json {{"error", {{"message", "failed to connect to " + server_base + ": " + httplib::to_string(res.error())}}}};
|
||||
}
|
||||
if (res->status < 200 || res->status >= 300) {
|
||||
json error_body = json::parse(raw_body, nullptr, false);
|
||||
if (!error_body.is_discarded() && error_body.contains("error")) {
|
||||
return error_body;
|
||||
}
|
||||
return json {{"error", {{"message", "request failed with status " + std::to_string(res->status)}}}};
|
||||
}
|
||||
return json();
|
||||
}
|
||||
|
||||
bool cli_client::wait_health(const std::function<bool()> & is_aborted) {
|
||||
int connect_attempts = 0;
|
||||
while (!is_aborted()) {
|
||||
auto [cli, parts] = common_http_client(server_base);
|
||||
cli.set_connection_timeout(1, 0);
|
||||
auto res = cli.Get(join_path(parts, "/health"));
|
||||
if (res) {
|
||||
if (res->status == 200) {
|
||||
return true;
|
||||
}
|
||||
// any other status means the server is up but not ready yet
|
||||
// (e.g. 503 while the model is still loading)
|
||||
} else if (++connect_attempts >= 10) {
|
||||
last_error = "failed to connect to " + server_base + ": " + httplib::to_string(res.error());
|
||||
return false;
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(300));
|
||||
}
|
||||
last_error = "aborted while waiting for the server to become ready";
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<std::string> cli_client::list_models() {
|
||||
json resp = get("/v1/models");
|
||||
if (!resp.contains("data") || !resp.at("data").is_array()) {
|
||||
throw std::runtime_error("invalid response from /v1/models");
|
||||
}
|
||||
std::vector<std::string> models;
|
||||
for (const auto & m : resp.at("data")) {
|
||||
if (m.contains("id") && m.at("id").is_string()) {
|
||||
models.push_back(m.at("id").get<std::string>());
|
||||
}
|
||||
}
|
||||
return models;
|
||||
}
|
||||
@@ -1,56 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
// openai-like client for CLI
|
||||
struct cli_client {
|
||||
std::string server_base; // base url, for example "http://127.0.0.1:8080"
|
||||
std::string last_error; // set when wait_health() fails
|
||||
|
||||
std::string model; // optional, set when the server has multiple models (router mode)
|
||||
|
||||
// simple GET request, returns the response json
|
||||
// throws std::runtime_error on transport error or non-2xx status
|
||||
json get(const std::string & path);
|
||||
|
||||
// simple POST request, returns the response json
|
||||
// throws std::runtime_error on transport error or non-2xx status
|
||||
json post(const std::string & path, const json & body);
|
||||
|
||||
// POST request with an SSE streaming response; on_data is invoked once
|
||||
// per "data:" event; the function returns after the stream is finished:
|
||||
// a null json on graceful exit (incl. cancellation via should_stop),
|
||||
// the error response json otherwise
|
||||
json post_sse(const std::string & path,
|
||||
const json & body,
|
||||
const std::function<bool()> & should_stop,
|
||||
const std::function<void(const json &)> & on_data);
|
||||
|
||||
// poll /health until the server is ready to accept requests
|
||||
// returns false if is_aborted returned true or the server is unreachable
|
||||
bool wait_health(const std::function<bool()> & is_aborted);
|
||||
|
||||
//
|
||||
// higher-level wrappers
|
||||
//
|
||||
|
||||
json create_chat_completion(const json & request,
|
||||
const std::function<bool()> & should_stop,
|
||||
const std::function<void(const json &)> & on_data) {
|
||||
return post_sse("/v1/chat/completions", request, should_stop, on_data);
|
||||
}
|
||||
|
||||
json get_props() {
|
||||
return get("/props");
|
||||
}
|
||||
|
||||
std::vector<std::string> list_models();
|
||||
};
|
||||
@@ -1,559 +0,0 @@
|
||||
#include "cli-context.h"
|
||||
#include "cli-view.h"
|
||||
|
||||
#include "arg.h"
|
||||
#include "base64.hpp"
|
||||
#include "log.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
std::atomic<bool> g_cli_interrupted = false;
|
||||
|
||||
static bool should_stop() {
|
||||
return g_cli_interrupted.load();
|
||||
}
|
||||
|
||||
static constexpr size_t FILE_GLOB_MAX_RESULTS = 100;
|
||||
|
||||
const char * LLAMA_ASCII_LOGO = R"(
|
||||
▄▄ ▄▄
|
||||
██ ██
|
||||
██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄
|
||||
██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██
|
||||
██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
|
||||
██ ██
|
||||
▀▀ ▀▀
|
||||
)";
|
||||
|
||||
// number of values an arg consumes on the command line
|
||||
static int arg_num_values(const common_arg & opt) {
|
||||
if (opt.value_hint_2 != nullptr) {
|
||||
return 2;
|
||||
}
|
||||
if (opt.value_hint != nullptr) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static std::string format_error_message(const json & err) {
|
||||
if (err.contains("error") && err.at("error").is_object()) {
|
||||
const auto & e = err.at("error");
|
||||
if (e.contains("message") && e.at("message").is_string()) {
|
||||
return e.at("message").get<std::string>();
|
||||
}
|
||||
}
|
||||
return err.dump();
|
||||
}
|
||||
|
||||
static std::string media_type_from_ext(const std::string & fname) {
|
||||
std::string ext = std::filesystem::path(fname).extension().string();
|
||||
std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
|
||||
if (ext == ".wav" || ext == ".mp3") {
|
||||
return "audio";
|
||||
}
|
||||
if (ext == ".mp4" || ext == ".avi" || ext == ".mkv" || ext == ".mov" || ext == ".webm") {
|
||||
return "video";
|
||||
}
|
||||
return "image";
|
||||
}
|
||||
|
||||
bool cli_context::init() {
|
||||
view::init(params);
|
||||
|
||||
std::optional<view::spinner> spinner;
|
||||
|
||||
bool use_external_server = !params.server_base.empty();
|
||||
if (use_external_server) {
|
||||
std::string base = params.server_base;
|
||||
while (!base.empty() && base.back() == '/') {
|
||||
base.pop_back();
|
||||
}
|
||||
client.server_base = base;
|
||||
|
||||
spinner.emplace("Connecting to server at " + base);
|
||||
} else {
|
||||
if (params.model.path.empty() && params.model.url.empty() &&
|
||||
params.model.hf_repo.empty() && params.model.docker_repo.empty()) {
|
||||
view::show_error(
|
||||
"no model specified",
|
||||
"use -m <file.gguf> or -hf <user/repo> to run a local model,\n"
|
||||
"or --server-base <url> to connect to a running llama-server"
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
spinner.emplace("\n\nLoading model...");
|
||||
|
||||
server.emplace();
|
||||
if (!server->start(params)) {
|
||||
view::show_error("server start failed");
|
||||
return false;
|
||||
}
|
||||
if (!server->wait_ready(should_stop)) {
|
||||
if (!should_stop()) {
|
||||
view::show_error("the server exited before becoming ready");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
client.server_base = server->address();
|
||||
}
|
||||
|
||||
// for --server-base this is the main availability check; for a spawned
|
||||
// server it is a cheap sanity check on top of the ready signal
|
||||
auto is_aborted = [this]() {
|
||||
return should_stop() || (server && !server->alive());
|
||||
};
|
||||
bool healthy = false;
|
||||
try {
|
||||
healthy = client.wait_health(is_aborted);
|
||||
} catch (const std::exception & e) {
|
||||
client.last_error = e.what();
|
||||
}
|
||||
if (!healthy) {
|
||||
if (!should_stop()) {
|
||||
view::show_error(client.last_error);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (use_external_server) {
|
||||
spinner.reset();
|
||||
if (!list_and_ask_models()) {
|
||||
return false;
|
||||
}
|
||||
// restore the spinner for the next step
|
||||
spinner.emplace("Waiting for server...");
|
||||
}
|
||||
|
||||
fetch_server_props();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void cli_context::fetch_server_props() {
|
||||
try {
|
||||
json props = client.get_props();
|
||||
model_name = props.value("model_alias", "");
|
||||
if (model_name.empty()) {
|
||||
const std::string path = props.value("model_path", "");
|
||||
if (!path.empty()) {
|
||||
model_name = std::filesystem::path(path).filename().string();
|
||||
}
|
||||
}
|
||||
build_info = props.value("build_info", "");
|
||||
if (props.contains("modalities") && props.at("modalities").is_object()) {
|
||||
const auto & modalities = props.at("modalities");
|
||||
has_vision = modalities.value("vision", false);
|
||||
has_audio = modalities.value("audio", false);
|
||||
has_video = modalities.value("video", false);
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
// /props can be disabled on remote servers; not fatal
|
||||
LOG_DBG("failed to fetch /props: %s\n", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
bool cli_context::list_and_ask_models() {
|
||||
auto models = client.list_models();
|
||||
|
||||
// only one model: use it without asking
|
||||
if (models.size() == 1) {
|
||||
model_name = models[0];
|
||||
client.model = model_name;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string message = "\nAvailable models:";
|
||||
if (!models.empty()) {
|
||||
for (size_t i = 0; i < models.size(); ++i) {
|
||||
message += "\n " + std::to_string(i + 1) + ". " + models[i];
|
||||
}
|
||||
}
|
||||
message += "\n";
|
||||
view::show_message(message);
|
||||
std::string selection;
|
||||
while (selection.empty()) {
|
||||
if (should_stop()) {
|
||||
return false;
|
||||
}
|
||||
view::user_turn user_turn;
|
||||
selection = user_turn.read_input(false, "Select model by number: ");
|
||||
if (selection.empty()) {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
size_t idx = std::stoul(selection);
|
||||
if (idx > 0 && idx <= models.size()) {
|
||||
model_name = models[idx - 1];
|
||||
client.model = model_name;
|
||||
view::show_message("Selected model: " + model_name);
|
||||
break;
|
||||
}
|
||||
} catch (...) {
|
||||
// ignore
|
||||
}
|
||||
view::show_error("Invalid selection. Please enter a valid number.");
|
||||
selection.clear();
|
||||
continue;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void cli_context::add_system_prompt() {
|
||||
if (!params.system_prompt.empty()) {
|
||||
messages.push_back({
|
||||
{"role", "system"},
|
||||
{"content", params.system_prompt}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void cli_context::push_user_message(const std::string & text) {
|
||||
json content;
|
||||
if (pending_media.empty()) {
|
||||
content = text;
|
||||
} else {
|
||||
// multimodal message: media parts first, then the text
|
||||
content = pending_media;
|
||||
content.push_back({
|
||||
{"type", "text"},
|
||||
{"text", text}
|
||||
});
|
||||
pending_media = json::array();
|
||||
}
|
||||
messages.push_back({
|
||||
{"role", "user"},
|
||||
{"content", content}
|
||||
});
|
||||
}
|
||||
|
||||
bool cli_context::stage_media_file(const std::string & fname, const std::string & type) {
|
||||
std::ifstream file(fname, std::ios::binary);
|
||||
if (!file) {
|
||||
return false;
|
||||
}
|
||||
std::string data((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
std::string encoded = base64::encode(data);
|
||||
|
||||
if (type == "audio") {
|
||||
std::string ext = std::filesystem::path(fname).extension().string();
|
||||
std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
|
||||
pending_media.push_back({
|
||||
{"type", "input_audio"},
|
||||
{"input_audio", {
|
||||
{"data", encoded},
|
||||
{"format", ext == ".mp3" ? "mp3" : "wav"}
|
||||
}}
|
||||
});
|
||||
} else if (type == "video") {
|
||||
pending_media.push_back({
|
||||
{"type", "input_video"},
|
||||
{"input_video", {
|
||||
{"data", encoded}
|
||||
}}
|
||||
});
|
||||
} else {
|
||||
// the server detects the actual image type from the data
|
||||
pending_media.push_back({
|
||||
{"type", "image_url"},
|
||||
{"image_url", {
|
||||
{"url", "data:image/unknown;base64," + encoded}
|
||||
}}
|
||||
});
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool cli_context::generate_completion(std::string & assistant_content, cli_timings & timings) {
|
||||
json body = {
|
||||
{"messages", messages},
|
||||
{"stream", true},
|
||||
// in order to get timings even when we cancel mid-way
|
||||
{"timings_per_token", true},
|
||||
};
|
||||
|
||||
bool stream_error = false;
|
||||
|
||||
view::assistant_turn a;
|
||||
|
||||
json err = client.create_chat_completion(body, should_stop, [&](const json & chunk) {
|
||||
if (chunk.contains("error")) {
|
||||
stream_error = true;
|
||||
view::show_error(format_error_message(chunk));
|
||||
return;
|
||||
}
|
||||
if (chunk.contains("timings")) {
|
||||
const auto & t = chunk.at("timings");
|
||||
timings.prompt_per_second = t.value("prompt_per_second", 0.0);
|
||||
timings.predicted_per_second = t.value("predicted_per_second", 0.0);
|
||||
}
|
||||
if (!chunk.contains("choices") || !chunk.at("choices").is_array() || chunk.at("choices").empty()) {
|
||||
return;
|
||||
}
|
||||
const auto & choice = chunk.at("choices").at(0);
|
||||
if (!choice.contains("delta")) {
|
||||
return;
|
||||
}
|
||||
const auto & delta = choice.at("delta");
|
||||
if (delta.contains("reasoning_content") && delta.at("reasoning_content").is_string()) {
|
||||
const std::string text = delta.at("reasoning_content").get<std::string>();
|
||||
if (!text.empty()) {
|
||||
a.push(view::ASSISTANT_DISPLAY_MODE_REASONING, text);
|
||||
}
|
||||
}
|
||||
if (delta.contains("content") && delta.at("content").is_string()) {
|
||||
const std::string text = delta.at("content").get<std::string>();
|
||||
if (!text.empty()) {
|
||||
assistant_content += text;
|
||||
a.push(view::ASSISTANT_DISPLAY_MODE_CONTENT, text);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
g_cli_interrupted.store(false);
|
||||
|
||||
if (!err.is_null()) {
|
||||
view::show_error(format_error_message(err));
|
||||
return false;
|
||||
}
|
||||
return !stream_error;
|
||||
}
|
||||
|
||||
int cli_context::run() {
|
||||
add_system_prompt();
|
||||
|
||||
std::string modalities = "text";
|
||||
if (has_vision) {
|
||||
modalities += ", vision";
|
||||
}
|
||||
if (has_audio) {
|
||||
modalities += ", audio";
|
||||
}
|
||||
if (has_video) {
|
||||
modalities += ", video";
|
||||
}
|
||||
|
||||
std::string banner;
|
||||
banner += "\n";
|
||||
banner += LLAMA_ASCII_LOGO;
|
||||
banner += "\n";
|
||||
banner += "build : " + build_info + "\n";
|
||||
banner += "model : " + model_name + "\n";
|
||||
banner += "modalities : " + modalities + "\n";
|
||||
if (!params.system_prompt.empty()) {
|
||||
banner += "using custom system prompt\n";
|
||||
}
|
||||
banner += "\n";
|
||||
banner += "available commands:\n";
|
||||
banner += " /exit or Ctrl+C stop or exit\n";
|
||||
banner += " /regen regenerate the last response\n";
|
||||
banner += " /clear clear the chat history\n";
|
||||
banner += " /read <file> add a text file\n";
|
||||
banner += " /glob <pattern> add text files using globbing pattern\n";
|
||||
if (has_vision) {
|
||||
banner += " /image <file> add an image file\n";
|
||||
}
|
||||
if (has_audio) {
|
||||
banner += " /audio <file> add an audio file\n";
|
||||
}
|
||||
if (has_video) {
|
||||
banner += " /video <file> add a video file\n";
|
||||
}
|
||||
banner += "\n";
|
||||
|
||||
view::show_message(banner);
|
||||
|
||||
// interactive loop
|
||||
std::string cur_msg;
|
||||
|
||||
auto add_text_file = [&](const std::string & fname) -> bool {
|
||||
std::ifstream file(fname, std::ios::binary);
|
||||
if (!file) {
|
||||
view::show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str()));
|
||||
return false;
|
||||
}
|
||||
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
cur_msg += "--- File: ";
|
||||
cur_msg += fname;
|
||||
cur_msg += " ---\n";
|
||||
cur_msg += content;
|
||||
view::show_message(string_format("Loaded text from '%s'", fname.c_str()));
|
||||
return true;
|
||||
};
|
||||
|
||||
while (true) {
|
||||
std::string buffer;
|
||||
{
|
||||
view::user_turn user_turn;
|
||||
|
||||
if (params.prompt.empty()) {
|
||||
buffer = user_turn.read_input(params.multiline_input);
|
||||
} else {
|
||||
// process input prompt from args
|
||||
for (auto & fname : params.image) {
|
||||
if (!stage_media_file(fname, media_type_from_ext(fname))) {
|
||||
view::show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str()));
|
||||
break;
|
||||
}
|
||||
view::show_message(string_format("Loaded media from '%s'", fname.c_str()));
|
||||
}
|
||||
buffer = params.prompt;
|
||||
user_turn.echo(buffer);
|
||||
params.prompt.clear(); // only use it once
|
||||
}
|
||||
}
|
||||
|
||||
if (should_stop()) {
|
||||
g_cli_interrupted.store(false);
|
||||
break;
|
||||
}
|
||||
|
||||
// remove trailing newline
|
||||
if (!buffer.empty() && buffer.back() == '\n') {
|
||||
buffer.pop_back();
|
||||
}
|
||||
|
||||
// skip empty messages
|
||||
if (buffer.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool add_user_msg = true;
|
||||
|
||||
// process commands
|
||||
if (string_starts_with(buffer, "/exit")) {
|
||||
break;
|
||||
} else if (string_starts_with(buffer, "/regen")) {
|
||||
if (messages.size() >= 2) {
|
||||
size_t last_idx = messages.size() - 1;
|
||||
messages.erase(last_idx);
|
||||
add_user_msg = false;
|
||||
} else {
|
||||
view::show_error("No message to regenerate.");
|
||||
continue;
|
||||
}
|
||||
} else if (string_starts_with(buffer, "/clear")) {
|
||||
messages.clear();
|
||||
add_system_prompt();
|
||||
|
||||
pending_media = json::array();
|
||||
view::show_message("Chat history cleared.");
|
||||
continue;
|
||||
} else if (
|
||||
(string_starts_with(buffer, "/image ") && has_vision) ||
|
||||
(string_starts_with(buffer, "/audio ") && has_audio) ||
|
||||
(string_starts_with(buffer, "/video ") && has_video)) {
|
||||
std::string type = buffer.substr(1, 5);
|
||||
// just in case (bad copy-paste for example), we strip all trailing/leading spaces
|
||||
std::string fname = string_strip(buffer.substr(7));
|
||||
if (!stage_media_file(fname, type)) {
|
||||
view::show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str()));
|
||||
continue;
|
||||
}
|
||||
view::show_message(string_format("Loaded media from '%s'", fname.c_str()));
|
||||
continue;
|
||||
} else if (string_starts_with(buffer, "/read ")) {
|
||||
std::string fname = string_strip(buffer.substr(6));
|
||||
add_text_file(fname);
|
||||
continue;
|
||||
} else if (string_starts_with(buffer, "/glob ")) {
|
||||
std::error_code ec;
|
||||
size_t count = 0;
|
||||
auto curdir = std::filesystem::current_path();
|
||||
std::string pattern = string_strip(buffer.substr(6));
|
||||
std::filesystem::path rel_path;
|
||||
|
||||
auto startglob = pattern.find_first_of("![*?");
|
||||
if (startglob != std::string::npos && startglob != 0) {
|
||||
auto endpath = pattern.substr(0, startglob).find_last_of('/');
|
||||
if (endpath != std::string::npos) {
|
||||
std::string rel_pattern = pattern.substr(0, endpath);
|
||||
#if !defined(_WIN32)
|
||||
if (string_starts_with(rel_pattern, '~')) {
|
||||
const char * home = std::getenv("HOME");
|
||||
if (home && home[0]) {
|
||||
rel_pattern = home + rel_pattern.substr(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
rel_path = rel_pattern;
|
||||
pattern.erase(0, endpath + 1);
|
||||
curdir /= rel_path;
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & entry : std::filesystem::recursive_directory_iterator(curdir,
|
||||
std::filesystem::directory_options::skip_permission_denied, ec)) {
|
||||
if (!entry.is_regular_file()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string rel = std::filesystem::relative(entry.path(), curdir, ec).string();
|
||||
if (ec) {
|
||||
ec.clear();
|
||||
continue;
|
||||
}
|
||||
std::replace(rel.begin(), rel.end(), '\\', '/');
|
||||
|
||||
if (!glob_match(pattern, rel)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!add_text_file((rel_path / rel).string())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++count >= FILE_GLOB_MAX_RESULTS) {
|
||||
view::show_error(string_format("Maximum number of globbed files allowed (%zu) reached.", FILE_GLOB_MAX_RESULTS));
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
// not a command
|
||||
cur_msg += buffer;
|
||||
}
|
||||
|
||||
// generate response
|
||||
if (add_user_msg) {
|
||||
push_user_message(cur_msg);
|
||||
cur_msg.clear();
|
||||
}
|
||||
cli_timings timings;
|
||||
std::string assistant_content;
|
||||
generate_completion(assistant_content, timings);
|
||||
messages.push_back({
|
||||
{"role", "assistant"},
|
||||
{"content", assistant_content}
|
||||
});
|
||||
|
||||
if (params.show_timings) {
|
||||
view::show_info(string_format(
|
||||
"\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]",
|
||||
timings.prompt_per_second,
|
||||
timings.predicted_per_second
|
||||
));
|
||||
}
|
||||
|
||||
if (params.single_turn) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
view::show_message("\n\nExiting...");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void cli_context::shutdown() {
|
||||
if (server) {
|
||||
server->stop();
|
||||
server.reset();
|
||||
}
|
||||
}
|
||||
@@ -1,65 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include "cli-client.h"
|
||||
#include "cli-server.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
struct cli_timings {
|
||||
double prompt_per_second = 0.0;
|
||||
double predicted_per_second = 0.0;
|
||||
};
|
||||
|
||||
// set by the SIGINT handler; cleared once the interrupt has been handled
|
||||
extern std::atomic<bool> g_cli_interrupted;
|
||||
|
||||
struct cli_context {
|
||||
common_params params;
|
||||
|
||||
cli_client client; // always initialized
|
||||
std::optional<cli_server> server; // only set when no --server-base is given
|
||||
|
||||
json messages = json::array();
|
||||
json pending_media = json::array(); // staged multimodal content parts
|
||||
|
||||
// properties of the connected server
|
||||
// will be populated by fetch_server_props()
|
||||
std::string model_name;
|
||||
std::string build_info;
|
||||
bool has_vision = false;
|
||||
bool has_audio = false;
|
||||
bool has_video = false;
|
||||
|
||||
cli_context(const common_params & params) : params(params) {}
|
||||
~cli_context() {
|
||||
shutdown();
|
||||
}
|
||||
|
||||
// connect to --server-base or spawn a local llama-server child;
|
||||
// argc/argv are needed to forward the server-relevant args to the child
|
||||
bool init();
|
||||
|
||||
// run the interactive chat loop, returns the process exit code
|
||||
int run();
|
||||
|
||||
// stop the local server child (if any)
|
||||
void shutdown();
|
||||
|
||||
private:
|
||||
bool generate_completion(std::string & assistant_content, cli_timings & timings);
|
||||
void fetch_server_props();
|
||||
void add_system_prompt();
|
||||
void push_user_message(const std::string & text);
|
||||
|
||||
// check if server have multiple models (router mode)
|
||||
// if yes, list them then ask; do nothing otherwise
|
||||
bool list_and_ask_models();
|
||||
|
||||
// read a file and stage it as a multimodal content part; type is one of
|
||||
// "image", "audio", "video"; returns false if the file cannot be read
|
||||
bool stage_media_file(const std::string & fname, const std::string & type);
|
||||
};
|
||||
@@ -1,83 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "http.h"
|
||||
|
||||
// llama_server will be available as a dynamic library symbol
|
||||
int llama_server(common_params & params, int argc, char ** argv);
|
||||
void llama_server_terminate();
|
||||
|
||||
struct cli_server {
|
||||
std::thread th;
|
||||
int port = -1;
|
||||
std::atomic<bool> is_alive = false;
|
||||
std::atomic<bool> is_stopping = false;
|
||||
|
||||
~cli_server() {
|
||||
stop();
|
||||
}
|
||||
|
||||
void stop() {
|
||||
if (alive() && !is_stopping.exchange(true)) {
|
||||
llama_server_terminate();
|
||||
th.join();
|
||||
}
|
||||
}
|
||||
|
||||
// spawn llama-server in a thread and interact with it via a random port
|
||||
bool start(common_params & params) {
|
||||
port = common_http_get_free_port();
|
||||
if (port <= 0) {
|
||||
fprintf(stderr, "failed to get a free port\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
is_alive.store(true, std::memory_order_release);
|
||||
|
||||
th = std::thread([&]() {
|
||||
common_params server_params = params; // copy
|
||||
server_params.port = port;
|
||||
// argc / argv are only used in router mode, we can skip them for now
|
||||
int res = llama_server(server_params, 0, nullptr);
|
||||
if (res != 0) {
|
||||
fprintf(stderr, "llama_server exited with code %d\n", res);
|
||||
}
|
||||
is_alive.store(false, std::memory_order_release);
|
||||
});
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string address() const {
|
||||
return "http://127.0.0.1:" + std::to_string(port);
|
||||
}
|
||||
|
||||
bool wait_ready(std::function<bool()> should_stop) {
|
||||
if (!alive()) {
|
||||
return false;
|
||||
}
|
||||
while (!should_stop()) {
|
||||
auto [cli, parts] = common_http_client(address());
|
||||
cli.set_connection_timeout(1, 0);
|
||||
auto res = cli.Get("/health");
|
||||
if (res) {
|
||||
if (res->status == 200) {
|
||||
return true;
|
||||
}
|
||||
// any other status means the server is up but not ready yet
|
||||
// (e.g. 503 while the model is still loading)
|
||||
}
|
||||
if (!alive()) {
|
||||
// in case server die permanently
|
||||
return false;
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(200));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool alive() const {
|
||||
return is_alive.load(std::memory_order_acquire);
|
||||
}
|
||||
};
|
||||
@@ -1,250 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
#include <string_view>
|
||||
|
||||
// TODO?: Make this reusable, enums, docs
|
||||
static const std::array<std::string_view, 8> cmds = {
|
||||
"/audio ",
|
||||
"/clear",
|
||||
"/exit",
|
||||
"/glob ",
|
||||
"/image ",
|
||||
"/read ",
|
||||
"/regen",
|
||||
"/video ",
|
||||
};
|
||||
|
||||
static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) {
|
||||
std::vector<std::pair<std::string, size_t>> matches;
|
||||
std::string cmd;
|
||||
|
||||
if (line.length() > 1 && line.front() == '/' && !std::any_of(cmds.begin(), cmds.end(), [line](std::string_view prefix) {
|
||||
return string_starts_with(line, prefix);
|
||||
})) {
|
||||
auto it = cmds.begin();
|
||||
|
||||
while ((it = std::find_if(it, cmds.end(), [line](std::string_view cmd_line) {
|
||||
return string_starts_with(cmd_line, line);
|
||||
})) != cmds.end()) {
|
||||
matches.emplace_back(*it, it->length());
|
||||
++it;
|
||||
}
|
||||
} else {
|
||||
auto it = std::find_if(cmds.begin(), cmds.end(), [line](std::string_view prefix) {
|
||||
return prefix.back() == ' ' && string_starts_with(line, prefix);
|
||||
});
|
||||
|
||||
if (it != cmds.end()) {
|
||||
cmd = *it;
|
||||
}
|
||||
}
|
||||
|
||||
if (!cmd.empty() && cmd != "/glob " && line.length() >= cmd.length() && cursor_byte_pos >= cmd.length()) {
|
||||
const std::string path_prefix = std::string(line.substr(cmd.length(), cursor_byte_pos - cmd.length()));
|
||||
const std::string path_postfix = std::string(line.substr(cursor_byte_pos));
|
||||
auto cur_dir = std::filesystem::current_path();
|
||||
std::string cur_dir_str = cur_dir.string();
|
||||
std::string expanded_prefix = path_prefix;
|
||||
|
||||
#if !defined(_WIN32)
|
||||
if (string_starts_with(path_prefix, '~')) {
|
||||
const char * home = std::getenv("HOME");
|
||||
if (home && home[0]) {
|
||||
expanded_prefix = home + path_prefix.substr(1);
|
||||
}
|
||||
}
|
||||
if (string_starts_with(expanded_prefix, '/')) {
|
||||
#else
|
||||
if (std::isalpha(expanded_prefix[0]) && expanded_prefix.find(':') == 1) {
|
||||
#endif
|
||||
cur_dir = std::filesystem::path(expanded_prefix).parent_path();
|
||||
cur_dir_str.clear();
|
||||
} else if (!path_prefix.empty()) {
|
||||
cur_dir /= std::filesystem::path(path_prefix).parent_path();
|
||||
}
|
||||
|
||||
std::error_code ec;
|
||||
for (const auto & entry : std::filesystem::directory_iterator(cur_dir, ec)) {
|
||||
if (ec) {
|
||||
break;
|
||||
}
|
||||
if (!entry.exists(ec)) {
|
||||
ec.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::string path_full = entry.path().string();
|
||||
std::string path_entry = !cur_dir_str.empty() && string_starts_with(path_full, cur_dir_str) ? path_full.substr(cur_dir_str.length() + 1) : path_full;
|
||||
|
||||
if (entry.is_directory(ec)) {
|
||||
path_entry.push_back(std::filesystem::path::preferred_separator);
|
||||
}
|
||||
|
||||
if (expanded_prefix.empty() || string_starts_with(path_entry, expanded_prefix)) {
|
||||
const std::string updated_line = cmd + path_entry;
|
||||
matches.emplace_back(updated_line + path_postfix, updated_line.length());
|
||||
}
|
||||
|
||||
if (ec) {
|
||||
ec.clear();
|
||||
}
|
||||
}
|
||||
|
||||
if (matches.empty()) {
|
||||
const std::string updated_line = cmd + path_prefix;
|
||||
matches.emplace_back(updated_line + path_postfix, updated_line.length());
|
||||
}
|
||||
|
||||
// Add the longest common prefix
|
||||
if (!expanded_prefix.empty() && matches.size() > 1) {
|
||||
const std::string_view match0(matches[0].first);
|
||||
const std::string_view match1(matches[1].first);
|
||||
auto it = std::mismatch(match0.begin(), match0.end(), match1.begin(), match1.end());
|
||||
size_t len = it.first - match0.begin();
|
||||
|
||||
for (size_t i = 2; i < matches.size(); ++i) {
|
||||
const std::string_view matchi(matches[i].first);
|
||||
auto cmp = std::mismatch(match0.begin(), match0.end(), matchi.begin(), matchi.end());
|
||||
len = std::min(len, static_cast<size_t>(cmp.first - match0.begin()));
|
||||
}
|
||||
|
||||
const std::string updated_line = std::string(match0.substr(0, len));
|
||||
matches.emplace_back(updated_line + path_postfix, updated_line.length());
|
||||
}
|
||||
|
||||
std::sort(matches.begin(), matches.end(), [](const auto & a, const auto & b) {
|
||||
return a.first.compare(0, a.second, b.first, 0, b.second) < 0;
|
||||
});
|
||||
}
|
||||
|
||||
return matches;
|
||||
}
|
||||
|
||||
// note: make this view implementation generic, so that we can move to TUI in the future if we want to
|
||||
namespace view {
|
||||
static void init(const common_params & params) {
|
||||
// TODO: avoid using atexit() here by making `console` a singleton
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
console::set_completion_callback(auto_completion_callback);
|
||||
}
|
||||
|
||||
struct spinner {
|
||||
spinner(const std::string & message) {
|
||||
if (!message.empty()) {
|
||||
console::log("%s ", message.c_str());
|
||||
}
|
||||
console::spinner::start();
|
||||
}
|
||||
~spinner() {
|
||||
console::spinner::stop();
|
||||
}
|
||||
};
|
||||
|
||||
struct user_turn {
|
||||
user_turn() {
|
||||
console::set_display(DISPLAY_TYPE_USER_INPUT);
|
||||
}
|
||||
~user_turn() {
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
}
|
||||
void echo(const std::string & buffer) {
|
||||
if (buffer.size() > 500) {
|
||||
console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str());
|
||||
} else {
|
||||
console::log("\n> %s\n", buffer.c_str());
|
||||
}
|
||||
}
|
||||
std::string read_input(bool multiline_input, const char * prompt = nullptr) {
|
||||
if (prompt) {
|
||||
console::log("%s", prompt);
|
||||
} else {
|
||||
console::log("\n> ");
|
||||
}
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
another_line = console::readline(line, multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
return buffer;
|
||||
}
|
||||
};
|
||||
|
||||
enum assistant_display_mode {
|
||||
ASSISTANT_DISPLAY_MODE_REASONING,
|
||||
ASSISTANT_DISPLAY_MODE_CONTENT,
|
||||
};
|
||||
struct assistant_turn {
|
||||
assistant_display_mode mode = ASSISTANT_DISPLAY_MODE_CONTENT;
|
||||
bool trailing_newline = true;
|
||||
bool is_inside_reasoning = false;
|
||||
assistant_turn() {
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
}
|
||||
~assistant_turn() {
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
add_newline_if_needed();
|
||||
}
|
||||
void push(assistant_display_mode m, const std::string & buffer) {
|
||||
if (m != mode) {
|
||||
add_newline_if_needed();
|
||||
switch (m) {
|
||||
case ASSISTANT_DISPLAY_MODE_CONTENT:
|
||||
{
|
||||
if (is_inside_reasoning) {
|
||||
console::log("[End thinking]\n\n");
|
||||
is_inside_reasoning = false;
|
||||
}
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
} break;
|
||||
case ASSISTANT_DISPLAY_MODE_REASONING:
|
||||
{
|
||||
console::set_display(DISPLAY_TYPE_REASONING);
|
||||
is_inside_reasoning = true;
|
||||
console::log("\n[Start thinking]\n\n");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
mode = m;
|
||||
if (buffer.empty()) {
|
||||
return;
|
||||
}
|
||||
trailing_newline = buffer.back() == '\n';
|
||||
console::log("%s", buffer.c_str());
|
||||
console::flush();
|
||||
}
|
||||
void add_newline_if_needed() {
|
||||
if (!trailing_newline) {
|
||||
console::log("\n");
|
||||
console::flush();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static void show_error(const std::string & title, const std::string & message = "") {
|
||||
console::spinner::stop();
|
||||
console::error("Error: %s\n", title.c_str());
|
||||
if (!message.empty()) {
|
||||
console::log("%s\n", message.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
static void show_message(const std::string & message) {
|
||||
console::log("%s\n", message.c_str());
|
||||
}
|
||||
|
||||
static void show_info(const std::string & message) {
|
||||
console::set_display(DISPLAY_TYPE_INFO);
|
||||
console::log("%s\n", message.c_str());
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
}
|
||||
}
|
||||
+624
-10
@@ -1,10 +1,20 @@
|
||||
#include "arg.h"
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "arg.h"
|
||||
#include "console.h"
|
||||
#include "fit.h"
|
||||
// #include "log.h"
|
||||
|
||||
#include "cli-context.h"
|
||||
#include "cli-view.h"
|
||||
#include "server-common.h"
|
||||
#include "server-context.h"
|
||||
#include "server-task.h"
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
#include <signal.h>
|
||||
|
||||
#if defined(_WIN32)
|
||||
@@ -15,19 +25,342 @@
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
const char * LLAMA_ASCII_LOGO = R"(
|
||||
▄▄ ▄▄
|
||||
██ ██
|
||||
██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄
|
||||
██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██
|
||||
██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
|
||||
██ ██
|
||||
▀▀ ▀▀
|
||||
)";
|
||||
|
||||
static std::atomic<bool> g_is_interrupted = false;
|
||||
static bool should_stop() {
|
||||
return g_is_interrupted.load();
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void signal_handler(int) {
|
||||
if (g_cli_interrupted.load()) {
|
||||
if (g_is_interrupted.load()) {
|
||||
// second Ctrl+C - exit immediately
|
||||
// make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock)
|
||||
fprintf(stdout, "\033[0m\n");
|
||||
fflush(stdout);
|
||||
std::exit(130);
|
||||
}
|
||||
g_cli_interrupted.store(true);
|
||||
g_is_interrupted.store(true);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct cli_context {
|
||||
server_context ctx_server;
|
||||
json messages = json::array();
|
||||
std::vector<raw_buffer> input_files;
|
||||
task_params defaults;
|
||||
bool verbose_prompt;
|
||||
|
||||
// thread for showing "loading" animation
|
||||
std::atomic<bool> loading_show;
|
||||
|
||||
cli_context(const common_params & params) {
|
||||
defaults.sampling = params.sampling;
|
||||
defaults.speculative = params.speculative;
|
||||
defaults.n_keep = params.n_keep;
|
||||
defaults.n_predict = params.n_predict;
|
||||
defaults.antiprompt = params.antiprompt;
|
||||
|
||||
defaults.stream = true; // make sure we always use streaming mode
|
||||
defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
|
||||
// defaults.return_progress = true; // TODO: show progress
|
||||
|
||||
verbose_prompt = params.verbose_prompt;
|
||||
}
|
||||
|
||||
std::string generate_completion(result_timings & out_timings) {
|
||||
server_response_reader rd = ctx_server.get_response_reader();
|
||||
auto chat_params = format_chat();
|
||||
{
|
||||
// TODO: reduce some copies here in the future
|
||||
server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
|
||||
task.id = rd.get_new_id();
|
||||
task.index = 0;
|
||||
task.params = defaults; // copy
|
||||
task.cli_prompt = chat_params.prompt; // copy
|
||||
task.cli_files = input_files; // copy
|
||||
task.cli = true;
|
||||
|
||||
// chat template settings
|
||||
task.params.chat_parser_params = common_chat_parser_params(chat_params);
|
||||
task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
if (!chat_params.parser.empty()) {
|
||||
task.params.chat_parser_params.parser.load(chat_params.parser);
|
||||
}
|
||||
|
||||
// Copy the preserved tokens into the sampling params
|
||||
const llama_vocab * vocab = llama_model_get_vocab(
|
||||
llama_get_model(ctx_server.get_llama_context()));
|
||||
for (const auto & token : chat_params.preserved_tokens) {
|
||||
auto ids = common_tokenize(vocab, token, false, true);
|
||||
if (ids.size() == 1) {
|
||||
task.params.sampling.preserved_tokens.insert(ids[0]);
|
||||
}
|
||||
}
|
||||
|
||||
// reasoning budget sampler
|
||||
if (!chat_params.thinking_end_tag.empty()) {
|
||||
task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens;
|
||||
task.params.sampling.generation_prompt = chat_params.generation_prompt;
|
||||
|
||||
if (!chat_params.thinking_start_tag.empty()) {
|
||||
task.params.sampling.reasoning_budget_start =
|
||||
common_tokenize(vocab, chat_params.thinking_start_tag, false, true);
|
||||
}
|
||||
task.params.sampling.reasoning_budget_end =
|
||||
common_tokenize(vocab, chat_params.thinking_end_tag, false, true);
|
||||
task.params.sampling.reasoning_budget_forced =
|
||||
common_tokenize(vocab, defaults.sampling.reasoning_budget_message + chat_params.thinking_end_tag, false, true);
|
||||
}
|
||||
|
||||
rd.post_task({std::move(task)});
|
||||
}
|
||||
|
||||
if (verbose_prompt) {
|
||||
console::set_display(DISPLAY_TYPE_PROMPT);
|
||||
console::log("%s\n\n", chat_params.prompt.c_str());
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
}
|
||||
|
||||
// wait for first result
|
||||
console::spinner::start();
|
||||
server_task_result_ptr result = rd.next(should_stop);
|
||||
|
||||
while (true) {
|
||||
auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
|
||||
if (res_partial && res_partial->is_begin) {
|
||||
// this is the "send 200 status to client" signal in streaming mode
|
||||
// skip, do not stop the spinner
|
||||
result = rd.next(should_stop);
|
||||
} else {
|
||||
console::spinner::stop();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::string curr_content;
|
||||
bool is_thinking = false;
|
||||
|
||||
while (result) {
|
||||
if (should_stop()) {
|
||||
break;
|
||||
}
|
||||
if (result->is_error()) {
|
||||
json err_data = result->to_json();
|
||||
if (err_data.contains("message")) {
|
||||
console::error("Error: %s\n", err_data["message"].get<std::string>().c_str());
|
||||
} else {
|
||||
console::error("Error: %s\n", err_data.dump().c_str());
|
||||
}
|
||||
return curr_content;
|
||||
}
|
||||
auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
|
||||
if (res_partial) {
|
||||
out_timings = std::move(res_partial->timings);
|
||||
for (const auto & diff : res_partial->oaicompat_msg_diffs) {
|
||||
if (!diff.content_delta.empty()) {
|
||||
if (is_thinking) {
|
||||
console::log("\n[End thinking]\n\n");
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
is_thinking = false;
|
||||
}
|
||||
curr_content += diff.content_delta;
|
||||
console::log("%s", diff.content_delta.c_str());
|
||||
console::flush();
|
||||
}
|
||||
if (!diff.reasoning_content_delta.empty()) {
|
||||
console::set_display(DISPLAY_TYPE_REASONING);
|
||||
if (!is_thinking) {
|
||||
console::log("[Start thinking]\n");
|
||||
}
|
||||
is_thinking = true;
|
||||
console::log("%s", diff.reasoning_content_delta.c_str());
|
||||
console::flush();
|
||||
}
|
||||
}
|
||||
}
|
||||
auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
|
||||
if (res_final) {
|
||||
out_timings = std::move(res_final->timings);
|
||||
break;
|
||||
}
|
||||
result = rd.next(should_stop);
|
||||
}
|
||||
g_is_interrupted.store(false);
|
||||
// server_response_reader automatically cancels pending tasks upon destruction
|
||||
return curr_content;
|
||||
}
|
||||
|
||||
// TODO: support remote files in the future (http, https, etc)
|
||||
std::string load_input_file(const std::string & fname, bool is_media) {
|
||||
std::ifstream file = fs_open_ifstream(fname, std::ios::binary);
|
||||
if (!file) {
|
||||
return "";
|
||||
}
|
||||
if (is_media) {
|
||||
raw_buffer buf;
|
||||
buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
input_files.push_back(std::move(buf));
|
||||
return get_media_marker();
|
||||
} else {
|
||||
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
return content;
|
||||
}
|
||||
}
|
||||
|
||||
common_chat_params format_chat() {
|
||||
auto meta = ctx_server.get_meta();
|
||||
auto & chat_params = meta.chat_params;
|
||||
|
||||
auto caps = common_chat_templates_get_caps(chat_params.tmpls.get());
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
|
||||
inputs.tools = {}; // TODO
|
||||
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
inputs.json_schema = ""; // TODO
|
||||
inputs.grammar = ""; // TODO
|
||||
inputs.use_jinja = chat_params.use_jinja;
|
||||
inputs.parallel_tool_calls = caps["supports_parallel_tool_calls"];
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
inputs.force_pure_content = chat_params.force_pure_content;
|
||||
inputs.enable_thinking = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false;
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
|
||||
}
|
||||
};
|
||||
|
||||
// TODO?: Make this reusable, enums, docs
|
||||
static const std::array<std::string_view, 8> cmds = {
|
||||
"/audio ",
|
||||
"/clear",
|
||||
"/exit",
|
||||
"/glob ",
|
||||
"/image ",
|
||||
"/read ",
|
||||
"/regen",
|
||||
"/video ",
|
||||
};
|
||||
|
||||
static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) {
|
||||
std::vector<std::pair<std::string, size_t>> matches;
|
||||
std::string cmd;
|
||||
|
||||
if (line.length() > 1 && line.front() == '/' && !std::any_of(cmds.begin(), cmds.end(), [line](std::string_view prefix) {
|
||||
return string_starts_with(line, prefix);
|
||||
})) {
|
||||
auto it = cmds.begin();
|
||||
|
||||
while ((it = std::find_if(it, cmds.end(), [line](std::string_view cmd_line) {
|
||||
return string_starts_with(cmd_line, line);
|
||||
})) != cmds.end()) {
|
||||
matches.emplace_back(*it, it->length());
|
||||
++it;
|
||||
}
|
||||
} else {
|
||||
auto it = std::find_if(cmds.begin(), cmds.end(), [line](std::string_view prefix) {
|
||||
return prefix.back() == ' ' && string_starts_with(line, prefix);
|
||||
});
|
||||
|
||||
if (it != cmds.end()) {
|
||||
cmd = *it;
|
||||
}
|
||||
}
|
||||
|
||||
if (!cmd.empty() && cmd != "/glob " && line.length() >= cmd.length() && cursor_byte_pos >= cmd.length()) {
|
||||
const std::string path_prefix = std::string(line.substr(cmd.length(), cursor_byte_pos - cmd.length()));
|
||||
const std::string path_postfix = std::string(line.substr(cursor_byte_pos));
|
||||
auto cur_dir = std::filesystem::current_path();
|
||||
std::string cur_dir_str = cur_dir.string();
|
||||
std::string expanded_prefix = path_prefix;
|
||||
|
||||
#if !defined(_WIN32)
|
||||
if (string_starts_with(path_prefix, '~')) {
|
||||
const char * home = std::getenv("HOME");
|
||||
if (home && home[0]) {
|
||||
expanded_prefix = home + path_prefix.substr(1);
|
||||
}
|
||||
}
|
||||
if (string_starts_with(expanded_prefix, '/')) {
|
||||
#else
|
||||
if (std::isalpha(expanded_prefix[0]) && expanded_prefix.find(':') == 1) {
|
||||
#endif
|
||||
cur_dir = std::filesystem::path(expanded_prefix).parent_path();
|
||||
cur_dir_str.clear();
|
||||
} else if (!path_prefix.empty()) {
|
||||
cur_dir /= std::filesystem::path(path_prefix).parent_path();
|
||||
}
|
||||
|
||||
std::error_code ec;
|
||||
for (const auto & entry : std::filesystem::directory_iterator(cur_dir, ec)) {
|
||||
if (ec) {
|
||||
break;
|
||||
}
|
||||
if (!entry.exists(ec)) {
|
||||
ec.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::string path_full = entry.path().string();
|
||||
std::string path_entry = !cur_dir_str.empty() && string_starts_with(path_full, cur_dir_str) ? path_full.substr(cur_dir_str.length() + 1) : path_full;
|
||||
|
||||
if (entry.is_directory(ec)) {
|
||||
path_entry.push_back(std::filesystem::path::preferred_separator);
|
||||
}
|
||||
|
||||
if (expanded_prefix.empty() || string_starts_with(path_entry, expanded_prefix)) {
|
||||
const std::string updated_line = cmd + path_entry;
|
||||
matches.emplace_back(updated_line + path_postfix, updated_line.length());
|
||||
}
|
||||
|
||||
if (ec) {
|
||||
ec.clear();
|
||||
}
|
||||
}
|
||||
|
||||
if (matches.empty()) {
|
||||
const std::string updated_line = cmd + path_prefix;
|
||||
matches.emplace_back(updated_line + path_postfix, updated_line.length());
|
||||
}
|
||||
|
||||
// Add the longest common prefix
|
||||
if (!expanded_prefix.empty() && matches.size() > 1) {
|
||||
const std::string_view match0(matches[0].first);
|
||||
const std::string_view match1(matches[1].first);
|
||||
auto it = std::mismatch(match0.begin(), match0.end(), match1.begin(), match1.end());
|
||||
size_t len = it.first - match0.begin();
|
||||
|
||||
for (size_t i = 2; i < matches.size(); ++i) {
|
||||
const std::string_view matchi(matches[i].first);
|
||||
auto cmp = std::mismatch(match0.begin(), match0.end(), matchi.begin(), matchi.end());
|
||||
len = std::min(len, static_cast<size_t>(cmp.first - match0.begin()));
|
||||
}
|
||||
|
||||
const std::string updated_line = std::string(match0.substr(0, len));
|
||||
matches.emplace_back(updated_line + path_postfix, updated_line.length());
|
||||
}
|
||||
|
||||
std::sort(matches.begin(), matches.end(), [](const auto & a, const auto & b) {
|
||||
return a.first.compare(0, a.second, b.first, 0, b.second) < 0;
|
||||
});
|
||||
}
|
||||
|
||||
return matches;
|
||||
}
|
||||
|
||||
static constexpr size_t FILE_GLOB_MAX_RESULTS = 100;
|
||||
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_cli(int argc, char ** argv);
|
||||
|
||||
@@ -42,6 +375,25 @@ int llama_cli(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// TODO: maybe support it later?
|
||||
if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) {
|
||||
console::error("--no-conversation is not supported by llama-cli\n");
|
||||
console::error("please use llama-completion instead\n");
|
||||
}
|
||||
|
||||
// struct that contains llama context and inference
|
||||
cli_context ctx_cli(params);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// TODO: avoid using atexit() here by making `console` a singleton
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
console::set_completion_callback(auto_completion_callback);
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
@@ -56,11 +408,273 @@ int llama_cli(int argc, char ** argv) {
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
cli_context ctx_cli(params);
|
||||
|
||||
if (!ctx_cli.init()) {
|
||||
console::log("\nLoading model... "); // followed by loading animation
|
||||
console::spinner::start();
|
||||
if (!ctx_cli.ctx_server.load_model(params)) {
|
||||
console::spinner::stop();
|
||||
console::error("\nFailed to load the model\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return ctx_cli.run();
|
||||
ctx_cli.defaults.sampling = params.sampling;
|
||||
|
||||
console::spinner::stop();
|
||||
console::log("\n");
|
||||
|
||||
std::thread inference_thread([&ctx_cli]() {
|
||||
ctx_cli.ctx_server.start_loop();
|
||||
});
|
||||
|
||||
auto inf = ctx_cli.ctx_server.get_meta();
|
||||
std::string modalities = "text";
|
||||
if (inf.has_inp_image) {
|
||||
modalities += ", vision";
|
||||
}
|
||||
if (inf.has_inp_audio) {
|
||||
modalities += ", audio";
|
||||
}
|
||||
|
||||
auto add_system_prompt = [&]() {
|
||||
if (!params.system_prompt.empty()) {
|
||||
ctx_cli.messages.push_back({
|
||||
{"role", "system"},
|
||||
{"content", params.system_prompt}
|
||||
});
|
||||
}
|
||||
};
|
||||
add_system_prompt();
|
||||
|
||||
console::log("\n");
|
||||
console::log("%s\n", LLAMA_ASCII_LOGO);
|
||||
console::log("build : %s\n", inf.build_info.c_str());
|
||||
console::log("model : %s\n", inf.model_name.c_str());
|
||||
console::log("modalities : %s\n", modalities.c_str());
|
||||
if (!params.system_prompt.empty()) {
|
||||
console::log("using custom system prompt\n");
|
||||
}
|
||||
console::log("\n");
|
||||
console::log("available commands:\n");
|
||||
console::log(" /exit or Ctrl+C stop or exit\n");
|
||||
console::log(" /regen regenerate the last response\n");
|
||||
console::log(" /clear clear the chat history\n");
|
||||
console::log(" /read <file> add a text file\n");
|
||||
console::log(" /glob <pattern> add text files using globbing pattern\n");
|
||||
if (inf.has_inp_image) {
|
||||
console::log(" /image <file> add an image file\n");
|
||||
}
|
||||
if (inf.has_inp_audio) {
|
||||
console::log(" /audio <file> add an audio file\n");
|
||||
}
|
||||
if (inf.has_inp_video) {
|
||||
console::log(" /video <file> add a video file\n");
|
||||
}
|
||||
console::log("\n");
|
||||
|
||||
// interactive loop
|
||||
std::string cur_msg;
|
||||
|
||||
auto add_text_file = [&](const std::string & fname) -> bool {
|
||||
std::string marker = ctx_cli.load_input_file(fname, false);
|
||||
if (marker.empty()) {
|
||||
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
|
||||
return false;
|
||||
}
|
||||
if (inf.fim_sep_token != LLAMA_TOKEN_NULL) {
|
||||
cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true);
|
||||
cur_msg += fname;
|
||||
cur_msg.push_back('\n');
|
||||
} else {
|
||||
cur_msg += "--- File: ";
|
||||
cur_msg += fname;
|
||||
cur_msg += " ---\n";
|
||||
}
|
||||
cur_msg += marker;
|
||||
console::log("Loaded text from '%s'\n", fname.c_str());
|
||||
return true;
|
||||
};
|
||||
|
||||
while (true) {
|
||||
std::string buffer;
|
||||
console::set_display(DISPLAY_TYPE_USER_INPUT);
|
||||
if (params.prompt.empty()) {
|
||||
console::log("\n> ");
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
} else {
|
||||
// process input prompt from args
|
||||
for (auto & fname : params.image) {
|
||||
std::string marker = ctx_cli.load_input_file(fname, true);
|
||||
if (marker.empty()) {
|
||||
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
|
||||
break;
|
||||
}
|
||||
console::log("Loaded media from '%s'\n", fname.c_str());
|
||||
cur_msg += marker;
|
||||
}
|
||||
buffer = params.prompt;
|
||||
if (buffer.size() > 500) {
|
||||
console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str());
|
||||
} else {
|
||||
console::log("\n> %s\n", buffer.c_str());
|
||||
}
|
||||
params.prompt.clear(); // only use it once
|
||||
}
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
console::log("\n");
|
||||
|
||||
if (should_stop()) {
|
||||
g_is_interrupted.store(false);
|
||||
break;
|
||||
}
|
||||
|
||||
// remove trailing newline
|
||||
if (!buffer.empty() &&buffer.back() == '\n') {
|
||||
buffer.pop_back();
|
||||
}
|
||||
|
||||
// skip empty messages
|
||||
if (buffer.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool add_user_msg = true;
|
||||
|
||||
// process commands
|
||||
if (string_starts_with(buffer, "/exit")) {
|
||||
break;
|
||||
} else if (string_starts_with(buffer, "/regen")) {
|
||||
if (ctx_cli.messages.size() >= 2) {
|
||||
size_t last_idx = ctx_cli.messages.size() - 1;
|
||||
ctx_cli.messages.erase(last_idx);
|
||||
add_user_msg = false;
|
||||
} else {
|
||||
console::error("No message to regenerate.\n");
|
||||
continue;
|
||||
}
|
||||
} else if (string_starts_with(buffer, "/clear")) {
|
||||
ctx_cli.messages.clear();
|
||||
add_system_prompt();
|
||||
|
||||
ctx_cli.input_files.clear();
|
||||
console::log("Chat history cleared.\n");
|
||||
continue;
|
||||
} else if (
|
||||
(string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
|
||||
(string_starts_with(buffer, "/audio ") && inf.has_inp_audio) ||
|
||||
(string_starts_with(buffer, "/video ") && inf.has_inp_video)) {
|
||||
// just in case (bad copy-paste for example), we strip all trailing/leading spaces
|
||||
std::string fname = string_strip(buffer.substr(7));
|
||||
std::string marker = ctx_cli.load_input_file(fname, true);
|
||||
if (marker.empty()) {
|
||||
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
|
||||
continue;
|
||||
}
|
||||
cur_msg += marker;
|
||||
console::log("Loaded media from '%s'\n", fname.c_str());
|
||||
continue;
|
||||
} else if (string_starts_with(buffer, "/read ")) {
|
||||
std::string fname = string_strip(buffer.substr(6));
|
||||
add_text_file(fname);
|
||||
continue;
|
||||
} else if (string_starts_with(buffer, "/glob ")) {
|
||||
std::error_code ec;
|
||||
size_t count = 0;
|
||||
auto curdir = std::filesystem::current_path();
|
||||
std::string pattern = string_strip(buffer.substr(6));
|
||||
std::filesystem::path rel_path;
|
||||
|
||||
auto startglob = pattern.find_first_of("![*?");
|
||||
if (startglob != std::string::npos && startglob != 0) {
|
||||
auto endpath = pattern.substr(0, startglob).find_last_of('/');
|
||||
if (endpath != std::string::npos) {
|
||||
std::string rel_pattern = pattern.substr(0, endpath);
|
||||
#if !defined(_WIN32)
|
||||
if (string_starts_with(rel_pattern, '~')) {
|
||||
const char * home = std::getenv("HOME");
|
||||
if (home && home[0]) {
|
||||
rel_pattern = home + rel_pattern.substr(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
rel_path = rel_pattern;
|
||||
pattern.erase(0, endpath + 1);
|
||||
curdir /= rel_path;
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & entry : std::filesystem::recursive_directory_iterator(curdir,
|
||||
std::filesystem::directory_options::skip_permission_denied, ec)) {
|
||||
if (!entry.is_regular_file()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string rel = std::filesystem::relative(entry.path(), curdir, ec).string();
|
||||
if (ec) {
|
||||
ec.clear();
|
||||
continue;
|
||||
}
|
||||
std::replace(rel.begin(), rel.end(), '\\', '/');
|
||||
|
||||
if (!glob_match(pattern, rel)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!add_text_file((rel_path / rel).string())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++count >= FILE_GLOB_MAX_RESULTS) {
|
||||
console::error("Maximum number of globbed files allowed (%zu) reached.\n", FILE_GLOB_MAX_RESULTS);
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
// not a command
|
||||
cur_msg += buffer;
|
||||
}
|
||||
|
||||
// generate response
|
||||
if (add_user_msg) {
|
||||
ctx_cli.messages.push_back({
|
||||
{"role", "user"},
|
||||
{"content", cur_msg}
|
||||
});
|
||||
cur_msg.clear();
|
||||
}
|
||||
result_timings timings;
|
||||
std::string assistant_content = ctx_cli.generate_completion(timings);
|
||||
ctx_cli.messages.push_back({
|
||||
{"role", "assistant"},
|
||||
{"content", assistant_content}
|
||||
});
|
||||
console::log("\n");
|
||||
|
||||
if (params.show_timings) {
|
||||
console::set_display(DISPLAY_TYPE_INFO);
|
||||
console::log("\n");
|
||||
console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second);
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
}
|
||||
|
||||
if (params.single_turn) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
|
||||
console::log("\nExiting...\n");
|
||||
ctx_cli.ctx_server.terminate();
|
||||
inference_thread.join();
|
||||
|
||||
// bump the log level to display timings
|
||||
common_log_set_verbosity_thold(LOG_LEVEL_INFO);
|
||||
common_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ its output, and holds them against the HF model's scores.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import unicodedata
|
||||
@@ -28,6 +29,12 @@ class ModelSpec:
|
||||
mmproj_arg: str
|
||||
model_default: str
|
||||
mmproj_default: str
|
||||
prompt: str = "Free OCR. "
|
||||
n_predict: int = 512
|
||||
n_ctx: int | None = None
|
||||
# Unlimited-OCR's "document parsing" prompt emits <|det|> grounding markup that
|
||||
# the HF reference strips in result.md; drop it before scoring to match.
|
||||
strip_grounding: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -63,6 +70,20 @@ MODELS = {
|
||||
model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
|
||||
mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
|
||||
),
|
||||
"unlimited": ModelSpec(
|
||||
key="unlimited", label="Unlimited-OCR",
|
||||
model_arg="--llama-model-unlimited", mmproj_arg="--mmproj-unlimited",
|
||||
model_default="gguf_models/baidu/unlimited-ocr-bf16.gguf",
|
||||
mmproj_default="gguf_models/baidu/mmproj-unlimited-ocr-bf16.gguf",
|
||||
# "Free OCR." immediately emits EOS on this checkpoint; the HF reference
|
||||
# (demo/unlimited_ocr_scores.py) uses "document parsing.", which grounds.
|
||||
prompt="document parsing.",
|
||||
# Grounding emits ~3x the tokens of plain OCR, so it needs a larger budget
|
||||
# and context to reach the article body the ground truth covers.
|
||||
n_predict=4096,
|
||||
n_ctx=16384,
|
||||
strip_grounding=True,
|
||||
),
|
||||
}
|
||||
|
||||
CASES = [
|
||||
@@ -82,9 +103,26 @@ CASES = [
|
||||
# is one pixel off and lands at ~0.69 instead.
|
||||
hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
|
||||
),
|
||||
TestCase(
|
||||
model_key="unlimited", label="single-view scan",
|
||||
image="tools/mtmd/test-1.jpeg",
|
||||
ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
|
||||
# HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth.
|
||||
# Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance.
|
||||
hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
GROUNDING_TAG_RE = re.compile(r"<\|(ref|det)\|>.*?<\|/\1\|>", re.DOTALL)
|
||||
|
||||
|
||||
def strip_grounding(text: str) -> str:
|
||||
"""Drop <|ref|>..<|/ref|> / <|det|>..<|/det|> grounding markup, matching the
|
||||
cleaned result.md the HF reference scores against."""
|
||||
return GROUNDING_TAG_RE.sub("", text)
|
||||
|
||||
|
||||
def arg_dest(flag: str) -> str:
|
||||
return flag.lstrip("-").replace("-", "_")
|
||||
|
||||
@@ -129,19 +167,19 @@ def compute_chrf(expected: str, ocr_out: str) -> float:
|
||||
return CHRF().sentence_score(ocr_out, [expected]).score
|
||||
|
||||
|
||||
def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
|
||||
def run_mtmd_cli(spec: "ModelSpec", model_path, mmproj_path, image_path, bin_path) -> str:
|
||||
"""Run mtmd-cli on the image and return its output."""
|
||||
cmd = [
|
||||
str(bin_path),
|
||||
"-m", str(model_path),
|
||||
"--mmproj", str(mmproj_path),
|
||||
"--image", str(image_path),
|
||||
"-p", "Free OCR. ",
|
||||
"-p", spec.prompt,
|
||||
"--chat-template", "deepseek-ocr",
|
||||
"--temp", "0",
|
||||
"--flash-attn", "off", # match the HF "eager" attention reference
|
||||
"--no-warmup",
|
||||
"-n", "512", # cap loops on hard images (KV would otherwise fill)
|
||||
"-n", str(spec.n_predict), # cap loops on hard images (KV would otherwise fill)
|
||||
# HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
|
||||
# Default DRY breakers include "\n", so they are cleared below.
|
||||
"--dry-multiplier", "0.8",
|
||||
@@ -150,6 +188,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
|
||||
"--dry-penalty-last-n", "-1",
|
||||
"--dry-sequence-breaker", "none",
|
||||
]
|
||||
if spec.n_ctx is not None:
|
||||
cmd += ["-c", str(spec.n_ctx)]
|
||||
logger.debug(f" command: {' '.join(cmd)}")
|
||||
|
||||
try:
|
||||
@@ -164,6 +204,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
|
||||
raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
|
||||
|
||||
output = result.stdout.decode("utf-8", errors="replace").strip()
|
||||
if spec.strip_grounding:
|
||||
output = strip_grounding(output)
|
||||
if not output:
|
||||
raise RuntimeError("llama-mtmd-cli produced no output on stdout")
|
||||
logger.info(f" output: {len(output)} chars")
|
||||
@@ -193,7 +235,7 @@ def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:
|
||||
|
||||
logger.info("")
|
||||
logger.info("=" * 60)
|
||||
logger.info("Free OCR evaluation:")
|
||||
logger.info("OCR evaluation:")
|
||||
logger.info("=" * 60)
|
||||
logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})")
|
||||
logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})")
|
||||
@@ -269,9 +311,9 @@ def main() -> int:
|
||||
expected = read_expected_text(ground_truth)
|
||||
logger.info(f" Image: {case.image}")
|
||||
logger.info(f" Expected text: {len(expected)} chars")
|
||||
logger.info(" Running llama.cpp 'Free OCR'")
|
||||
logger.info(f" Running llama.cpp prompt {model_spec.prompt!r}")
|
||||
try:
|
||||
ocr_out = run_mtmd_cli(model, mmproj, image, binary)
|
||||
ocr_out = run_mtmd_cli(model_spec, model, mmproj, image, binary)
|
||||
except RuntimeError as e:
|
||||
logger.error(f" Error: {e}")
|
||||
results[title] = False
|
||||
|
||||
@@ -40,6 +40,7 @@ struct debug_options {
|
||||
bool enable_reasoning = true;
|
||||
bool debug_jinja = false;
|
||||
bool force_tool_call = false;
|
||||
bool parallel_tool_calls = true;
|
||||
output_mode mode = output_mode::BOTH;
|
||||
input_message_type input_message = input_message_type::NONE;
|
||||
};
|
||||
@@ -87,6 +88,7 @@ static void print_usage(const char * program_name) {
|
||||
LOG_ERR("\nOptions:\n");
|
||||
LOG_ERR(" --no-tools Disable tool definitions\n");
|
||||
LOG_ERR(" --force-tool-call Set tool calls to forced\n");
|
||||
LOG_ERR(" --parallel-tool-calls=0|1 Set parallel_tool_calls (default: 1)\n");
|
||||
LOG_ERR(" --generation-prompt=0|1 Set add_generation_prompt (default: 1)\n");
|
||||
LOG_ERR(" --enable-reasoning=0|1 Enable reasoning parsing (default: 1)\n");
|
||||
LOG_ERR(" --output=MODE Output mode: analysis, template, both (default: both)\n");
|
||||
@@ -121,6 +123,8 @@ static bool parse_options(int argc, char ** argv, debug_options & opts) {
|
||||
opts.debug_jinja = true;
|
||||
} else if (arg == "--no-tools") {
|
||||
opts.with_tools = false;
|
||||
} else if (arg.rfind("--parallel-tool-calls=", 0) == 0) {
|
||||
opts.parallel_tool_calls = parse_bool_option(arg.substr(22));
|
||||
} else if (arg.rfind("--generation-prompt=", 0) == 0) {
|
||||
opts.generation_prompt = parse_bool_option(arg.substr(20));
|
||||
} else if (arg.rfind("--enable-reasoning=", 0) == 0) {
|
||||
@@ -349,7 +353,7 @@ static autoparser::generation_params prepare_params(const debug_options & opts,
|
||||
params.tools = json();
|
||||
params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
}
|
||||
params.parallel_tool_calls = false;
|
||||
params.parallel_tool_calls = opts.parallel_tool_calls;
|
||||
return params;
|
||||
}
|
||||
|
||||
|
||||
@@ -89,7 +89,9 @@ struct server_batch {
|
||||
}
|
||||
|
||||
~server_batch() {
|
||||
llama_batch_free(batch);
|
||||
if (batch.token != nullptr) {
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
}
|
||||
|
||||
void init(int32_t n_tokens_alloc) {
|
||||
@@ -1215,6 +1217,10 @@ private:
|
||||
cparams.ctx_other = ctx_tgt;
|
||||
|
||||
ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams));
|
||||
if (ctx_dft == nullptr) {
|
||||
SRV_ERR("%s", "failed to create draft context\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
params_base.speculative.draft.ctx_tgt = ctx_tgt;
|
||||
params_base.speculative.draft.ctx_dft = ctx_dft.get();
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#include "build-info.h"
|
||||
#include "preset.h"
|
||||
#include "download.h"
|
||||
#include "http.h"
|
||||
|
||||
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
|
||||
#include <sheredom/subprocess.h>
|
||||
@@ -26,7 +25,14 @@
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
|
||||
#ifndef _WIN32
|
||||
#ifdef _WIN32
|
||||
#include <winsock2.h>
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <unistd.h>
|
||||
extern char **environ;
|
||||
#endif
|
||||
|
||||
@@ -698,6 +704,66 @@ std::optional<server_model_meta> server_models::get_meta(const std::string & nam
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
static int get_free_port() {
|
||||
#ifdef _WIN32
|
||||
WSADATA wsaData;
|
||||
if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
|
||||
return -1;
|
||||
}
|
||||
typedef SOCKET native_socket_t;
|
||||
#define INVALID_SOCKET_VAL INVALID_SOCKET
|
||||
#define CLOSE_SOCKET(s) closesocket(s)
|
||||
#else
|
||||
typedef int native_socket_t;
|
||||
#define INVALID_SOCKET_VAL -1
|
||||
#define CLOSE_SOCKET(s) close(s)
|
||||
#endif
|
||||
|
||||
native_socket_t sock = socket(AF_INET, SOCK_STREAM, 0);
|
||||
if (sock == INVALID_SOCKET_VAL) {
|
||||
#ifdef _WIN32
|
||||
WSACleanup();
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct sockaddr_in serv_addr;
|
||||
std::memset(&serv_addr, 0, sizeof(serv_addr));
|
||||
serv_addr.sin_family = AF_INET;
|
||||
serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
|
||||
serv_addr.sin_port = htons(0);
|
||||
|
||||
if (bind(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) != 0) {
|
||||
CLOSE_SOCKET(sock);
|
||||
#ifdef _WIN32
|
||||
WSACleanup();
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
int namelen = sizeof(serv_addr);
|
||||
#else
|
||||
socklen_t namelen = sizeof(serv_addr);
|
||||
#endif
|
||||
if (getsockname(sock, (struct sockaddr*)&serv_addr, &namelen) != 0) {
|
||||
CLOSE_SOCKET(sock);
|
||||
#ifdef _WIN32
|
||||
WSACleanup();
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
|
||||
int port = ntohs(serv_addr.sin_port);
|
||||
|
||||
CLOSE_SOCKET(sock);
|
||||
#ifdef _WIN32
|
||||
WSACleanup();
|
||||
#endif
|
||||
|
||||
return port;
|
||||
}
|
||||
|
||||
// helper to convert vector<string> to char **
|
||||
// pointers are only valid as long as the original vector is valid
|
||||
static std::vector<char *> to_char_ptr_array(const std::vector<std::string> & vec) {
|
||||
@@ -801,7 +867,7 @@ void server_models::load(const std::string & name, const load_options & opts) {
|
||||
// prepare new instance info
|
||||
instance_t inst;
|
||||
inst.meta = meta;
|
||||
inst.meta.port = common_http_get_free_port();
|
||||
inst.meta.port = get_free_port();
|
||||
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
|
||||
inst.meta.loaded_info = json{};
|
||||
inst.meta.last_used = ggml_time_ms();
|
||||
|
||||
+16
-35
@@ -35,19 +35,6 @@ static inline void signal_handler(int signal) {
|
||||
shutdown_handler(signal);
|
||||
}
|
||||
|
||||
// satisfies -Wmissing-declarations (used by llama command)
|
||||
int llama_server(int argc, char ** argv);
|
||||
|
||||
// to be used via CLI (argc / argv are used by router mode only)
|
||||
int llama_server(common_params & params, int argc, char ** argv);
|
||||
void llama_server_terminate();
|
||||
void llama_server_terminate() {
|
||||
if (shutdown_handler) {
|
||||
shutdown_handler(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// wrapper function that handles exceptions and logs errors
|
||||
// this is to make sure handler_t never throws exceptions; instead, it returns an error response
|
||||
static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
|
||||
@@ -84,6 +71,9 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
|
||||
};
|
||||
}
|
||||
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_server(int argc, char ** argv);
|
||||
|
||||
int llama_server(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
@@ -99,14 +89,8 @@ int llama_server(int argc, char ** argv) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
return llama_server(params, argc, argv);
|
||||
}
|
||||
|
||||
int llama_server(common_params & params, int argc, char ** argv) {
|
||||
bool is_run_by_cli = (argv == nullptr);
|
||||
|
||||
// note: router mode also accepts -hf remote-preset, so we need to check that first
|
||||
if (!is_run_by_cli && !params.model.hf_repo.empty()) {
|
||||
if (!params.model.hf_repo.empty()) {
|
||||
try {
|
||||
common_params_handle_models_params handle_params;
|
||||
handle_params.preset_only = true;
|
||||
@@ -288,9 +272,8 @@ int llama_server(common_params & params, int argc, char ** argv) {
|
||||
|
||||
if (child.is_child() && child.get_mode() == SERVER_CHILD_MODE_DOWNLOAD) {
|
||||
return child.run_download(params);
|
||||
} else if (!is_router_server && !is_run_by_cli) {
|
||||
} else if (!is_router_server) {
|
||||
// single-model mode (NOT spawned by router)
|
||||
// if this is invoked by CLI, model downloading should be already handled
|
||||
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
|
||||
}
|
||||
|
||||
@@ -373,22 +356,20 @@ int llama_server(common_params & params, int argc, char ** argv) {
|
||||
};
|
||||
}
|
||||
|
||||
// register signal handler if not running by CLI
|
||||
if (!is_run_by_cli) {
|
||||
// TODO: refactor in common/console
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
sigaction(SIGTERM, &sigint_action, NULL);
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
sigaction(SIGTERM, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (is_router_server) {
|
||||
SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
|
||||
|
||||
+1
-2
@@ -28,10 +28,9 @@ vite.config.ts.timestamp-*
|
||||
# PWA Artifacts
|
||||
apple-splash-*.png
|
||||
apple-touch-icon-*.png
|
||||
favicon.ico
|
||||
favicon-dark.ico
|
||||
maskable-icon-*.png
|
||||
pwa-*.png
|
||||
static/favicon*
|
||||
|
||||
# Storybook
|
||||
*storybook.log
|
||||
|
||||
Generated
+7
-7
@@ -35,7 +35,7 @@
|
||||
"bits-ui": "2.18.1",
|
||||
"clsx": "2.1.1",
|
||||
"dexie": "4.4.3",
|
||||
"dompurify": "3.4.5",
|
||||
"dompurify": "3.4.11",
|
||||
"eslint": "9.39.4",
|
||||
"eslint-config-prettier": "10.1.8",
|
||||
"eslint-plugin-storybook": "10.4.2",
|
||||
@@ -8653,9 +8653,9 @@
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/dompurify": {
|
||||
"version": "3.4.5",
|
||||
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.5.tgz",
|
||||
"integrity": "sha512-OrwIBKsdNSVEeubdJ1HBv/wNENRM9ytAVCv7YXt//A3vPdVMNuACRqK9mXCGCBW2ln7BT/A4X0jXHo2Gu89miA==",
|
||||
"version": "3.4.11",
|
||||
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.11.tgz",
|
||||
"integrity": "sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==",
|
||||
"dev": true,
|
||||
"license": "(MPL-2.0 OR Apache-2.0)",
|
||||
"optionalDependencies": {
|
||||
@@ -10226,9 +10226,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/hono": {
|
||||
"version": "4.12.23",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.23.tgz",
|
||||
"integrity": "sha512-eIaZ9qDgu7XV0pxOCrg7/WhnQ6Ivm22UcxhXx/A3dcbqbbYgBEkc6e/J/s7j2tS96zoB0S9VBdLwQNCWwUo4LA==",
|
||||
"version": "4.12.26",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.26.tgz",
|
||||
"integrity": "sha512-uyZtpnYxM9CmQ7QsQknM4zN8EftNqhON1qYeIKM0Se67CCEe2c44xyGURwB0axX2fBDu1dqHrHAc1hmNT8ITkw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
|
||||
@@ -54,7 +54,7 @@
|
||||
"bits-ui": "2.18.1",
|
||||
"clsx": "2.1.1",
|
||||
"dexie": "4.4.3",
|
||||
"dompurify": "3.4.5",
|
||||
"dompurify": "3.4.11",
|
||||
"eslint": "9.39.4",
|
||||
"eslint-config-prettier": "10.1.8",
|
||||
"eslint-plugin-storybook": "10.4.2",
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
import { defineConfig } from '@vite-pwa/assets-generator/config';
|
||||
import { FAVICON_COLORS, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
|
||||
import { writeThemeFavicons } from './scripts/favicon-colorize';
|
||||
|
||||
writeThemeFavicons(FAVICON_COLORS.LIGHT, FAVICON_COLORS.DARK, {
|
||||
padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
|
||||
});
|
||||
|
||||
export default defineConfig({
|
||||
headLinkOptions: {
|
||||
@@ -7,7 +13,8 @@ export default defineConfig({
|
||||
preset: {
|
||||
transparent: {
|
||||
sizes: [],
|
||||
favicons: [[48, 'favicon-dark.ico']]
|
||||
favicons: [[48, 'favicon-dark.ico']],
|
||||
padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
|
||||
},
|
||||
maskable: {
|
||||
sizes: []
|
||||
|
||||
@@ -5,15 +5,32 @@ import {
|
||||
} from '@vite-pwa/assets-generator/config';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { resolve } from 'node:path';
|
||||
import { THEME_COLORS, PWA_GENERATOR_DEVICES, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
|
||||
import {
|
||||
THEME_COLORS,
|
||||
PWA_GENERATOR_DEVICES,
|
||||
PWA_ASSET_GENERATOR,
|
||||
FAVICON_COLORS
|
||||
} from './src/lib/constants/pwa';
|
||||
import { SplashOrientation } from './src/lib/enums/splash.enums';
|
||||
import { writeThemeFavicons } from './scripts/favicon-colorize';
|
||||
|
||||
writeThemeFavicons(FAVICON_COLORS.LIGHT, FAVICON_COLORS.DARK, {
|
||||
padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
|
||||
});
|
||||
|
||||
export default defineConfig({
|
||||
headLinkOptions: {
|
||||
preset: PWA_ASSET_GENERATOR.LINK_PRESET
|
||||
},
|
||||
preset: combinePresetAndAppleSplashScreens(
|
||||
minimal2023Preset,
|
||||
{
|
||||
...minimal2023Preset,
|
||||
// tiny margin so favicon.ico / pwa-*.png breathe inside the canvas
|
||||
transparent: {
|
||||
...minimal2023Preset.transparent,
|
||||
padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
|
||||
}
|
||||
},
|
||||
{
|
||||
padding: PWA_ASSET_GENERATOR.SPLASH_PADDING,
|
||||
resizeOptions: {
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
||||
import { dirname, resolve } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
const HERE = dirname(fileURLToPath(import.meta.url));
|
||||
const PROJECT_ROOT = resolve(HERE, '..');
|
||||
|
||||
const DEFAULT_LOGO = resolve(PROJECT_ROOT, 'src/lib/assets/logo.svg');
|
||||
const DEFAULT_OUT_DIR = resolve(PROJECT_ROOT, 'static');
|
||||
const DEFAULT_OUT_LIGHT = resolve(DEFAULT_OUT_DIR, 'favicon.svg');
|
||||
const DEFAULT_OUT_DARK = resolve(DEFAULT_OUT_DIR, 'favicon-dark.svg');
|
||||
|
||||
const CURRENT_COLOR = 'currentColor';
|
||||
|
||||
export interface ColorizedFavicon {
|
||||
light: string;
|
||||
dark: string;
|
||||
}
|
||||
|
||||
export interface WriteThemeFaviconsOptions {
|
||||
sourcePath?: string;
|
||||
lightOutPath?: string;
|
||||
darkOutPath?: string;
|
||||
/**
|
||||
* Fraction of the icon (0..1) to leave as an even margin on each side.
|
||||
* Applied by wrapping the inner content in a `<g transform="...">` so the
|
||||
* source `src/lib/assets/logo.svg` is not modified. Pass 0 to disable.
|
||||
*/
|
||||
padding?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace every `currentColor` occurrence in the SVG with the given color.
|
||||
* Pure: no filesystem access, so it is straightforward to unit-test.
|
||||
*/
|
||||
export function colorizeFaviconSvg(
|
||||
svg: string,
|
||||
lightColor: string,
|
||||
darkColor: string
|
||||
): ColorizedFavicon {
|
||||
return {
|
||||
light: svg.replaceAll(CURRENT_COLOR, lightColor),
|
||||
dark: svg.replaceAll(CURRENT_COLOR, darkColor)
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Shrink the inner SVG content uniformly and re-center it so `padding` (a
|
||||
* 0..1 fraction) is reserved as equal margin on each side. Returns the input
|
||||
* unchanged for non-positive padding, missing/invalid `viewBox`, or unexpected
|
||||
* markup so the caller always gets a renderable SVG.
|
||||
*/
|
||||
export function padFaviconSvg(svg: string, padding: number): string {
|
||||
if (!(padding > 0) || padding >= 1) return svg;
|
||||
|
||||
const viewBoxMatch = svg.match(/viewBox\s*=\s*["']([^"']+)["']/i);
|
||||
if (!viewBoxMatch) return svg;
|
||||
|
||||
const parts = viewBoxMatch[1]
|
||||
.trim()
|
||||
.split(/[\s,]+/)
|
||||
.map(Number);
|
||||
if (parts.length !== 4 || parts.some((n) => !Number.isFinite(n))) return svg;
|
||||
|
||||
const [, , width, height] = parts;
|
||||
if (width <= 0 || height <= 0) return svg;
|
||||
|
||||
const scale = 1 - padding;
|
||||
const translateX = (padding * width) / 2;
|
||||
const translateY = (padding * height) / 2;
|
||||
|
||||
const openTagStart = svg.search(/<svg\b/i);
|
||||
if (openTagStart === -1) return svg;
|
||||
const openTagEnd = svg.indexOf('>', openTagStart);
|
||||
if (openTagEnd === -1) return svg;
|
||||
const closeStart = svg.lastIndexOf('</svg');
|
||||
if (closeStart === -1 || closeStart <= openTagEnd) return svg;
|
||||
|
||||
const openTag = svg.slice(0, openTagEnd + 1);
|
||||
const inner = svg.slice(openTagEnd + 1, closeStart);
|
||||
const closeTag = svg.slice(closeStart);
|
||||
|
||||
const group = `<g transform="translate(${translateX} ${translateY}) scale(${scale})">`;
|
||||
return `${openTag}${group}${inner}</g>${closeTag}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read `src/lib/assets/logo.svg`, colorize it for both themes, and write
|
||||
* the results to the static directory so the PWA asset generator can consume
|
||||
* them. Paths can be overridden for tests.
|
||||
*/
|
||||
export function writeThemeFavicons(
|
||||
lightColor: string,
|
||||
darkColor: string,
|
||||
{
|
||||
sourcePath = DEFAULT_LOGO,
|
||||
lightOutPath = DEFAULT_OUT_LIGHT,
|
||||
darkOutPath = DEFAULT_OUT_DARK,
|
||||
padding = 0
|
||||
}: WriteThemeFaviconsOptions = {}
|
||||
): void {
|
||||
const source = readFileSync(sourcePath, 'utf-8');
|
||||
const { light, dark } = colorizeFaviconSvg(source, lightColor, darkColor);
|
||||
mkdirSync(dirname(lightOutPath), { recursive: true });
|
||||
writeFileSync(lightOutPath, padFaviconSvg(light, padding));
|
||||
writeFileSync(darkOutPath, padFaviconSvg(dark, padding));
|
||||
}
|
||||
@@ -48,6 +48,7 @@
|
||||
|
||||
--chat-form-area-height: 8rem;
|
||||
--chat-form-area-offset: 2rem;
|
||||
--chat-form-padding-top: 6rem;
|
||||
--max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem)));
|
||||
}
|
||||
|
||||
@@ -55,6 +56,7 @@
|
||||
:root {
|
||||
--chat-form-area-height: 24rem;
|
||||
--chat-form-area-offset: 12rem;
|
||||
--chat-form-padding-top: 6rem;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -141,7 +143,6 @@
|
||||
@apply bg-background text-foreground;
|
||||
scrollbar-width: thin;
|
||||
scrollbar-gutter: stable;
|
||||
overflow: hidden; /* Added due to Mermaid rendering somehow causing the double scrollbar */
|
||||
}
|
||||
|
||||
/* Global scrollbar styling - visible only on hover */
|
||||
@@ -193,3 +194,7 @@
|
||||
scrollbar-width: none;
|
||||
}
|
||||
}
|
||||
|
||||
.mermaidTooltip {
|
||||
display: none !important;
|
||||
}
|
||||
|
||||
@@ -10,9 +10,9 @@ import { isElementInViewport } from '$lib/utils/viewport';
|
||||
*/
|
||||
export function fadeInView(
|
||||
node: HTMLElement,
|
||||
options: { duration?: number; y?: number; skipIfVisible?: boolean } = {}
|
||||
options: { duration?: number; y?: number; delay?: number; skipIfVisible?: boolean } = {}
|
||||
) {
|
||||
const { duration = 300, y = 0, skipIfVisible = false } = options;
|
||||
const { duration = 300, y = 0, delay = 0, skipIfVisible = false } = options;
|
||||
|
||||
if (skipIfVisible && isElementInViewport(node)) {
|
||||
return;
|
||||
@@ -27,10 +27,12 @@ export function fadeInView(
|
||||
(entries) => {
|
||||
for (const entry of entries) {
|
||||
if (entry.isIntersecting) {
|
||||
requestAnimationFrame(() => {
|
||||
node.style.opacity = '1';
|
||||
node.style.transform = 'translateY(0)';
|
||||
});
|
||||
setTimeout(() => {
|
||||
requestAnimationFrame(() => {
|
||||
node.style.opacity = '1';
|
||||
node.style.transform = 'translateY(0)';
|
||||
});
|
||||
}, delay);
|
||||
observer.disconnect();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
<svg width="512" height="512" viewBox="0 0 512 512" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path d="M244.95 8C215.233 8 187.774 23.8591 172.923 49.5999L95.6009 183.625C60.2162 244.959 104.481 321.6 175.29 321.6H208L316.977 132.708C348.959 77.2719 308.95 8 244.95 8ZM208 321.6H351.947C415.982 321.6 456.013 390.91 424.013 446.377C409.155 472.132 381.681 488 351.947 488H271.29C200.481 488 156.216 411.359 191.601 350.026L208 321.6Z" fill="currentColor"/>
|
||||
<path d="M208 321.6H16L106.462 164.8L208 321.6Z" fill="currentColor"/>
|
||||
<path d="M388.923 8L208 321.6L253.6 8H388.923Z" fill="currentColor"/>
|
||||
<path d="M304 488H112L202.462 331.2L304 488Z" fill="currentColor"/>
|
||||
<path d="M496 321.6H208L419.399 454.4L496 321.6Z" fill="currentColor"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 771 B |
@@ -8,12 +8,13 @@
|
||||
ariaLabel?: string;
|
||||
class?: string;
|
||||
disabled?: boolean;
|
||||
href?: string;
|
||||
icon: Component;
|
||||
iconSize?: string;
|
||||
onclick: (e?: MouseEvent) => void;
|
||||
onclick?: (e?: MouseEvent) => void;
|
||||
size?: ButtonSize;
|
||||
stopPropagationOnClick?: boolean;
|
||||
tooltip: string;
|
||||
tooltip?: string;
|
||||
variant?: ButtonVariant;
|
||||
tooltipSide?: TooltipSide;
|
||||
}
|
||||
@@ -22,6 +23,7 @@
|
||||
icon,
|
||||
tooltip,
|
||||
variant = 'ghost',
|
||||
href = '',
|
||||
size = 'sm',
|
||||
class: className = '',
|
||||
disabled = false,
|
||||
@@ -31,34 +33,49 @@
|
||||
onclick,
|
||||
ariaLabel
|
||||
}: Props = $props();
|
||||
|
||||
let innerWidth = $state(0);
|
||||
const showTooltip = $derived(!!tooltip && innerWidth > 768);
|
||||
</script>
|
||||
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger>
|
||||
<!-- prevent another nested button element -->
|
||||
{#snippet child({ props })}
|
||||
<Button
|
||||
{...props}
|
||||
{variant}
|
||||
{size}
|
||||
{disabled}
|
||||
onclick={(e: MouseEvent) => {
|
||||
if (stopPropagationOnClick) e.stopPropagation();
|
||||
{#snippet button(props = {})}
|
||||
<Button
|
||||
{...props}
|
||||
{href}
|
||||
{variant}
|
||||
{size}
|
||||
{disabled}
|
||||
onclick={(e: MouseEvent) => {
|
||||
if (stopPropagationOnClick) e.stopPropagation();
|
||||
|
||||
onclick?.(e);
|
||||
}}
|
||||
class="h-6 w-6 p-0 {className} flex hover:bg-transparent data-[state=open]:bg-transparent!"
|
||||
aria-label={ariaLabel || tooltip}
|
||||
>
|
||||
{#if icon}
|
||||
{@const IconComponent = icon}
|
||||
<IconComponent class={iconSize} />
|
||||
{/if}
|
||||
</Button>
|
||||
{/snippet}
|
||||
</Tooltip.Trigger>
|
||||
onclick?.(e);
|
||||
}}
|
||||
class="h-6 w-6 p-0 {className} flex hover:bg-transparent data-[state=open]:bg-transparent!"
|
||||
aria-label={ariaLabel || tooltip}
|
||||
>
|
||||
{#if icon}
|
||||
{@const IconComponent = icon}
|
||||
|
||||
<Tooltip.Content side={tooltipSide}>
|
||||
<p>{tooltip}</p>
|
||||
</Tooltip.Content>
|
||||
</Tooltip.Root>
|
||||
<IconComponent class={iconSize} />
|
||||
{/if}
|
||||
</Button>
|
||||
{/snippet}
|
||||
|
||||
{#if showTooltip}
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger>
|
||||
<!-- prevent another nested button element -->
|
||||
{#snippet child({ props })}
|
||||
{@render button(props)}
|
||||
{/snippet}
|
||||
</Tooltip.Trigger>
|
||||
|
||||
<Tooltip.Content side={tooltipSide}>
|
||||
<p>{tooltip}</p>
|
||||
</Tooltip.Content>
|
||||
</Tooltip.Root>
|
||||
{:else}
|
||||
{@render button({ href })}
|
||||
{/if}
|
||||
|
||||
<svelte:window bind:innerWidth />
|
||||
|
||||
@@ -494,7 +494,7 @@
|
||||
/>
|
||||
|
||||
<div
|
||||
class="{INPUT_CLASSES} overflow-hidden rounded-3xl backdrop-blur-md {disabled
|
||||
class="{INPUT_CLASSES} overflow-hidden rounded-4xl md:rounded-3xl backdrop-blur-md {disabled
|
||||
? 'cursor-not-allowed opacity-60'
|
||||
: ''}"
|
||||
data-slot="input-area"
|
||||
@@ -510,7 +510,7 @@
|
||||
/>
|
||||
|
||||
<div
|
||||
class="flex-column relative min-h-[48px] items-center rounded-3xl py-2 pb-2.25 shadow-sm transition-all focus-within:shadow-md md:!py-3"
|
||||
class="flex-column relative min-h-12 items-center rounded-4xl md:rounded-3xl py-2 pb-2.25 shadow-sm transition-all focus-within:shadow-md md:py-3!"
|
||||
onpaste={handlePaste}
|
||||
>
|
||||
<ChatFormTextarea
|
||||
|
||||
+1
-1
@@ -15,7 +15,7 @@
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger class="w-full">
|
||||
<Button
|
||||
class="file-upload-button h-8 w-8 rounded-full p-0"
|
||||
class="file-upload-button md:h-8 md:w-8 h-9 w-9 rounded-full p-0"
|
||||
{disabled}
|
||||
{onclick}
|
||||
variant="secondary"
|
||||
|
||||
+16
-3
@@ -15,6 +15,7 @@
|
||||
import { McpLogo } from '$lib/components/app';
|
||||
import { PencilRuler, ChevronDown, ChevronRight } from '@lucide/svelte';
|
||||
import { HealthCheckStatus } from '$lib/enums';
|
||||
import { AttachmentAction } from '$lib/enums/attachment.enums';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
@@ -270,14 +271,22 @@
|
||||
</Collapsible.Root>
|
||||
{/if}
|
||||
|
||||
<button type="button" class={sheetItemClass} onclick={onSystemPromptClick}>
|
||||
<button
|
||||
type="button"
|
||||
class={sheetItemClass}
|
||||
onclick={() => attachmentMenu.callbacks[AttachmentAction.SYSTEM_PROMPT_CLICK]()}
|
||||
>
|
||||
<MessageSquare class="h-4 w-4 shrink-0" />
|
||||
|
||||
<span>System Message</span>
|
||||
</button>
|
||||
|
||||
{#if hasMcpPromptsSupport}
|
||||
<button type="button" class={sheetItemClass} onclick={onMcpPromptClick}>
|
||||
<button
|
||||
type="button"
|
||||
class={sheetItemClass}
|
||||
onclick={() => attachmentMenu.callbacks[AttachmentAction.MCP_PROMPT_CLICK]()}
|
||||
>
|
||||
<Zap class="h-4 w-4 shrink-0" />
|
||||
|
||||
<span>MCP Prompt</span>
|
||||
@@ -285,7 +294,11 @@
|
||||
{/if}
|
||||
|
||||
{#if hasMcpResourcesSupport}
|
||||
<button type="button" class={sheetItemClass} onclick={onMcpResourcesClick}>
|
||||
<button
|
||||
type="button"
|
||||
class={sheetItemClass}
|
||||
onclick={() => attachmentMenu.callbacks[AttachmentAction.MCP_RESOURCES_CLICK]()}
|
||||
>
|
||||
<FolderOpen class="h-4 w-4 shrink-0" />
|
||||
|
||||
<span>MCP Resources</span>
|
||||
|
||||
+1
@@ -42,6 +42,7 @@
|
||||
{hasMcpPromptsSupport}
|
||||
{hasMcpResourcesSupport}
|
||||
{onFileUpload}
|
||||
{onSystemPromptClick}
|
||||
{onMcpPromptClick}
|
||||
{onMcpResourcesClick}
|
||||
>
|
||||
|
||||
+1
-1
@@ -20,7 +20,7 @@
|
||||
type="submit"
|
||||
disabled={isDisabled}
|
||||
class={[
|
||||
'h-8 w-8 rounded-full p-0',
|
||||
'md:h-8 md:w-8 h-9 w-9 rounded-full p-0',
|
||||
showErrorState &&
|
||||
'bg-red-400/10 text-red-400 hover:bg-red-400/20 hover:text-red-400 disabled:opacity-100'
|
||||
]}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import { autoResizeTextarea } from '$lib/utils';
|
||||
import { onMount } from 'svelte';
|
||||
|
||||
@@ -37,7 +38,9 @@
|
||||
}
|
||||
|
||||
export function focus() {
|
||||
textareaElement?.focus();
|
||||
if (isMobile.current) return;
|
||||
|
||||
textareaElement?.focus({ preventScroll: true });
|
||||
}
|
||||
|
||||
export function resetHeight() {
|
||||
|
||||
@@ -231,7 +231,7 @@
|
||||
editedContent = message.content;
|
||||
}
|
||||
|
||||
textareaElement?.focus();
|
||||
textareaElement?.focus({ preventScroll: true });
|
||||
editedExtras = message.extra ? [...message.extra] : [];
|
||||
editedUploadedFiles = [];
|
||||
|
||||
@@ -324,7 +324,7 @@
|
||||
}
|
||||
</script>
|
||||
|
||||
<div use:fadeInView>
|
||||
<div use:fadeInView class="chat-message">
|
||||
{#if message.role === MessageRole.SYSTEM}
|
||||
<ChatMessageSystem
|
||||
bind:textareaElement
|
||||
|
||||
+72
-5
@@ -180,6 +180,9 @@
|
||||
|
||||
let displayedModel = $derived(message.model ?? null);
|
||||
|
||||
// model being switched to while it loads, so the selector bar tracks it
|
||||
let pendingModel = $state<string | null>(null);
|
||||
|
||||
let isCurrentlyLoading = $derived(isLoading());
|
||||
let isStreaming = $derived(isChatStreaming());
|
||||
let hasNoContent = $derived(!message?.content?.trim());
|
||||
@@ -207,6 +210,42 @@
|
||||
isLastAssistantMessage
|
||||
);
|
||||
|
||||
let assistantEl: HTMLDivElement | undefined = $state();
|
||||
let lastUserMessageHeight = $state(0);
|
||||
let assistantMarginTop = $state(0);
|
||||
|
||||
$effect(() => {
|
||||
if (!assistantEl) return;
|
||||
|
||||
assistantMarginTop = Math.round(parseFloat(getComputedStyle(assistantEl).marginTop));
|
||||
|
||||
const chatMessageEl = assistantEl.closest('.chat-message');
|
||||
const previousChatMessage = chatMessageEl?.previousElementSibling;
|
||||
const userMessageEl = previousChatMessage?.querySelector(
|
||||
'.chat-message-user'
|
||||
) as HTMLElement | null;
|
||||
|
||||
if (!userMessageEl) {
|
||||
lastUserMessageHeight = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
const updateHeight = () => {
|
||||
const rect = userMessageEl.getBoundingClientRect();
|
||||
const marginTop = Math.round(parseFloat(getComputedStyle(userMessageEl).marginTop));
|
||||
lastUserMessageHeight = Math.round(rect.height + marginTop);
|
||||
};
|
||||
|
||||
updateHeight();
|
||||
|
||||
const resizeObserver = new ResizeObserver(updateHeight);
|
||||
resizeObserver.observe(userMessageEl);
|
||||
|
||||
return () => {
|
||||
resizeObserver.disconnect();
|
||||
};
|
||||
});
|
||||
|
||||
function handleCopyModel() {
|
||||
void copyToClipboard(displayedModel ?? '');
|
||||
}
|
||||
@@ -219,12 +258,17 @@
|
||||
</script>
|
||||
|
||||
<div
|
||||
class="text-md group w-full leading-7.5 {className}"
|
||||
bind:this={assistantEl}
|
||||
class="chat-message-assistant text-md group w-full leading-7.5 {className}"
|
||||
style:--last-user-message-height={lastUserMessageHeight > 0
|
||||
? `${lastUserMessageHeight}px`
|
||||
: undefined}
|
||||
style:--assistant-margin-top={assistantMarginTop > 0 ? `${assistantMarginTop}px` : undefined}
|
||||
role="group"
|
||||
aria-label="Assistant message with actions"
|
||||
>
|
||||
{#if showProcessingInfoTop}
|
||||
<div class="mt-6 w-full max-w-[48rem]" in:fade>
|
||||
<div class="mt-6 w-full max-w-3xl" in:fade>
|
||||
<div class="processing-container">
|
||||
<span class="processing-text">
|
||||
{modelLoadingText ??
|
||||
@@ -257,7 +301,7 @@
|
||||
{/if}
|
||||
|
||||
{#if showProcessingInfoBottom}
|
||||
<div class="mt-4 w-full max-w-[48rem]" in:fade>
|
||||
<div class="mt-4 w-full max-w-3xl" in:fade>
|
||||
<div class="processing-container">
|
||||
<span class="processing-text">
|
||||
{modelLoadingText ??
|
||||
@@ -277,13 +321,19 @@
|
||||
>
|
||||
{#if isRouter}
|
||||
<ModelsSelectorDropdown
|
||||
currentModel={displayedModel}
|
||||
currentModel={pendingModel ?? displayedModel}
|
||||
disabled={isLoading()}
|
||||
onModelChange={async (modelId: string, modelName: string) => {
|
||||
const status = modelsStore.getModelStatus(modelId);
|
||||
|
||||
if (status !== ServerModelStatus.LOADED) {
|
||||
await modelsStore.loadModel(modelId);
|
||||
pendingModel = modelId;
|
||||
|
||||
try {
|
||||
await modelsStore.loadModel(modelId);
|
||||
} finally {
|
||||
pendingModel = null;
|
||||
}
|
||||
}
|
||||
|
||||
onRegenerate(modelName);
|
||||
@@ -351,6 +401,23 @@
|
||||
</div>
|
||||
|
||||
<style>
|
||||
:global(.chat-message):last-child .chat-message-assistant {
|
||||
--assistant-min-height-offset: calc(
|
||||
var(--last-user-message-height, 19rem) + var(--chat-form-height, 6rem) +
|
||||
var(--chat-form-bottom-position, 0.5rem) + var(--chat-form-padding-top, 6rem) +
|
||||
var(--assistant-margin-top, 3rem)
|
||||
);
|
||||
min-height: calc(100dvh - var(--assistant-min-height-offset));
|
||||
|
||||
@media (width > 768px) {
|
||||
--assistant-min-height-offset: calc(
|
||||
var(--last-user-message-height, 18rem) + var(--chat-form-height, 6rem) +
|
||||
var(--chat-form-bottom-position, 1rem) + var(--chat-form-padding-top, 6rem) +
|
||||
var(--assistant-margin-top, 3rem)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
.processing-container {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
|
||||
+1
-1
@@ -48,7 +48,7 @@
|
||||
|
||||
<div
|
||||
aria-label="User message with actions"
|
||||
class="group flex flex-col items-end gap-3 md:gap-2 {className}"
|
||||
class="chat-message-user group flex flex-col items-end gap-3 md:gap-2 {className}"
|
||||
role="group"
|
||||
>
|
||||
{#if editCtx.isEditing}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user