Compare commits

...

8 Commits

Author SHA1 Message Date
Jeffrey Morgan 4ba9d711ba metal: add neg operator (#13029) 2025-04-20 08:28:40 +03:00
bandoti 00137157fc Disable CI cross-compile builds (#13022) 2025-04-19 18:05:03 +02:00
Sigbjørn Skjæret fb28f4f80e gguf-py : fix upload python package workflow (#13020) 2025-04-19 16:26:38 +02:00
Xuan-Son Nguyen 37b9f0d29d clip : refactor, add image_manipulation and llava_uhd classes (#13011)
* clip : refactor, add `image_manipulation` and `llava_uhd`

* refactor llava-1.6 preprocessing

* simplify logic for llava-1.5

* missing include
2025-04-19 09:15:45 +02:00
Daniel Tang 6408210082 main : Fix Ctrl+D/newline handling (#12951)
This restores the behavior from #491. This does not affect Ctrl+D's ability to
terminate --multiline-input lines (#1040).

This also actually implements #587: "If the user wants the text to end in a
newline, this should be accomplished by explicitly adding a newline by using
\ followed by return, then returning control by pressing return again."

Fixes #12949
2025-04-18 22:02:55 +02:00
Chris Thompson aff9d107b0 gguf-py : GGUF Editor GUI - Python + Qt6 (#12930) 2025-04-18 20:30:41 +02:00
Xuan-Son Nguyen 35370ba945 server : use std::move whenever possible (#12936)
* server : use std::move whenever possible

* use r-value ref

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* make task creation scoped

* restore std::move

* fix task_id not set correctly

* apply changes from suggestion

Co-authored-by: ggerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-04-18 19:58:12 +02:00
Akarshan Biswas 8d66005763 SYCL: Refactor and enable FP16 in binary broadcast OPs (#12975)
* SYCL: refactor move to a separate file

* Fix binbcast

* Remove duplicates

* fix include formatting

* fix typo
2025-04-18 15:57:56 +02:00
19 changed files with 2659 additions and 951 deletions
+3 -2
View File
@@ -601,8 +601,9 @@ jobs:
-DGGML_SYCL_F16=ON
cmake --build build --config Release -j $(nproc)
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml
# Disabled for now due to sporadic issue syncing.
# build-linux-cross:
# uses: ./.github/workflows/build-linux-cross.yml
macOS-latest-cmake-ios:
runs-on: macos-latest
+464 -458
View File
File diff suppressed because it is too large Load Diff
+16 -5
View File
@@ -865,9 +865,22 @@ int main(int argc, char ** argv) {
console::set_display(console::reset);
display = true;
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
if (buffer.empty()) { // Ctrl+D on empty line exits
LOG("EOF by user\n");
break;
}
if (buffer.back() == '\n') {
// Implement #587:
// If the user wants the text to end in a newline,
// this should be accomplished by explicitly adding a newline by using \ followed by return,
// then returning control by pressing return again.
buffer.pop_back();
}
if (buffer.empty()) { // Enter key on empty line lets the user pass control back
LOG_DBG("empty line, passing control back\n");
} else { // Add tokens to embd only if the input buffer is non-empty
// append input suffix if any
if (!params.input_suffix.empty() && !params.conversation_mode) {
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
@@ -915,8 +928,6 @@ int main(int argc, char ** argv) {
n_remain -= line_inp.size();
LOG_DBG("n_remain: %d\n", n_remain);
} else {
LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
+132 -113
View File
@@ -1552,29 +1552,30 @@ struct server_queue {
std::condition_variable condition_tasks;
// callback functions
std::function<void(server_task)> callback_new_task;
std::function<void(void)> callback_update_slots;
std::function<void(server_task &&)> callback_new_task;
std::function<void(void)> callback_update_slots;
// Add a new task to the end of the queue
int post(server_task task, bool front = false) {
int post(server_task && task, bool front = false) {
std::unique_lock<std::mutex> lock(mutex_tasks);
GGML_ASSERT(task.id != -1);
// if this is cancel task make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
const int task_id = task.id;
QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
queue_tasks.push_back(std::move(task));
}
condition_tasks.notify_one();
return task.id;
return task_id;
}
// multi-task version of post()
int post(std::vector<server_task> & tasks, bool front = false) {
int post(std::vector<server_task> && tasks, bool front = false) {
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : tasks) {
if (task.id == -1) {
@@ -1596,7 +1597,7 @@ struct server_queue {
}
// Add a new task, but defer until one slot is available
void defer(server_task task) {
void defer(server_task && task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
QUE_DBG("defer task, id = %d\n", task.id);
queue_tasks_deferred.push_back(std::move(task));
@@ -1611,7 +1612,7 @@ struct server_queue {
}
// Register function to process a new task
void on_new_task(std::function<void(server_task)> callback) {
void on_new_task(std::function<void(server_task &&)> callback) {
callback_new_task = std::move(callback);
}
@@ -1660,7 +1661,7 @@ struct server_queue {
lock.unlock();
break;
}
server_task task = queue_tasks.front();
server_task task = std::move(queue_tasks.front());
queue_tasks.pop_front();
lock.unlock();
@@ -2004,7 +2005,7 @@ struct server_context {
slot.reset();
slots.push_back(slot);
slots.push_back(std::move(slot));
}
default_generation_settings_for_props = slots[0].to_json();
@@ -2105,7 +2106,7 @@ struct server_context {
return true;
}
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
bool launch_slot_with_task(server_slot & slot, server_task && task) {
slot.reset();
slot.id_task = task.id;
slot.index = task.index;
@@ -2113,10 +2114,10 @@ struct server_context {
slot.params = std::move(task.params);
slot.prompt_tokens = std::move(task.prompt_tokens);
if (!are_lora_equal(task.params.lora, slot.lora)) {
if (!are_lora_equal(slot.params.lora, slot.lora)) {
// if lora is changed, we cannot reuse cached tokens
slot.cache_tokens.clear();
slot.lora = task.params.lora;
slot.lora = slot.params.lora;
}
bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
@@ -2547,10 +2548,10 @@ struct server_context {
server_task task(SERVER_TASK_TYPE_CANCEL);
task.id_target = id_task;
queue_results.remove_waiting_task_id(id_task);
cancel_tasks.push_back(task);
cancel_tasks.push_back(std::move(task));
}
// push to beginning of the queue, so it has highest priority
queue_tasks.post(cancel_tasks, true);
queue_tasks.post(std::move(cancel_tasks), true);
}
// receive the results from task(s)
@@ -2637,7 +2638,7 @@ struct server_context {
// Functions to process the task
//
void process_single_task(server_task task) {
void process_single_task(server_task && task) {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
case SERVER_TASK_TYPE_INFILL:
@@ -2651,17 +2652,17 @@ struct server_context {
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
queue_tasks.defer(std::move(task));
break;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
queue_tasks.defer(std::move(task));
break;
}
if (!launch_slot_with_task(*slot, task)) {
if (!launch_slot_with_task(*slot, std::move(task))) {
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
break;
}
@@ -2740,7 +2741,7 @@ struct server_context {
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
queue_tasks.defer(std::move(task));
break;
}
@@ -2776,7 +2777,7 @@ struct server_context {
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
queue_tasks.defer(std::move(task));
break;
}
@@ -2819,7 +2820,7 @@ struct server_context {
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
queue_tasks.defer(std::move(task));
break;
}
@@ -2871,7 +2872,7 @@ struct server_context {
server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
task.id = queue_tasks.get_new_id();
queue_tasks.post(task);
queue_tasks.post(std::move(task));
}
// apply context-shift if needed
@@ -3633,14 +3634,17 @@ int main(int argc, char ** argv) {
}
// request slots data using task queue
server_task task(SERVER_TASK_TYPE_METRICS);
task.id = ctx_server.queue_tasks.get_new_id();
ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task, true); // high-priority task
int task_id = ctx_server.queue_tasks.get_new_id();
{
server_task task(SERVER_TASK_TYPE_METRICS);
task.id = task_id;
ctx_server.queue_results.add_waiting_task_id(task_id);
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
}
// get the result
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
ctx_server.queue_results.remove_waiting_task_id(task_id);
if (result->is_error()) {
res_error(res, result->to_json());
@@ -3669,16 +3673,17 @@ int main(int argc, char ** argv) {
}
// request slots data using task queue
server_task task(SERVER_TASK_TYPE_METRICS);
task.id = ctx_server.queue_tasks.get_new_id();
task.metrics_reset_bucket = true;
ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task, true); // high-priority task
int task_id = ctx_server.queue_tasks.get_new_id();
{
server_task task(SERVER_TASK_TYPE_METRICS);
task.id = task_id;
ctx_server.queue_results.add_waiting_task_id(task_id);
ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
}
// get the result
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
ctx_server.queue_results.remove_waiting_task_id(task_id);
if (result->is_error()) {
res_error(res, result->to_json());
@@ -3775,17 +3780,20 @@ int main(int argc, char ** argv) {
}
std::string filepath = params.slot_save_path + filename;
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
task.id = ctx_server.queue_tasks.get_new_id();
task.slot_action.slot_id = id_slot;
task.slot_action.filename = filename;
task.slot_action.filepath = filepath;
int task_id = ctx_server.queue_tasks.get_new_id();
{
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
task.id = task_id;
task.slot_action.slot_id = id_slot;
task.slot_action.filename = filename;
task.slot_action.filepath = filepath;
ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task);
ctx_server.queue_results.add_waiting_task_id(task_id);
ctx_server.queue_tasks.post(std::move(task));
}
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
ctx_server.queue_results.remove_waiting_task_id(task_id);
if (result->is_error()) {
res_error(res, result->to_json());
@@ -3804,17 +3812,20 @@ int main(int argc, char ** argv) {
}
std::string filepath = params.slot_save_path + filename;
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
task.id = ctx_server.queue_tasks.get_new_id();
task.slot_action.slot_id = id_slot;
task.slot_action.filename = filename;
task.slot_action.filepath = filepath;
int task_id = ctx_server.queue_tasks.get_new_id();
{
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
task.id = task_id;
task.slot_action.slot_id = id_slot;
task.slot_action.filename = filename;
task.slot_action.filepath = filepath;
ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task);
ctx_server.queue_results.add_waiting_task_id(task_id);
ctx_server.queue_tasks.post(std::move(task));
}
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
ctx_server.queue_results.remove_waiting_task_id(task_id);
if (result->is_error()) {
res_error(res, result->to_json());
@@ -3826,15 +3837,18 @@ int main(int argc, char ** argv) {
};
const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
task.id = ctx_server.queue_tasks.get_new_id();
task.slot_action.slot_id = id_slot;
int task_id = ctx_server.queue_tasks.get_new_id();
{
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
task.id = task_id;
task.slot_action.slot_id = id_slot;
ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task);
ctx_server.queue_results.add_waiting_task_id(task_id);
ctx_server.queue_tasks.post(std::move(task));
}
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
ctx_server.queue_results.remove_waiting_task_id(task_id);
if (result->is_error()) {
res_error(res, result->to_json());
@@ -3938,9 +3952,10 @@ int main(int argc, char ** argv) {
}
auto completion_id = gen_chatcmplid();
std::vector<server_task> tasks;
std::unordered_set<int> task_ids;
try {
std::vector<server_task> tasks;
const auto & prompt = data.at("prompt");
// TODO: this log can become very long, put it behind a flag or think about a more compact format
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
@@ -3955,9 +3970,9 @@ int main(int argc, char ** argv) {
task.prompt_tokens = std::move(tokenized_prompts[i]);
task.params = server_task::params_from_json_cmpl(
ctx_server.ctx,
ctx_server.params_base,
data);
ctx_server.ctx,
ctx_server.params_base,
data);
task.id_selected_slot = json_value(data, "id_slot", -1);
// OAI-compat
@@ -3965,18 +3980,18 @@ int main(int argc, char ** argv) {
task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl
tasks.push_back(task);
tasks.push_back(std::move(task));
}
task_ids = server_task::get_list_id(tasks);
ctx_server.queue_results.add_waiting_tasks(tasks);
ctx_server.queue_tasks.post(std::move(tasks));
} catch (const std::exception & e) {
res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
return;
}
ctx_server.queue_results.add_waiting_tasks(tasks);
ctx_server.queue_tasks.post(tasks);
bool stream = json_value(data, "stream", false);
const auto task_ids = server_task::get_list_id(tasks);
if (!stream) {
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
@@ -4268,6 +4283,7 @@ int main(int argc, char ** argv) {
// create and queue the task
json responses = json::array();
bool error = false;
std::unordered_set<int> task_ids;
{
std::vector<server_task> tasks;
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4280,28 +4296,27 @@ int main(int argc, char ** argv) {
// OAI-compat
task.params.oaicompat = oaicompat;
tasks.push_back(task);
tasks.push_back(std::move(task));
}
task_ids = server_task::get_list_id(tasks);
ctx_server.queue_results.add_waiting_tasks(tasks);
ctx_server.queue_tasks.post(tasks);
// get the result
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
for (auto & res : results) {
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
responses.push_back(res->to_json());
}
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
}, req.is_connection_closed);
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
ctx_server.queue_tasks.post(std::move(tasks));
}
// get the result
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
for (auto & res : results) {
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
responses.push_back(res->to_json());
}
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
}, req.is_connection_closed);
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
if (error) {
return;
}
@@ -4367,6 +4382,7 @@ int main(int argc, char ** argv) {
// create and queue the task
json responses = json::array();
bool error = false;
std::unordered_set<int> task_ids;
{
std::vector<server_task> tasks;
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
@@ -4376,26 +4392,24 @@ int main(int argc, char ** argv) {
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;
task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
tasks.push_back(task);
tasks.push_back(std::move(task));
}
task_ids = server_task::get_list_id(tasks);
ctx_server.queue_results.add_waiting_tasks(tasks);
ctx_server.queue_tasks.post(tasks);
// get the result
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
for (auto & res : results) {
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
responses.push_back(res->to_json());
}
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
}, req.is_connection_closed);
ctx_server.queue_tasks.post(std::move(tasks));
}
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
for (auto & res : results) {
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
responses.push_back(res->to_json());
}
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
}, req.is_connection_closed);
if (error) {
return;
}
@@ -4431,14 +4445,19 @@ int main(int argc, char ** argv) {
res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
return;
}
server_task task(SERVER_TASK_TYPE_SET_LORA);
task.id = ctx_server.queue_tasks.get_new_id();
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task);
server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
int task_id = ctx_server.queue_tasks.get_new_id();
{
server_task task(SERVER_TASK_TYPE_SET_LORA);
task.id = task_id;
task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
ctx_server.queue_results.add_waiting_task_id(task_id);
ctx_server.queue_tasks.post(std::move(task));
}
// get the result
server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
ctx_server.queue_results.remove_waiting_task_id(task_id);
if (result->is_error()) {
res_error(res, result->to_json());
@@ -4582,8 +4601,8 @@ int main(int argc, char ** argv) {
common_chat_templates_source(ctx_server.chat_templates.get()),
common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
ctx_server.process_single_task(task);
ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
ctx_server.process_single_task(std::move(task));
});
ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
+15
View File
@@ -481,6 +481,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_SQRT,
GGML_METAL_KERNEL_TYPE_SIN,
GGML_METAL_KERNEL_TYPE_COS,
GGML_METAL_KERNEL_TYPE_NEG,
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
@@ -1159,6 +1160,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1320,6 +1322,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_NEG:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
@@ -2010,6 +2013,18 @@ static void ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_NEG:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NEG].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
default:
{
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+7
View File
@@ -949,6 +949,13 @@ kernel void kernel_cos(
dst[tpig] = cos(src0[tpig]);
}
kernel void kernel_neg(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = -src0[tpig];
}
kernel void kernel_sum_rows(
device const float * src0,
device float * dst,
+1
View File
@@ -13,6 +13,7 @@
#ifndef GGML_SYCL_BACKEND_HPP
#define GGML_SYCL_BACKEND_HPP
#include "binbcast.hpp"
#include "concat.hpp"
#include "common.hpp"
#include "conv.hpp"
+350
View File
@@ -0,0 +1,350 @@
#include "binbcast.hpp"
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>
#include "ggml.h"
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1));
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
item_ct1.get_local_id(0)) /
ne3;
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
item_ct1.get_local_id(0)) %
ne3;
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
dst_t * dst_row = dst + i_dst;
for (int i0 = i0s; i0 < ne0;
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
}
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
const int i3 = i/(ne2*ne1*ne0);
const int i2 = (i/(ne1*ne0)) % ne2;
const int i1 = (i/ne0) % ne1;
const int i0 = i % ne0;
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
dst_t * dst_row = dst + i_dst;
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
template<float (*bin_op)(const float, const float)>
struct bin_bcast_sycl {
template <typename src0_t, typename src1_t, typename dst_t>
void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
const int64_t ne12, const int64_t ne13, const int64_t ne0, const int64_t ne1, const int64_t ne2,
const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
int nr0 = ne10 / ne0;
int nr1 = ne11/ne1;
int nr2 = ne12/ne2;
int nr3 = ne13/ne3;
int nr[4] = { nr0, nr1, nr2, nr3 };
// collapse dimensions until first broadcast dimension
int64_t cne[] = {ne0, ne1, ne2, ne3};
int64_t cne0[] = {ne00, ne01, ne02, ne03};
int64_t cne1[] = {ne10, ne11, ne12, ne13};
size_t cnb[] = {nb0, nb1, nb2, nb3};
size_t cnb0[] = {nb00, nb01, nb02, nb03};
size_t cnb1[] = {nb10, nb11, nb12, nb13};
auto collapse = [](int64_t cne[]) {
cne[0] *= cne[1];
cne[1] = cne[2];
cne[2] = cne[3];
cne[3] = 1;
};
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
cnb[1] *= cne[1];
cnb[2] *= cne[2];
cnb[3] *= cne[3];
};
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
for (int i = 0; i < 4; i++) {
if (nr[i] != 1) {
break;
}
if (i > 0) {
collapse_nb(cnb, cne);
collapse_nb(cnb0, cne0);
collapse_nb(cnb1, cne1);
collapse(cne);
collapse(cne0);
collapse(cne1);
}
}
}
{
int64_t ne0 = cne[0];
int64_t ne1 = cne[1];
int64_t ne2 = cne[2];
int64_t ne3 = cne[3];
int64_t ne10 = cne1[0];
int64_t ne11 = cne1[1];
int64_t ne12 = cne1[2];
int64_t ne13 = cne1[3];
size_t nb0 = cnb[0];
size_t nb1 = cnb[1];
size_t nb2 = cnb[2];
size_t nb3 = cnb[3];
size_t nb00 = cnb0[0];
size_t nb01 = cnb0[1];
size_t nb02 = cnb0[2];
size_t nb03 = cnb0[3];
size_t nb10 = cnb1[0];
size_t nb11 = cnb1[1];
size_t nb12 = cnb1[2];
size_t nb13 = cnb1[3];
size_t s0 = nb0 / sizeof(dst_t);
size_t s1 = nb1 / sizeof(dst_t);
size_t s2 = nb2 / sizeof(dst_t);
size_t s3 = nb3 / sizeof(dst_t);
size_t s10 = nb10 / sizeof(src1_t);
size_t s11 = nb11 / sizeof(src1_t);
size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t);
size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
GGML_UNUSED(s00);
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(s0 == 1);
GGML_ASSERT(s10 == 1);
const int block_size = 128;
int64_t hne0 = std::max(ne0/2LL, 1LL);
sycl::range<3> block_dims(1, 1, 1);
block_dims[2] = std::min<unsigned int>(hne0, block_size);
block_dims[1] = std::min<unsigned int>(
ne1, block_size / (unsigned int)block_dims[2]);
block_dims[0] = std::min(
std::min<unsigned int>(
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
(unsigned int)block_dims[1]),
64U);
sycl::range<3> block_nums(
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
(ne1 + block_dims[1] - 1) / block_dims[1],
(hne0 + block_dims[2] - 1) / block_dims[2]);
if (block_nums[0] > 65535) {
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
sycl::range<3>(1, 1, block_size),
sycl::range<3>(1, 1, block_size)),
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast_unravel<bin_op>(
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
s03, s11, s12, s13, item_ct1);
});
}
} else {
/*
DPCT1049:16: The work-group size passed to the SYCL kernel may
exceed the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if
needed.
*/
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
ne2, ne3, ne10, ne11, ne12, ne13,
s1, s2, s3, s01, s02, s03, s11, s12, s13,
item_ct1);
});
}
}
}
};
template <class op>
inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
ggml_tensor * dst) {
dpct::queue_ptr main_stream = ctx.stream();
GGML_TENSOR_BINARY_OP_LOCALS
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst),
main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ABORT("fatal error");
}
}
inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
}
inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
}
inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
}
inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, dst->src[0], dst->src[1], dst);
}
inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, dst->src[0], dst);
}
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_add(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_sub(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_mul(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_div(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_repeat(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
+39
View File
@@ -0,0 +1,39 @@
#ifndef GGML_SYCL_BINBCAST_HPP
#define GGML_SYCL_BINBCAST_HPP
#include "common.hpp"
static __dpct_inline__ float op_repeat(const float a, const float b) {
return b;
GGML_UNUSED(a);
}
static __dpct_inline__ float op_add(const float a, const float b) {
return a + b;
}
static __dpct_inline__ float op_sub(const float a, const float b) {
return a - b;
}
static __dpct_inline__ float op_mul(const float a, const float b) {
return a * b;
}
static __dpct_inline__ float op_div(const float a, const float b) {
return a / b;
}
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif //GGML_SYCL_BINBCAST_HPP
-281
View File
@@ -494,286 +494,5 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1));
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
item_ct1.get_local_id(0)) /
ne3;
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
item_ct1.get_local_id(0)) %
ne3;
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
dst_t * dst_row = dst + i_dst;
for (int i0 = i0s; i0 < ne0;
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
}
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
const int i3 = i/(ne2*ne1*ne0);
const int i2 = (i/(ne1*ne0)) % ne2;
const int i1 = (i/ne0) % ne1;
const int i0 = i % ne0;
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
dst_t * dst_row = dst + i_dst;
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
template<float (*bin_op)(const float, const float)>
struct bin_bcast_sycl {
template <typename src0_t, typename src1_t, typename dst_t>
void operator()(ggml_backend_sycl_context & ctx,
const struct ggml_tensor *src0,
const struct ggml_tensor *src1, struct ggml_tensor *dst,
const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
queue_ptr stream) {
GGML_TENSOR_BINARY_OP_LOCALS
int nr0 = ne10/ne0;
int nr1 = ne11/ne1;
int nr2 = ne12/ne2;
int nr3 = ne13/ne3;
int nr[4] = { nr0, nr1, nr2, nr3 };
// collapse dimensions until first broadcast dimension
int64_t cne[] = {ne0, ne1, ne2, ne3};
int64_t cne0[] = {ne00, ne01, ne02, ne03};
int64_t cne1[] = {ne10, ne11, ne12, ne13};
size_t cnb[] = {nb0, nb1, nb2, nb3};
size_t cnb0[] = {nb00, nb01, nb02, nb03};
size_t cnb1[] = {nb10, nb11, nb12, nb13};
auto collapse = [](int64_t cne[]) {
cne[0] *= cne[1];
cne[1] = cne[2];
cne[2] = cne[3];
cne[3] = 1;
};
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
cnb[1] *= cne[1];
cnb[2] *= cne[2];
cnb[3] *= cne[3];
};
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
for (int i = 0; i < 4; i++) {
if (nr[i] != 1) {
break;
}
if (i > 0) {
collapse_nb(cnb, cne);
collapse_nb(cnb0, cne0);
collapse_nb(cnb1, cne1);
collapse(cne);
collapse(cne0);
collapse(cne1);
}
}
}
{
int64_t ne0 = cne[0];
int64_t ne1 = cne[1];
int64_t ne2 = cne[2];
int64_t ne3 = cne[3];
int64_t ne10 = cne1[0];
int64_t ne11 = cne1[1];
int64_t ne12 = cne1[2];
int64_t ne13 = cne1[3];
size_t nb0 = cnb[0];
size_t nb1 = cnb[1];
size_t nb2 = cnb[2];
size_t nb3 = cnb[3];
size_t nb00 = cnb0[0];
size_t nb01 = cnb0[1];
size_t nb02 = cnb0[2];
size_t nb03 = cnb0[3];
size_t nb10 = cnb1[0];
size_t nb11 = cnb1[1];
size_t nb12 = cnb1[2];
size_t nb13 = cnb1[3];
size_t s0 = nb0 / sizeof(dst_t);
size_t s1 = nb1 / sizeof(dst_t);
size_t s2 = nb2 / sizeof(dst_t);
size_t s3 = nb3 / sizeof(dst_t);
size_t s10 = nb10 / sizeof(src1_t);
size_t s11 = nb11 / sizeof(src1_t);
size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t);
size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
GGML_UNUSED(s00);
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(s0 == 1);
GGML_ASSERT(s10 == 1);
const int block_size = 128;
int64_t hne0 = std::max(ne0/2LL, 1LL);
sycl::range<3> block_dims(1, 1, 1);
block_dims[2] = std::min<unsigned int>(hne0, block_size);
block_dims[1] = std::min<unsigned int>(
ne1, block_size / (unsigned int)block_dims[2]);
block_dims[0] = std::min(
std::min<unsigned int>(
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
(unsigned int)block_dims[1]),
64U);
sycl::range<3> block_nums(
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
(ne1 + block_dims[1] - 1) / block_dims[1],
(hne0 + block_dims[2] - 1) / block_dims[2]);
if (block_nums[0] > 65535) {
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
sycl::range<3>(1, 1, block_size),
sycl::range<3>(1, 1, block_size)),
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast_unravel<bin_op>(
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
s03, s11, s12, s13, item_ct1);
});
}
} else {
/*
DPCT1049:16: The work-group size passed to the SYCL kernel may
exceed the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if
needed.
*/
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
ne2, ne3, ne10, ne11, ne12, ne13,
s1, s2, s3, s01, s02, s03, s11, s12, s13,
item_ct1);
});
}
}
GGML_UNUSED(ctx);
}
};
template <class op>
inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst) {
dpct::queue_ptr main_stream = ctx.stream();
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
op()(ctx, src0, src1, dst, (const float *)src0->data, (const float *)src1->data, (float *)dst->data, main_stream);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data,
(sycl::half *)dst->data, main_stream);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data, (float *)dst->data,
main_stream);
} else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
op()(ctx, src0, src1, dst, (const int32_t *)src0->data, (const int32_t *)src1->data, (int32_t *)dst->data,
main_stream);
} else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
op()(ctx, src0, src1, dst, (const int16_t *)src0->data, (const int16_t *)src1->data, (int16_t *)dst->data,
main_stream);
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ABORT("fatal error");
}
}
bool gpu_has_xmx(sycl::device &dev);
#endif // GGML_SYCL_COMMON_HPP
-47
View File
@@ -1261,27 +1261,6 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
}
inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
}
inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
}
inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
}
inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, dst->src[0], dst->src[1], dst);
}
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
ggml_sycl_op_sqrt(ctx, dst);
@@ -1409,29 +1388,3 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_add(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_sub(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_mul(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_div(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
+1 -31
View File
@@ -10,27 +10,6 @@ T neg_infinity() {
return -std::numeric_limits<T>::infinity();
}
static __dpct_inline__ float op_repeat(const float a, const float b) {
return b;
GGML_UNUSED(a);
}
static __dpct_inline__ float op_add(const float a, const float b) {
return a + b;
}
static __dpct_inline__ float op_sub(const float a, const float b) {
return a - b;
}
static __dpct_inline__ float op_mul(const float a, const float b) {
return a * b;
}
static __dpct_inline__ float op_div(const float a, const float b) {
return a / b;
}
template<typename T>
struct typed_data {
const T * src;
@@ -87,14 +66,5 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
// ---------
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_ELEMENTWISE_HPP
+2 -13
View File
@@ -1967,11 +1967,6 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, dst->src[0], dst);
}
inline void ggml_sycl_op_mul_mat_sycl(
ggml_backend_sycl_context & ctx,
const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
@@ -2600,12 +2595,6 @@ catch (sycl::exception const &exc) {
}
static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_repeat(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_get_rows(ctx, dst);
@@ -3972,7 +3961,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_ARGMAX:
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_REPEAT:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
@@ -3982,7 +3970,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
return (op->src[0]->type == GGML_TYPE_F32);
case GGML_OP_REPEAT:
return true;
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:
+7
View File
@@ -11,6 +11,11 @@ as an example for its usage.
pip install gguf
```
Optionally, you can install gguf with the extra 'gui' to enable the visual GGUF editor.
```sh
pip install gguf[gui]
```
## API Examples/Simple Tools
[examples/writer.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
@@ -25,6 +30,8 @@ pip install gguf
[gguf/scripts/gguf_new_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
[gguf/scripts/gguf_editor_gui.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_editor_gui.py) — Allows for viewing, editing, adding, or removing metadata values within a GGUF file as well as viewing its tensors with a Qt interface.
## Development
Maintainers who participate in development of this package are advised to install it in editable mode:
+1
View File
@@ -4,3 +4,4 @@ from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
from .gguf_dump import main as gguf_dump_entrypoint
from .gguf_set_metadata import main as gguf_set_metadata_entrypoint
from .gguf_new_metadata import main as gguf_new_metadata_entrypoint
from .gguf_editor_gui import main as gguf_editor_gui_entrypoint
File diff suppressed because it is too large Load Diff
+6 -1
View File
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.16.0"
version = "0.16.2"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
@@ -23,10 +23,14 @@ numpy = ">=1.17"
tqdm = ">=4.27"
pyyaml = ">=5.1"
sentencepiece = ">=0.1.98,<=0.2.0"
PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true }
[tool.poetry.dev-dependencies]
pytest = "^5.2"
[tool.poetry.extras]
gui = ["PySide6"]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
@@ -36,3 +40,4 @@ gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
gguf-editor-gui = "gguf.scripts:gguf_editor_gui_entrypoint"
+2
View File
@@ -11,3 +11,5 @@
-r ./requirements-convert_legacy_llama.txt
-r ./requirements-convert_llama_ggml_to_gguf.txt
-r ./requirements-tool_bench.txt
-r ./requirements-gguf_editor_gui.txt
@@ -0,0 +1,3 @@
numpy~=1.26.4
PySide6~=6.9.0
gguf>=0.16.0