Compare commits

..

8 Commits

Author SHA1 Message Date
Georgi Gerganov 695ad752b2 metal : improve clarity (minor) (#10171) 2024-11-08 18:37:41 +02:00
Georgi Gerganov 841f27abdb metal : optimize FA kernels (#10171)
* ggml : add ggml_flash_attn_ext_get_prec

* metal : use F16 precision in FA kernels

ggml-ci

* metal : minor clean-up

* metal : compile-guard bf16 FA kernels

ggml-ci

* build : remove obsolete compile flag [no ci]

* metal : prevent int overflows [no ci]

* cuda : disable BF16 FA

ggml-ci

* metal : fix BF16 requirement for FA kernels

ggml-ci

* make : clean-up [no ci]
2024-11-08 13:47:22 +02:00
Jhen-Jie Hong d05b3127bd swift : exclude ggml-metal-embed.metal (#10211)
* llama.swift : exclude ggml-metal-embed.metal

* swift : exclude build/
2024-11-08 11:34:06 +02:00
Xuan Son Nguyen 76c6e7f105 server : minor UI fix (#10207) 2024-11-07 18:44:38 -04:00
Xuan Son Nguyen a71d81cf8c server : revamp chat UI with vuejs and daisyui (#10175)
* server : simple chat UI with vuejs and daisyui

* move old files to legacy folder

* embed deps into binary

* basic markdown support

* add conversation history, save to localStorage

* fix bg-base classes

* save theme preferences

* fix tests

* regenerate, edit, copy buttons

* small fixes

* docs: how to use legacy ui

* better error handling

* make CORS preflight more explicit

* add GET method for CORS

* fix tests

* clean up a bit

* better auto scroll

* small fixes

* use collapse-arrow

* fix closeAndSaveConfigDialog

* small fix

* remove console.log

* fix style for <pre> element

* lighter bubble color (less distract when reading)
2024-11-07 17:31:10 -04:00
Georgi Gerganov eec4d71737 scripts : add amx to sync-ggml.sh [no ci] 2024-11-07 23:11:36 +02:00
Georgi Gerganov 3b08828674 sync : ggml 2024-11-07 23:08:24 +02:00
Georgi Gerganov a2c6fd747c scripts : sync update 2024-11-07 23:07:55 +02:00
45 changed files with 29454 additions and 1837 deletions
+10
View File
@@ -24,6 +24,16 @@ insert_final_newline = unset
[examples/server/public/*]
indent_size = 2
[examples/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[examples/server/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab
+4 -13
View File
@@ -1455,22 +1455,13 @@ llama-server: \
examples/server/server.cpp \
examples/server/utils.hpp \
examples/server/httplib.h \
examples/server/colorthemes.css.hpp \
examples/server/style.css.hpp \
examples/server/theme-beeninorder.css.hpp \
examples/server/theme-ketivah.css.hpp \
examples/server/theme-mangotango.css.hpp \
examples/server/theme-playground.css.hpp \
examples/server/theme-polarnight.css.hpp \
examples/server/theme-snowstorm.css.hpp \
examples/server/index.html.hpp \
examples/server/index-new.html.hpp \
examples/server/index.js.hpp \
examples/server/completion.js.hpp \
examples/server/system-prompts.js.hpp \
examples/server/prompt-formats.js.hpp \
examples/server/json-schema-to-grammar.mjs.hpp \
examples/server/loading.html.hpp \
examples/server/deps_daisyui.min.css.hpp \
examples/server/deps_markdown-it.js.hpp \
examples/server/deps_tailwindcss.js.hpp \
examples/server/deps_vue.esm-browser.js.hpp \
common/json.hpp \
common/stb_image.h \
$(OBJ_ALL)
+3 -1
View File
@@ -61,13 +61,15 @@ let package = Package(
name: "llama",
path: ".",
exclude: [
"build",
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"Makefile"
"Makefile",
"ggml/src/ggml-metal-embed.metal"
],
sources: sources,
resources: resources,
+3
View File
@@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "bf16") {
return GGML_TYPE_BF16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
+4 -13
View File
@@ -15,22 +15,13 @@ set(TARGET_SRCS
httplib.h
)
set(PUBLIC_ASSETS
colorthemes.css
style.css
theme-beeninorder.css
theme-ketivah.css
theme-mangotango.css
theme-playground.css
theme-polarnight.css
theme-snowstorm.css
index.html
index-new.html
index.js
completion.js
system-prompts.js
prompt-formats.js
json-schema-to-grammar.mjs
loading.html
deps_daisyui.min.css
deps_markdown-it.js
deps_tailwindcss.js
deps_vue.esm-browser.js
)
foreach(asset ${PUBLIC_ASSETS})
+10
View File
@@ -928,6 +928,16 @@ Apart from error types supported by OAI, we also have custom types that are spec
}
```
### Legacy completion web UI
A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
For example:
```sh
./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy
```
### Extending or building alternative Web Front End
You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
+1 -1
View File
@@ -1,7 +1,7 @@
import * as readline from 'node:readline'
import { stdin, stdout } from 'node:process'
import { readFileSync } from 'node:fs'
import { SchemaConverter } from './public/json-schema-to-grammar.mjs'
import { SchemaConverter } from './public_legacy/json-schema-to-grammar.mjs'
const args = process.argv.slice(2);
const grammarJsonSchemaFile = args.find(
+17 -2
View File
@@ -6,5 +6,20 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public
echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
echo >> $PUBLIC/index.js # add newline
# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
echo >> $PUBLIC/deps_tailwindcss.js # add newline
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
echo >> $PUBLIC/deps_daisyui.min.css # add newline
curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
echo >> $PUBLIC/deps_markdown-it.js # add newline
ls -lah $PUBLIC
+25 -4
View File
@@ -1,12 +1,16 @@
const paramDefaults = {
stream: true,
n_predict: 500,
temperature: 0.2,
stop: ["</s>"]
};
let generation_settings = null;
export class CompletionError extends Error {
constructor(message, name, data) {
super(message);
this.name = name;
}
};
// Completes the prompt as a generator. Recommended for most use cases.
//
@@ -29,7 +33,7 @@ export async function* llama(prompt, params = {}, config = {}) {
const completionParams = { ...paramDefaults, ...params, prompt };
const response = await fetch(`${api_url}/completion`, {
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
method: 'POST',
body: JSON.stringify(completionParams),
headers: {
@@ -41,6 +45,18 @@ export async function* llama(prompt, params = {}, config = {}) {
signal: controller.signal,
});
const status = response.status;
if (status !== 200) {
try {
const body = await response.json();
if (body && body.error && body.error.message) {
throw new CompletionError(body.error.message, 'ServerError');
}
} catch (err) {
throw new CompletionError(err.message, 'ServerError');
}
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
@@ -78,7 +94,12 @@ export async function* llama(prompt, params = {}, config = {}) {
for (const line of lines) {
const match = regex.exec(line);
if (match) {
result[match[1]] = match[2]
result[match[1]] = match[2];
if (result.data === '[DONE]') {
cont = false;
break;
}
// since we know this is llama.cpp, let's just decode the json in data
if (result.data) {
result.data = JSON.parse(result.data);
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+209
View File
@@ -0,0 +1,209 @@
const paramDefaults = {
stream: true,
n_predict: 500,
temperature: 0.2,
stop: ["</s>"]
};
let generation_settings = null;
// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;
const api_url = config.api_url?.replace(/\/+$/, '') || "";
if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params, prompt };
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
method: 'POST',
body: JSON.stringify(completionParams),
headers: {
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Accept': 'text/event-stream',
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
},
signal: controller.signal,
});
const reader = response.body.getReader();
const decoder = new TextDecoder();
let content = "";
let leftover = ""; // Buffer for partially read lines
try {
let cont = true;
while (cont) {
const result = await reader.read();
if (result.done) {
break;
}
// Add any leftover data to the current chunk of data
const text = leftover + decoder.decode(result.value);
// Check if the last character is a line break
const endsWithLineBreak = text.endsWith('\n');
// Split the text into lines
let lines = text.split('\n');
// If the text doesn't end with a line break, then the last line is incomplete
// Store it in leftover to be added to the next chunk of data
if (!endsWithLineBreak) {
leftover = lines.pop();
} else {
leftover = ""; // Reset leftover if we have a line break at the end
}
// Parse all sse events and add them to result
const regex = /^(\S+):\s(.*)$/gm;
for (const line of lines) {
const match = regex.exec(line);
if (match) {
result[match[1]] = match[2];
if (result.data === '[DONE]') {
cont = false;
break;
}
// since we know this is llama.cpp, let's just decode the json in data
if (result.data) {
result.data = JSON.parse(result.data);
content += result.data.content;
// yield
yield result;
// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
cont = false;
break;
}
}
if (result.error) {
try {
result.error = JSON.parse(result.error);
if (result.error.message.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
}
} catch(e) {
console.error(`llama.cpp error ${result.error}`)
}
}
}
}
}
} catch (e) {
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
controller.abort();
}
return content;
}
// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}
// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};
/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}
// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async (config = {}) => {
if (!generation_settings) {
const api_url = config.api_url?.replace(/\/+$/, '') || "";
const props = await fetch(`${api_url}/props`).then(r => r.json());
generation_settings = props.default_generation_settings;
}
return generation_settings;
}

Before

Width:  |  Height:  |  Size: 4.0 KiB

After

Width:  |  Height:  |  Size: 4.0 KiB

File diff suppressed because it is too large Load Diff
@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="refresh" content="5">
</head>
<body>
<div id="loading">
The model is loading. Please wait.<br/>
The user interface will appear soon.
</div>
</body>
</html>
+23 -48
View File
@@ -14,22 +14,13 @@
#define MIMETYPE_JSON "application/json; charset=utf-8"
// auto generated files (update with ./deps.sh)
#include "colorthemes.css.hpp"
#include "style.css.hpp"
#include "theme-beeninorder.css.hpp"
#include "theme-ketivah.css.hpp"
#include "theme-mangotango.css.hpp"
#include "theme-playground.css.hpp"
#include "theme-polarnight.css.hpp"
#include "theme-snowstorm.css.hpp"
#include "index.html.hpp"
#include "index-new.html.hpp"
#include "index.js.hpp"
#include "completion.js.hpp"
#include "system-prompts.js.hpp"
#include "prompt-formats.js.hpp"
#include "json-schema-to-grammar.mjs.hpp"
#include "loading.html.hpp"
#include "deps_daisyui.min.css.hpp"
#include "deps_markdown-it.js.hpp"
#include "deps_tailwindcss.js.hpp"
#include "deps_vue.esm-browser.js.hpp"
#include <atomic>
#include <condition_variable>
@@ -2285,16 +2276,6 @@ int main(int argc, char ** argv) {
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
svr->set_default_headers({{"Server", "llama.cpp"}});
// CORS preflight
svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) {
// Access-Control-Allow-Origin is already set by middleware
res.set_header("Access-Control-Allow-Credentials", "true");
res.set_header("Access-Control-Allow-Methods", "POST");
res.set_header("Access-Control-Allow-Headers", "*");
return res.set_content("", "text/html"); // blank response, no data
});
svr->set_logger(log_server_request);
auto res_error = [](httplib::Response & res, const json & error_data) {
@@ -2407,6 +2388,14 @@ int main(int argc, char ** argv) {
// register server middlewares
svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
// If this is OPTIONS request, skip validation because browsers don't include Authorization header
if (req.method == "OPTIONS") {
res.set_header("Access-Control-Allow-Credentials", "true");
res.set_header("Access-Control-Allow-Methods", "GET, POST");
res.set_header("Access-Control-Allow-Headers", "*");
res.set_content("", "text/html"); // blank response, no data
return httplib::Server::HandlerResponse::Handled; // skip further processing
}
if (!middleware_server_state(req, res)) {
return httplib::Server::HandlerResponse::Handled;
}
@@ -3116,33 +3105,19 @@ int main(int argc, char ** argv) {
// register static assets routes
if (!params.public_path.empty()) {
// Set the base directory for serving static files
svr->set_base_dir(params.public_path);
}
if (!params.api_keys.empty()) {
// for now, if API key is set, web UI is unusable
svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
});
bool is_found = svr->set_mount_point("/", params.public_path);
if (!is_found) {
LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
return 1;
}
} else {
// using embedded static files
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
// add new-ui files
svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
svr->Get("/deps_daisyui.min.css", handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
svr->Get("/deps_markdown-it.js", handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
svr->Get("/deps_tailwindcss.js", handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
}
// register API routes
@@ -64,5 +64,5 @@ Feature: Security
| localhost | Access-Control-Allow-Origin | localhost |
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
| origin | Access-Control-Allow-Credentials | true |
| web.mydomain.fr | Access-Control-Allow-Methods | POST |
| web.mydomain.fr | Access-Control-Allow-Methods | GET, POST |
| web.mydomain.fr | Access-Control-Allow-Headers | * |
+3
View File
@@ -1746,6 +1746,9 @@ extern "C" {
struct ggml_tensor * a,
enum ggml_prec prec);
GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
const struct ggml_tensor * a);
// TODO: needs to be adapted to ggml_flash_attn_ext
GGML_API struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx,
+3
View File
@@ -3159,6 +3159,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
#ifndef FLASH_ATTN_AVAILABLE
return false;
#endif
if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
return false;
}
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
return true;
}
+5 -5
View File
@@ -13,9 +13,9 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[3];
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
if (precision != GGML_PREC_DEFAULT) {
if (prec != GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
constexpr int cols_per_block = 16;
switch (Q->ne[0]) {
@@ -301,11 +301,11 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int32_t precision = KQV->op_params[3];
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
// On AMD the tile kernels perform poorly, use the vec kernel instead:
if (cc >= CC_OFFSET_AMD) {
if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
} else {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
@@ -332,7 +332,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
}
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
if (precision == GGML_PREC_DEFAULT) {
if (prec == GGML_PREC_DEFAULT) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
return;
} else if(Q->ne[0] <= 128) {
+56 -18
View File
@@ -269,6 +269,12 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,
@@ -300,12 +306,14 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
@@ -585,6 +593,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \
GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
(int) kernel->pipeline.threadExecutionWidth); \
[metal_function release]; \
if (error) { \
GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
@@ -777,6 +788,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64, flash_attn_ext_bf16_h64, has_simdgroup_mm && has_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80, flash_attn_ext_bf16_h80, has_simdgroup_mm && has_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96, flash_attn_ext_bf16_h96, has_simdgroup_mm && has_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112, flash_attn_ext_bf16_h112, has_simdgroup_mm && has_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128, flash_attn_ext_bf16_h128, has_simdgroup_mm && has_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256, flash_attn_ext_bf16_h256, has_simdgroup_mm && has_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, has_simdgroup_mm);
@@ -808,12 +825,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, flash_attn_ext_q8_0_h128, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128, flash_attn_ext_vec_bf16_h128, has_simdgroup_reduction && has_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, flash_attn_ext_vec_q4_1_h128, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, flash_attn_ext_vec_q5_0_h128, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, flash_attn_ext_vec_q5_1_h128, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, flash_attn_ext_vec_q8_0_h128, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256, flash_attn_ext_vec_bf16_h256, has_simdgroup_reduction && has_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, flash_attn_ext_vec_q4_0_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, flash_attn_ext_vec_q4_1_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction);
@@ -1111,7 +1130,7 @@ static void ggml_metal_encode_node(
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
const int64_t ne0 = dst ? dst->ne[0] : 0;
const int64_t ne1 = dst ? dst->ne[1] : 0;
@@ -3033,6 +3052,23 @@ static void ggml_metal_encode_node(
}
}
} break;
case GGML_TYPE_BF16:
{
switch (ne00) {
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64 ].pipeline; break;
case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80 ].pipeline; break;
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96 ].pipeline; break;
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112].pipeline; break;
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128].pipeline; break;
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256].pipeline; break;
default:
{
GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_LOG_ERROR("add template specialization for this size\n");
GGML_ABORT("add template specialization for this size");
}
}
} break;
case GGML_TYPE_Q4_0:
{
switch (ne00) {
@@ -3133,6 +3169,7 @@ static void ggml_metal_encode_node(
{
switch (src1->type) {
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128].pipeline; break;
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128].pipeline; break;
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128].pipeline; break;
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128].pipeline; break;
@@ -3150,6 +3187,7 @@ static void ggml_metal_encode_node(
{
switch (src1->type) {
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256].pipeline; break;
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256].pipeline; break;
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256].pipeline; break;
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256].pipeline; break;
@@ -3194,18 +3232,15 @@ static void ggml_metal_encode_node(
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
[encoder setBytes:&scale length:sizeof( float) atIndex:23];
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
[encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:18];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:19];
[encoder setBytes:&scale length:sizeof( float) atIndex:20];
[encoder setBytes:&max_bias length:sizeof( float) atIndex:21];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:22];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:23];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:24];
[encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:25];
if (!use_vec_kernel) {
// half8x8 kernel
@@ -3216,11 +3251,14 @@ static void ggml_metal_encode_node(
GGML_ASSERT(nqptg % 8 == 0);
GGML_ASSERT(ncpsg % 32 == 0);
// 2*(2*ncpsg + nqptg)*(nsg)
// ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
//
// 16*32*(nsg)
// the shared memory needed for the simdgroups to load the KV cache
// each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG
//
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
int64_t nsgmax = 2;
@@ -3254,12 +3292,12 @@ static void ggml_metal_encode_node(
// ne00 + 2*ncpsg*(nsg)
// for each query, we load it as f16 in shared memory (ne00)
// and store the attention scores (nqptg x ncpsg) as f32
// and store the soft_max values and the mask
//
// 2*ne00*(nsg)
// each simdgroup has a full f32 head vector in shared mem to accumulate results
// ne00*(nsg)
// each simdgroup has a full f16 head vector in shared mem to accumulate results
//
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + 2*ne00*(nsg))*(sizeof(float)/2), 16))
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + ne00*(nsg))*(sizeof(float)/2), 16))
int64_t nsgmax = 2;
+455 -338
View File
File diff suppressed because it is too large Load Diff
+9
View File
@@ -4228,6 +4228,15 @@ void ggml_flash_attn_ext_set_prec(
ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second
}
enum ggml_prec ggml_flash_attn_ext_get_prec(
const struct ggml_tensor * a) {
GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
const int32_t prec_i32 = ggml_get_op_params_i32(a, 3);
return (enum ggml_prec) prec_i32;
}
// ggml_flash_attn_back
struct ggml_tensor * ggml_flash_attn_back(
+1 -1
View File
@@ -124,7 +124,7 @@ You can use GBNF grammars:
- In [llama-cli](../examples/main), passed as the `--json` / `-j` flag
- To convert to a grammar ahead of time:
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+20 -68
View File
@@ -114,46 +114,22 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# replace filenames:
#
# CMakelists.txt -> ggml/CMakeLists.txt
# src/CMakeLists.txt -> ggml/src/CMakeLists.txt
# cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
# CMakelists.txt -> ggml/CMakeLists.txt
# src/CMakeLists.txt -> ggml/src/CMakeLists.txt
# cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
#
# src/ggml.c -> ggml/src/ggml.c
# src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c
# src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h
# src/ggml-alloc.c -> ggml/src/ggml-alloc.c
# src/ggml-amx/* -> ggml/src/ggml-amx/
# src/ggml-amx.cpp -> ggml/src/ggml-amx.cpp
# src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
# src/ggml-backend.cpp -> ggml/src/ggml-backend.cpp
# src/ggml-cann/* -> ggml/src/ggml-cann/
# src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp
# src/ggml-common.h -> ggml/src/ggml-common.h
# src/ggml-cuda/* -> ggml/src/ggml-cuda/
# src/ggml-cuda.cu -> ggml/src/ggml-cuda.cu
# src/ggml-impl.h -> ggml/src/ggml-impl.h
# src/ggml-kompute.cpp -> ggml/src/ggml-kompute.cpp
# src/ggml-metal.m -> ggml/src/ggml-metal.m
# src/ggml-quants.c -> ggml/src/ggml-quants.c
# src/ggml-quants.h -> ggml/src/ggml-quants.h
# src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp
# src/ggml-sycl/* -> ggml/src/ggml-sycl/
# src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp
# src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp
# src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
# src/ggml*.c -> ggml/src/ggml*.c
# src/ggml*.cpp -> ggml/src/ggml*.cpp
# src/ggml*.h -> ggml/src/ggml*.h
# src/ggml*.cu -> ggml/src/ggml*.cu
# src/ggml*.m -> ggml/src/ggml*.m
# src/ggml-amx/* -> ggml/src/ggml-amx/
# src/ggml-cann/* -> ggml/src/ggml-cann/
# src/ggml-cuda/* -> ggml/src/ggml-cuda/
# src/ggml-sycl/* -> ggml/src/ggml-sycl/
# src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
#
# include/ggml.h -> ggml/include/ggml.h
# include/ggml-alloc.h -> ggml/include/ggml-alloc.h
# include/ggml-amx.h -> ggml/include/ggml-amx.h
# include/ggml-backend.h -> ggml/include/ggml-backend.h
# include/ggml-blas.h -> ggml/include/ggml-blas.h
# include/ggml-cann.h -> ggml/include/ggml-cann.h
# include/ggml-cuda.h -> ggml/include/ggml-cuda.h
# include/ggml-kompute.h -> ggml/include/ggml-kompute.h
# include/ggml-metal.h -> ggml/include/ggml-metal.h
# include/ggml-rpc.h -> ggml/include/ggml-rpc.h
# include/ggml-sycl.h -> ggml/include/ggml-sycl.h
# include/ggml-vulkan.h -> ggml/include/ggml-vulkan.h
# include/ggml*.h -> ggml/include/ggml*.h
#
# tests/test-opt.cpp -> tests/test-opt.cpp
# tests/test-grad0.cpp -> tests/test-grad0.cpp
@@ -168,41 +144,17 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
-e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
-e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\1.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\1.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\1.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\1.cu/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\1.m/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-amx\.cpp/\1ggml\/src\/ggml-amx.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-amx\.h/\1ggml\/include\/ggml-amx.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\1.h/g' \
-e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
-e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
-e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \
+1 -1
View File
@@ -1 +1 @@
a099cb514d6687e436a5a423d1fb0448be0feb20
89952d649e0c5cabbb9ff8c4906f5a843a789fb2
+11 -36
View File
@@ -4,43 +4,18 @@ cp -rpv ../ggml/CMakeLists.txt ./ggml/CMakeLists.txt
cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt
cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c
cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c
cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h
cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c
cp -rpv ../ggml/src/ggml-amx/* ./ggml/src/ggml-amx/
cp -rpv ../ggml/src/ggml-amx.cpp ./ggml/src/ggml-amx.cpp
cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
cp -rpv ../ggml/src/ggml-backend.cpp ./ggml/src/ggml-backend.cpp
cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/
cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp
cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h
cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml/src/ggml-cuda.cu
cp -rpv ../ggml/src/ggml-impl.h ./ggml/src/ggml-impl.h
cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml/src/ggml-kompute.cpp
cp -rpv ../ggml/src/ggml-metal.m ./ggml/src/ggml-metal.m
cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal
cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c
cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp
cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp
cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
cp -rpv ../ggml/src/ggml*.c ./ggml/src/
cp -rpv ../ggml/src/ggml*.cpp ./ggml/src/
cp -rpv ../ggml/src/ggml*.h ./ggml/src/
cp -rpv ../ggml/src/ggml*.cu ./ggml/src/
cp -rpv ../ggml/src/ggml*.m ./ggml/src/
cp -rpv ../ggml/src/ggml-amx/* ./ggml/src/ggml-amx/
cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/
cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/
cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/
cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h
cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h
cp -rpv ../ggml/include/ggml-amx.h ./ggml/include/ggml-amx.h
cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h
cp -rpv ../ggml/include/ggml-blas.h ./ggml/include/ggml-blas.h
cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h
cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h
cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h
cp -rpv ../ggml/include/ggml-rpc.h ./ggml/include/ggml-rpc.h
cp -rpv ../ggml/include/ggml-sycl.h ./ggml/include/ggml-sycl.h
cp -rpv ../ggml/include/ggml-vulkan.h ./ggml/include/ggml-vulkan.h
cp -rpv ../ggml/include/ggml*.h ./ggml/include/
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
+1 -1
View File
@@ -1,5 +1,5 @@
import { readFileSync } from "fs"
import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
import { SchemaConverter } from "../examples/server/public_legacy/json-schema-to-grammar.mjs"
const [, , file] = process.argv
const url = `file://${file}`
+1 -1
View File
@@ -3745,7 +3745,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) {
for (int nb : { 1, 3, 32, 35, }) {
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
}
}