mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-29 17:17:40 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 695ad752b2 | |||
| 841f27abdb | |||
| d05b3127bd | |||
| 76c6e7f105 | |||
| a71d81cf8c | |||
| eec4d71737 | |||
| 3b08828674 | |||
| a2c6fd747c |
@@ -24,6 +24,16 @@ insert_final_newline = unset
|
||||
[examples/server/public/*]
|
||||
indent_size = 2
|
||||
|
||||
[examples/server/public/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
|
||||
[examples/server/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
|
||||
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
|
||||
indent_style = tab
|
||||
|
||||
|
||||
@@ -1455,22 +1455,13 @@ llama-server: \
|
||||
examples/server/server.cpp \
|
||||
examples/server/utils.hpp \
|
||||
examples/server/httplib.h \
|
||||
examples/server/colorthemes.css.hpp \
|
||||
examples/server/style.css.hpp \
|
||||
examples/server/theme-beeninorder.css.hpp \
|
||||
examples/server/theme-ketivah.css.hpp \
|
||||
examples/server/theme-mangotango.css.hpp \
|
||||
examples/server/theme-playground.css.hpp \
|
||||
examples/server/theme-polarnight.css.hpp \
|
||||
examples/server/theme-snowstorm.css.hpp \
|
||||
examples/server/index.html.hpp \
|
||||
examples/server/index-new.html.hpp \
|
||||
examples/server/index.js.hpp \
|
||||
examples/server/completion.js.hpp \
|
||||
examples/server/system-prompts.js.hpp \
|
||||
examples/server/prompt-formats.js.hpp \
|
||||
examples/server/json-schema-to-grammar.mjs.hpp \
|
||||
examples/server/loading.html.hpp \
|
||||
examples/server/deps_daisyui.min.css.hpp \
|
||||
examples/server/deps_markdown-it.js.hpp \
|
||||
examples/server/deps_tailwindcss.js.hpp \
|
||||
examples/server/deps_vue.esm-browser.js.hpp \
|
||||
common/json.hpp \
|
||||
common/stb_image.h \
|
||||
$(OBJ_ALL)
|
||||
|
||||
+3
-1
@@ -61,13 +61,15 @@ let package = Package(
|
||||
name: "llama",
|
||||
path: ".",
|
||||
exclude: [
|
||||
"build",
|
||||
"cmake",
|
||||
"examples",
|
||||
"scripts",
|
||||
"models",
|
||||
"tests",
|
||||
"CMakeLists.txt",
|
||||
"Makefile"
|
||||
"Makefile",
|
||||
"ggml/src/ggml-metal-embed.metal"
|
||||
],
|
||||
sources: sources,
|
||||
resources: resources,
|
||||
|
||||
@@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
|
||||
if (s == "f16") {
|
||||
return GGML_TYPE_F16;
|
||||
}
|
||||
if (s == "bf16") {
|
||||
return GGML_TYPE_BF16;
|
||||
}
|
||||
if (s == "q8_0") {
|
||||
return GGML_TYPE_Q8_0;
|
||||
}
|
||||
|
||||
@@ -15,22 +15,13 @@ set(TARGET_SRCS
|
||||
httplib.h
|
||||
)
|
||||
set(PUBLIC_ASSETS
|
||||
colorthemes.css
|
||||
style.css
|
||||
theme-beeninorder.css
|
||||
theme-ketivah.css
|
||||
theme-mangotango.css
|
||||
theme-playground.css
|
||||
theme-polarnight.css
|
||||
theme-snowstorm.css
|
||||
index.html
|
||||
index-new.html
|
||||
index.js
|
||||
completion.js
|
||||
system-prompts.js
|
||||
prompt-formats.js
|
||||
json-schema-to-grammar.mjs
|
||||
loading.html
|
||||
deps_daisyui.min.css
|
||||
deps_markdown-it.js
|
||||
deps_tailwindcss.js
|
||||
deps_vue.esm-browser.js
|
||||
)
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
|
||||
@@ -928,6 +928,16 @@ Apart from error types supported by OAI, we also have custom types that are spec
|
||||
}
|
||||
```
|
||||
|
||||
### Legacy completion web UI
|
||||
|
||||
A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
|
||||
|
||||
For example:
|
||||
|
||||
```sh
|
||||
./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy
|
||||
```
|
||||
|
||||
### Extending or building alternative Web Front End
|
||||
|
||||
You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import * as readline from 'node:readline'
|
||||
import { stdin, stdout } from 'node:process'
|
||||
import { readFileSync } from 'node:fs'
|
||||
import { SchemaConverter } from './public/json-schema-to-grammar.mjs'
|
||||
import { SchemaConverter } from './public_legacy/json-schema-to-grammar.mjs'
|
||||
|
||||
const args = process.argv.slice(2);
|
||||
const grammarJsonSchemaFile = args.find(
|
||||
|
||||
+17
-2
@@ -6,5 +6,20 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
PUBLIC=$DIR/public
|
||||
|
||||
echo "download js bundle files"
|
||||
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
|
||||
echo >> $PUBLIC/index.js # add newline
|
||||
|
||||
# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
|
||||
|
||||
curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
|
||||
echo >> $PUBLIC/deps_tailwindcss.js # add newline
|
||||
|
||||
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
|
||||
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
|
||||
echo >> $PUBLIC/deps_daisyui.min.css # add newline
|
||||
|
||||
curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
|
||||
echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
|
||||
|
||||
curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
|
||||
echo >> $PUBLIC/deps_markdown-it.js # add newline
|
||||
|
||||
ls -lah $PUBLIC
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
const paramDefaults = {
|
||||
stream: true,
|
||||
n_predict: 500,
|
||||
temperature: 0.2,
|
||||
stop: ["</s>"]
|
||||
};
|
||||
|
||||
let generation_settings = null;
|
||||
|
||||
export class CompletionError extends Error {
|
||||
constructor(message, name, data) {
|
||||
super(message);
|
||||
this.name = name;
|
||||
}
|
||||
};
|
||||
|
||||
// Completes the prompt as a generator. Recommended for most use cases.
|
||||
//
|
||||
@@ -29,7 +33,7 @@ export async function* llama(prompt, params = {}, config = {}) {
|
||||
|
||||
const completionParams = { ...paramDefaults, ...params, prompt };
|
||||
|
||||
const response = await fetch(`${api_url}/completion`, {
|
||||
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify(completionParams),
|
||||
headers: {
|
||||
@@ -41,6 +45,18 @@ export async function* llama(prompt, params = {}, config = {}) {
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
const status = response.status;
|
||||
if (status !== 200) {
|
||||
try {
|
||||
const body = await response.json();
|
||||
if (body && body.error && body.error.message) {
|
||||
throw new CompletionError(body.error.message, 'ServerError');
|
||||
}
|
||||
} catch (err) {
|
||||
throw new CompletionError(err.message, 'ServerError');
|
||||
}
|
||||
}
|
||||
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
|
||||
@@ -78,7 +94,12 @@ export async function* llama(prompt, params = {}, config = {}) {
|
||||
for (const line of lines) {
|
||||
const match = regex.exec(line);
|
||||
if (match) {
|
||||
result[match[1]] = match[2]
|
||||
result[match[1]] = match[2];
|
||||
if (result.data === '[DONE]') {
|
||||
cont = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// since we know this is llama.cpp, let's just decode the json in data
|
||||
if (result.data) {
|
||||
result.data = JSON.parse(result.data);
|
||||
|
||||
+13
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
+566
-1285
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,209 @@
|
||||
const paramDefaults = {
|
||||
stream: true,
|
||||
n_predict: 500,
|
||||
temperature: 0.2,
|
||||
stop: ["</s>"]
|
||||
};
|
||||
|
||||
let generation_settings = null;
|
||||
|
||||
|
||||
// Completes the prompt as a generator. Recommended for most use cases.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// import { llama } from '/completion.js'
|
||||
//
|
||||
// const request = llama("Tell me a joke", {n_predict: 800})
|
||||
// for await (const chunk of request) {
|
||||
// document.write(chunk.data.content)
|
||||
// }
|
||||
//
|
||||
export async function* llama(prompt, params = {}, config = {}) {
|
||||
let controller = config.controller;
|
||||
const api_url = config.api_url?.replace(/\/+$/, '') || "";
|
||||
|
||||
if (!controller) {
|
||||
controller = new AbortController();
|
||||
}
|
||||
|
||||
const completionParams = { ...paramDefaults, ...params, prompt };
|
||||
|
||||
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify(completionParams),
|
||||
headers: {
|
||||
'Connection': 'keep-alive',
|
||||
'Content-Type': 'application/json',
|
||||
'Accept': 'text/event-stream',
|
||||
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
|
||||
},
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
|
||||
let content = "";
|
||||
let leftover = ""; // Buffer for partially read lines
|
||||
|
||||
try {
|
||||
let cont = true;
|
||||
|
||||
while (cont) {
|
||||
const result = await reader.read();
|
||||
if (result.done) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Add any leftover data to the current chunk of data
|
||||
const text = leftover + decoder.decode(result.value);
|
||||
|
||||
// Check if the last character is a line break
|
||||
const endsWithLineBreak = text.endsWith('\n');
|
||||
|
||||
// Split the text into lines
|
||||
let lines = text.split('\n');
|
||||
|
||||
// If the text doesn't end with a line break, then the last line is incomplete
|
||||
// Store it in leftover to be added to the next chunk of data
|
||||
if (!endsWithLineBreak) {
|
||||
leftover = lines.pop();
|
||||
} else {
|
||||
leftover = ""; // Reset leftover if we have a line break at the end
|
||||
}
|
||||
|
||||
// Parse all sse events and add them to result
|
||||
const regex = /^(\S+):\s(.*)$/gm;
|
||||
for (const line of lines) {
|
||||
const match = regex.exec(line);
|
||||
if (match) {
|
||||
result[match[1]] = match[2];
|
||||
if (result.data === '[DONE]') {
|
||||
cont = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// since we know this is llama.cpp, let's just decode the json in data
|
||||
if (result.data) {
|
||||
result.data = JSON.parse(result.data);
|
||||
content += result.data.content;
|
||||
|
||||
// yield
|
||||
yield result;
|
||||
|
||||
// if we got a stop token from server, we will break here
|
||||
if (result.data.stop) {
|
||||
if (result.data.generation_settings) {
|
||||
generation_settings = result.data.generation_settings;
|
||||
}
|
||||
cont = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (result.error) {
|
||||
try {
|
||||
result.error = JSON.parse(result.error);
|
||||
if (result.error.message.includes('slot unavailable')) {
|
||||
// Throw an error to be caught by upstream callers
|
||||
throw new Error('slot unavailable');
|
||||
} else {
|
||||
console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
|
||||
}
|
||||
} catch(e) {
|
||||
console.error(`llama.cpp error ${result.error}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
if (e.name !== 'AbortError') {
|
||||
console.error("llama error: ", e);
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
finally {
|
||||
controller.abort();
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
// Call llama, return an event target that you can subscribe to
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// import { llamaEventTarget } from '/completion.js'
|
||||
//
|
||||
// const conn = llamaEventTarget(prompt)
|
||||
// conn.addEventListener("message", (chunk) => {
|
||||
// document.write(chunk.detail.content)
|
||||
// })
|
||||
//
|
||||
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
|
||||
const eventTarget = new EventTarget();
|
||||
(async () => {
|
||||
let content = "";
|
||||
for await (const chunk of llama(prompt, params, config)) {
|
||||
if (chunk.data) {
|
||||
content += chunk.data.content;
|
||||
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
|
||||
}
|
||||
if (chunk.data.generation_settings) {
|
||||
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
|
||||
}
|
||||
if (chunk.data.timings) {
|
||||
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
|
||||
}
|
||||
}
|
||||
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
|
||||
})();
|
||||
return eventTarget;
|
||||
}
|
||||
|
||||
// Call llama, return a promise that resolves to the completed text. This does not support streaming
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// llamaPromise(prompt).then((content) => {
|
||||
// document.write(content)
|
||||
// })
|
||||
//
|
||||
// or
|
||||
//
|
||||
// const content = await llamaPromise(prompt)
|
||||
// document.write(content)
|
||||
//
|
||||
export const llamaPromise = (prompt, params = {}, config = {}) => {
|
||||
return new Promise(async (resolve, reject) => {
|
||||
let content = "";
|
||||
try {
|
||||
for await (const chunk of llama(prompt, params, config)) {
|
||||
content += chunk.data.content;
|
||||
}
|
||||
resolve(content);
|
||||
} catch (error) {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
/**
|
||||
* (deprecated)
|
||||
*/
|
||||
export const llamaComplete = async (params, controller, callback) => {
|
||||
for await (const chunk of llama(params.prompt, params, { controller })) {
|
||||
callback(chunk);
|
||||
}
|
||||
}
|
||||
|
||||
// Get the model info from the server. This is useful for getting the context window and so on.
|
||||
export const llamaModelInfo = async (config = {}) => {
|
||||
if (!generation_settings) {
|
||||
const api_url = config.api_url?.replace(/\/+$/, '') || "";
|
||||
const props = await fetch(`${api_url}/props`).then(r => r.json());
|
||||
generation_settings = props.default_generation_settings;
|
||||
}
|
||||
return generation_settings;
|
||||
}
|
||||
|
Before Width: | Height: | Size: 4.0 KiB After Width: | Height: | Size: 4.0 KiB |
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,12 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="refresh" content="5">
|
||||
</head>
|
||||
<body>
|
||||
<div id="loading">
|
||||
The model is loading. Please wait.<br/>
|
||||
The user interface will appear soon.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
+23
-48
@@ -14,22 +14,13 @@
|
||||
#define MIMETYPE_JSON "application/json; charset=utf-8"
|
||||
|
||||
// auto generated files (update with ./deps.sh)
|
||||
#include "colorthemes.css.hpp"
|
||||
#include "style.css.hpp"
|
||||
#include "theme-beeninorder.css.hpp"
|
||||
#include "theme-ketivah.css.hpp"
|
||||
#include "theme-mangotango.css.hpp"
|
||||
#include "theme-playground.css.hpp"
|
||||
#include "theme-polarnight.css.hpp"
|
||||
#include "theme-snowstorm.css.hpp"
|
||||
#include "index.html.hpp"
|
||||
#include "index-new.html.hpp"
|
||||
#include "index.js.hpp"
|
||||
#include "completion.js.hpp"
|
||||
#include "system-prompts.js.hpp"
|
||||
#include "prompt-formats.js.hpp"
|
||||
#include "json-schema-to-grammar.mjs.hpp"
|
||||
#include "loading.html.hpp"
|
||||
#include "deps_daisyui.min.css.hpp"
|
||||
#include "deps_markdown-it.js.hpp"
|
||||
#include "deps_tailwindcss.js.hpp"
|
||||
#include "deps_vue.esm-browser.js.hpp"
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
@@ -2285,16 +2276,6 @@ int main(int argc, char ** argv) {
|
||||
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
|
||||
|
||||
svr->set_default_headers({{"Server", "llama.cpp"}});
|
||||
|
||||
// CORS preflight
|
||||
svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) {
|
||||
// Access-Control-Allow-Origin is already set by middleware
|
||||
res.set_header("Access-Control-Allow-Credentials", "true");
|
||||
res.set_header("Access-Control-Allow-Methods", "POST");
|
||||
res.set_header("Access-Control-Allow-Headers", "*");
|
||||
return res.set_content("", "text/html"); // blank response, no data
|
||||
});
|
||||
|
||||
svr->set_logger(log_server_request);
|
||||
|
||||
auto res_error = [](httplib::Response & res, const json & error_data) {
|
||||
@@ -2407,6 +2388,14 @@ int main(int argc, char ** argv) {
|
||||
// register server middlewares
|
||||
svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
// If this is OPTIONS request, skip validation because browsers don't include Authorization header
|
||||
if (req.method == "OPTIONS") {
|
||||
res.set_header("Access-Control-Allow-Credentials", "true");
|
||||
res.set_header("Access-Control-Allow-Methods", "GET, POST");
|
||||
res.set_header("Access-Control-Allow-Headers", "*");
|
||||
res.set_content("", "text/html"); // blank response, no data
|
||||
return httplib::Server::HandlerResponse::Handled; // skip further processing
|
||||
}
|
||||
if (!middleware_server_state(req, res)) {
|
||||
return httplib::Server::HandlerResponse::Handled;
|
||||
}
|
||||
@@ -3116,33 +3105,19 @@ int main(int argc, char ** argv) {
|
||||
// register static assets routes
|
||||
if (!params.public_path.empty()) {
|
||||
// Set the base directory for serving static files
|
||||
svr->set_base_dir(params.public_path);
|
||||
}
|
||||
|
||||
if (!params.api_keys.empty()) {
|
||||
// for now, if API key is set, web UI is unusable
|
||||
svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
|
||||
return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
|
||||
});
|
||||
bool is_found = svr->set_mount_point("/", params.public_path);
|
||||
if (!is_found) {
|
||||
LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
// using embedded static files
|
||||
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
|
||||
|
||||
// add new-ui files
|
||||
svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_daisyui.min.css", handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/deps_markdown-it.js", handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_tailwindcss.js", handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
|
||||
}
|
||||
|
||||
// register API routes
|
||||
|
||||
@@ -64,5 +64,5 @@ Feature: Security
|
||||
| localhost | Access-Control-Allow-Origin | localhost |
|
||||
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
|
||||
| origin | Access-Control-Allow-Credentials | true |
|
||||
| web.mydomain.fr | Access-Control-Allow-Methods | POST |
|
||||
| web.mydomain.fr | Access-Control-Allow-Methods | GET, POST |
|
||||
| web.mydomain.fr | Access-Control-Allow-Headers | * |
|
||||
|
||||
@@ -1746,6 +1746,9 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec);
|
||||
|
||||
GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
|
||||
const struct ggml_tensor * a);
|
||||
|
||||
// TODO: needs to be adapted to ggml_flash_attn_ext
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
@@ -3159,6 +3159,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
#ifndef FLASH_ATTN_AVAILABLE
|
||||
return false;
|
||||
#endif
|
||||
if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -13,9 +13,9 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
if (precision != GGML_PREC_DEFAULT) {
|
||||
if (prec != GGML_PREC_DEFAULT) {
|
||||
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
||||
constexpr int cols_per_block = 16;
|
||||
switch (Q->ne[0]) {
|
||||
@@ -301,11 +301,11 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (cc >= CC_OFFSET_AMD) {
|
||||
if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
@@ -332,7 +332,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
||||
if (precision == GGML_PREC_DEFAULT) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
return;
|
||||
} else if(Q->ne[0] <= 128) {
|
||||
|
||||
+56
-18
@@ -269,6 +269,12 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,
|
||||
@@ -300,12 +306,14 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
|
||||
@@ -585,6 +593,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
|
||||
id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
|
||||
kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \
|
||||
GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
|
||||
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
|
||||
(int) kernel->pipeline.threadExecutionWidth); \
|
||||
[metal_function release]; \
|
||||
if (error) { \
|
||||
GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
@@ -777,6 +788,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64, flash_attn_ext_bf16_h64, has_simdgroup_mm && has_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80, flash_attn_ext_bf16_h80, has_simdgroup_mm && has_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96, flash_attn_ext_bf16_h96, has_simdgroup_mm && has_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112, flash_attn_ext_bf16_h112, has_simdgroup_mm && has_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128, flash_attn_ext_bf16_h128, has_simdgroup_mm && has_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256, flash_attn_ext_bf16_h256, has_simdgroup_mm && has_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, has_simdgroup_mm);
|
||||
@@ -808,12 +825,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, flash_attn_ext_q8_0_h128, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128, flash_attn_ext_vec_bf16_h128, has_simdgroup_reduction && has_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, flash_attn_ext_vec_q4_1_h128, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, flash_attn_ext_vec_q5_0_h128, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, flash_attn_ext_vec_q5_1_h128, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, flash_attn_ext_vec_q8_0_h128, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256, flash_attn_ext_vec_bf16_h256, has_simdgroup_reduction && has_bfloat);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, flash_attn_ext_vec_q4_0_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, flash_attn_ext_vec_q4_1_h256, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction);
|
||||
@@ -1111,7 +1130,7 @@ static void ggml_metal_encode_node(
|
||||
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
||||
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
||||
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
||||
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
||||
const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
|
||||
|
||||
const int64_t ne0 = dst ? dst->ne[0] : 0;
|
||||
const int64_t ne1 = dst ? dst->ne[1] : 0;
|
||||
@@ -3033,6 +3052,23 @@ static void ggml_metal_encode_node(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_BF16:
|
||||
{
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64 ].pipeline; break;
|
||||
case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80 ].pipeline; break;
|
||||
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96 ].pipeline; break;
|
||||
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112].pipeline; break;
|
||||
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128].pipeline; break;
|
||||
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
GGML_LOG_ERROR("add template specialization for this size\n");
|
||||
GGML_ABORT("add template specialization for this size");
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
switch (ne00) {
|
||||
@@ -3133,6 +3169,7 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128].pipeline; break;
|
||||
@@ -3150,6 +3187,7 @@ static void ggml_metal_encode_node(
|
||||
{
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256].pipeline; break;
|
||||
@@ -3194,18 +3232,15 @@ static void ggml_metal_encode_node(
|
||||
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:23];
|
||||
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
|
||||
[encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:18];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:19];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:20];
|
||||
[encoder setBytes:&max_bias length:sizeof( float) atIndex:21];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:22];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:23];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:24];
|
||||
[encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:25];
|
||||
|
||||
if (!use_vec_kernel) {
|
||||
// half8x8 kernel
|
||||
@@ -3216,11 +3251,14 @@ static void ggml_metal_encode_node(
|
||||
GGML_ASSERT(nqptg % 8 == 0);
|
||||
GGML_ASSERT(ncpsg % 32 == 0);
|
||||
|
||||
// 2*(2*ncpsg + nqptg)*(nsg)
|
||||
// ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
|
||||
//
|
||||
// 16*32*(nsg)
|
||||
// the shared memory needed for the simdgroups to load the KV cache
|
||||
// each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG
|
||||
//
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
|
||||
|
||||
int64_t nsgmax = 2;
|
||||
|
||||
@@ -3254,12 +3292,12 @@ static void ggml_metal_encode_node(
|
||||
|
||||
// ne00 + 2*ncpsg*(nsg)
|
||||
// for each query, we load it as f16 in shared memory (ne00)
|
||||
// and store the attention scores (nqptg x ncpsg) as f32
|
||||
// and store the soft_max values and the mask
|
||||
//
|
||||
// 2*ne00*(nsg)
|
||||
// each simdgroup has a full f32 head vector in shared mem to accumulate results
|
||||
// ne00*(nsg)
|
||||
// each simdgroup has a full f16 head vector in shared mem to accumulate results
|
||||
//
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + 2*ne00*(nsg))*(sizeof(float)/2), 16))
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + ne00*(nsg))*(sizeof(float)/2), 16))
|
||||
|
||||
int64_t nsgmax = 2;
|
||||
|
||||
|
||||
+455
-338
File diff suppressed because it is too large
Load Diff
@@ -4228,6 +4228,15 @@ void ggml_flash_attn_ext_set_prec(
|
||||
ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second
|
||||
}
|
||||
|
||||
enum ggml_prec ggml_flash_attn_ext_get_prec(
|
||||
const struct ggml_tensor * a) {
|
||||
GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
|
||||
|
||||
const int32_t prec_i32 = ggml_get_op_params_i32(a, 3);
|
||||
|
||||
return (enum ggml_prec) prec_i32;
|
||||
}
|
||||
|
||||
// ggml_flash_attn_back
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn_back(
|
||||
|
||||
+1
-1
@@ -124,7 +124,7 @@ You can use GBNF grammars:
|
||||
- In [llama-cli](../examples/main), passed as the `--json` / `-j` flag
|
||||
- To convert to a grammar ahead of time:
|
||||
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
|
||||
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
|
||||
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
|
||||
|
||||
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
|
||||
|
||||
|
||||
+20
-68
@@ -114,46 +114,22 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
|
||||
# replace filenames:
|
||||
#
|
||||
# CMakelists.txt -> ggml/CMakeLists.txt
|
||||
# src/CMakeLists.txt -> ggml/src/CMakeLists.txt
|
||||
# cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
|
||||
# CMakelists.txt -> ggml/CMakeLists.txt
|
||||
# src/CMakeLists.txt -> ggml/src/CMakeLists.txt
|
||||
# cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
|
||||
#
|
||||
# src/ggml.c -> ggml/src/ggml.c
|
||||
# src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c
|
||||
# src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h
|
||||
# src/ggml-alloc.c -> ggml/src/ggml-alloc.c
|
||||
# src/ggml-amx/* -> ggml/src/ggml-amx/
|
||||
# src/ggml-amx.cpp -> ggml/src/ggml-amx.cpp
|
||||
# src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
|
||||
# src/ggml-backend.cpp -> ggml/src/ggml-backend.cpp
|
||||
# src/ggml-cann/* -> ggml/src/ggml-cann/
|
||||
# src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp
|
||||
# src/ggml-common.h -> ggml/src/ggml-common.h
|
||||
# src/ggml-cuda/* -> ggml/src/ggml-cuda/
|
||||
# src/ggml-cuda.cu -> ggml/src/ggml-cuda.cu
|
||||
# src/ggml-impl.h -> ggml/src/ggml-impl.h
|
||||
# src/ggml-kompute.cpp -> ggml/src/ggml-kompute.cpp
|
||||
# src/ggml-metal.m -> ggml/src/ggml-metal.m
|
||||
# src/ggml-quants.c -> ggml/src/ggml-quants.c
|
||||
# src/ggml-quants.h -> ggml/src/ggml-quants.h
|
||||
# src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp
|
||||
# src/ggml-sycl/* -> ggml/src/ggml-sycl/
|
||||
# src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp
|
||||
# src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp
|
||||
# src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
|
||||
# src/ggml*.c -> ggml/src/ggml*.c
|
||||
# src/ggml*.cpp -> ggml/src/ggml*.cpp
|
||||
# src/ggml*.h -> ggml/src/ggml*.h
|
||||
# src/ggml*.cu -> ggml/src/ggml*.cu
|
||||
# src/ggml*.m -> ggml/src/ggml*.m
|
||||
# src/ggml-amx/* -> ggml/src/ggml-amx/
|
||||
# src/ggml-cann/* -> ggml/src/ggml-cann/
|
||||
# src/ggml-cuda/* -> ggml/src/ggml-cuda/
|
||||
# src/ggml-sycl/* -> ggml/src/ggml-sycl/
|
||||
# src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
|
||||
#
|
||||
# include/ggml.h -> ggml/include/ggml.h
|
||||
# include/ggml-alloc.h -> ggml/include/ggml-alloc.h
|
||||
# include/ggml-amx.h -> ggml/include/ggml-amx.h
|
||||
# include/ggml-backend.h -> ggml/include/ggml-backend.h
|
||||
# include/ggml-blas.h -> ggml/include/ggml-blas.h
|
||||
# include/ggml-cann.h -> ggml/include/ggml-cann.h
|
||||
# include/ggml-cuda.h -> ggml/include/ggml-cuda.h
|
||||
# include/ggml-kompute.h -> ggml/include/ggml-kompute.h
|
||||
# include/ggml-metal.h -> ggml/include/ggml-metal.h
|
||||
# include/ggml-rpc.h -> ggml/include/ggml-rpc.h
|
||||
# include/ggml-sycl.h -> ggml/include/ggml-sycl.h
|
||||
# include/ggml-vulkan.h -> ggml/include/ggml-vulkan.h
|
||||
# include/ggml*.h -> ggml/include/ggml*.h
|
||||
#
|
||||
# tests/test-opt.cpp -> tests/test-opt.cpp
|
||||
# tests/test-grad0.cpp -> tests/test-grad0.cpp
|
||||
@@ -168,41 +144,17 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
-e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\1.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\1.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\1.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\1.cu/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\1.m/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-amx\.cpp/\1ggml\/src\/ggml-amx.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-amx\.h/\1ggml\/include\/ggml-amx.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\1.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \
|
||||
|
||||
@@ -1 +1 @@
|
||||
a099cb514d6687e436a5a423d1fb0448be0feb20
|
||||
89952d649e0c5cabbb9ff8c4906f5a843a789fb2
|
||||
|
||||
+11
-36
@@ -4,43 +4,18 @@ cp -rpv ../ggml/CMakeLists.txt ./ggml/CMakeLists.txt
|
||||
cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt
|
||||
cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
|
||||
|
||||
cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c
|
||||
cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c
|
||||
cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h
|
||||
cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c
|
||||
cp -rpv ../ggml/src/ggml-amx/* ./ggml/src/ggml-amx/
|
||||
cp -rpv ../ggml/src/ggml-amx.cpp ./ggml/src/ggml-amx.cpp
|
||||
cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
|
||||
cp -rpv ../ggml/src/ggml-backend.cpp ./ggml/src/ggml-backend.cpp
|
||||
cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/
|
||||
cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp
|
||||
cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h
|
||||
cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/
|
||||
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml/src/ggml-cuda.cu
|
||||
cp -rpv ../ggml/src/ggml-impl.h ./ggml/src/ggml-impl.h
|
||||
cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml/src/ggml-kompute.cpp
|
||||
cp -rpv ../ggml/src/ggml-metal.m ./ggml/src/ggml-metal.m
|
||||
cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal
|
||||
cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c
|
||||
cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h
|
||||
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp
|
||||
cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/
|
||||
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp
|
||||
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp
|
||||
cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
|
||||
cp -rpv ../ggml/src/ggml*.c ./ggml/src/
|
||||
cp -rpv ../ggml/src/ggml*.cpp ./ggml/src/
|
||||
cp -rpv ../ggml/src/ggml*.h ./ggml/src/
|
||||
cp -rpv ../ggml/src/ggml*.cu ./ggml/src/
|
||||
cp -rpv ../ggml/src/ggml*.m ./ggml/src/
|
||||
cp -rpv ../ggml/src/ggml-amx/* ./ggml/src/ggml-amx/
|
||||
cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/
|
||||
cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/
|
||||
cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/
|
||||
cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
|
||||
|
||||
cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h
|
||||
cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h
|
||||
cp -rpv ../ggml/include/ggml-amx.h ./ggml/include/ggml-amx.h
|
||||
cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h
|
||||
cp -rpv ../ggml/include/ggml-blas.h ./ggml/include/ggml-blas.h
|
||||
cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h
|
||||
cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h
|
||||
cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
|
||||
cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h
|
||||
cp -rpv ../ggml/include/ggml-rpc.h ./ggml/include/ggml-rpc.h
|
||||
cp -rpv ../ggml/include/ggml-sycl.h ./ggml/include/ggml-sycl.h
|
||||
cp -rpv ../ggml/include/ggml-vulkan.h ./ggml/include/ggml-vulkan.h
|
||||
cp -rpv ../ggml/include/ggml*.h ./ggml/include/
|
||||
|
||||
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
|
||||
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { readFileSync } from "fs"
|
||||
import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
|
||||
import { SchemaConverter } from "../examples/server/public_legacy/json-schema-to-grammar.mjs"
|
||||
|
||||
const [, , file] = process.argv
|
||||
const url = `file://${file}`
|
||||
|
||||
@@ -3745,7 +3745,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 3, 32, 35, }) {
|
||||
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
||||
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user